def generateUserProfiles(R, d, seed, sparkContext, N):
    """Generate the user profiles from RDD R and store them in an RDD containing
    tuples of the form (i, u_i), where u_i is a random np.array of dimension d.
    The random u_i vectors are generated using normalVectorRDD(), a function in RandomRDDs.

    Inputs are:
        - R: an RDD that contains the ratings in (user, item, rating) form
        - d: the dimension of the user profiles
        - seed: a seed to be used in generating the random vectors
        - sparkContext: a spark context
        - N: the number of partitions to be used during joins, etc.

    The return value is an RDD containing the user profiles.
    """
    # extract the distinct user ids
    U = R.map(lambda inp: inp[0]).distinct(numPartitions=N)
    numUsers = U.count()
    # one random Gaussian vector of dimension d per user
    randRDD = RandomRDDs.normalVectorRDD(sparkContext, numUsers, d,
                                         numPartitions=N, seed=seed)
    # pair user ids and random vectors by index, then join on that index
    # (swap turns a (value, index) pair into (index, value))
    U = U.zipWithIndex().map(swap)
    randRDD = randRDD.zipWithIndex().map(swap)
    return U.join(randRDD, numPartitions=N).values()
def generateItemProfiles(R, d, seed, sparkContext, N):
    """Generate the item profiles from RDD R and store them in an RDD containing
    tuples of the form (j, v_j), where v_j is a random np.array of dimension d.
    The random v_j vectors are generated using normalVectorRDD(), a function in RandomRDDs.

    Inputs are:
        - R: an RDD that contains the ratings in (user, item, rating) form
        - d: the dimension of the item profiles
        - seed: a seed to be used in generating the random vectors
        - sparkContext: a spark context
        - N: the number of partitions to be used during joins, etc.

    The return value is an RDD containing the item profiles.
    """
    # extract the distinct item ids (Python 3 lambdas cannot unpack tuples, so index instead)
    V = R.map(lambda inp: inp[1]).distinct(numPartitions=N)
    numItems = V.count()
    # one random Gaussian vector of dimension d per item
    randRDD = RandomRDDs.normalVectorRDD(sparkContext, numItems, d,
                                         numPartitions=N, seed=seed)
    V = V.zipWithIndex().map(swap)
    randRDD = randRDD.zipWithIndex().map(swap)
    return V.join(randRDD, numPartitions=N).values()
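# A minimal usage sketch for the two profile generators above. The `swap`
# helper is not shown in the original snippets; it is assumed here to flip a
# (value, index) pair into (index, value) so the index-based join works.
# The ratings RDD and parameter values below are made-up illustrations.
def swap(pair):
    # assumed helper: (value, index) -> (index, value)
    return (pair[1], pair[0])

# ratings = sc.parallelize([(0, 10, 4.0), (0, 11, 2.0), (1, 10, 5.0)])
# userProfiles = generateUserProfiles(ratings, d=5, seed=42, sparkContext=sc, N=4)
# itemProfiles = generateItemProfiles(ratings, d=5, seed=43, sparkContext=sc, N=4)
# userProfiles.take(2)   # e.g. [(0, array([...])), (1, array([...]))]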
def test_col_norms(self):
    data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
    summary = Statistics.colStats(data)
    self.assertEqual(10, len(summary.normL1()))
    self.assertEqual(10, len(summary.normL2()))

    data2 = self.sc.parallelize(range(10)).map(lambda x: Vectors.dense(x))
    summary2 = Statistics.colStats(data2)
    self.assertEqual(array([45.0]), summary2.normL1())

    import math
    expectedNormL2 = math.sqrt(sum(map(lambda x: x * x, range(10))))
    self.assertTrue(math.fabs(summary2.normL2()[0] - expectedNormL2) < 1e-14)
def test_col_with_different_rdds(self):
    # rows as numpy vectors
    data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
    summary = Statistics.colStats(data)
    self.assertEqual(1000, summary.count())

    # rows as plain Python sequences
    data = self.sc.parallelize([range(10)] * 10)
    summary = Statistics.colStats(data)
    self.assertEqual(10, summary.count())

    # rows as array.array values
    data = self.sc.parallelize([pyarray.array("d", range(10))] * 10)
    summary = Statistics.colStats(data)
    self.assertEqual(10, summary.count())
from pyspark.sql import SparkSession
from pyspark.mllib.random import RandomRDDs
from pyspark.sql.types import *
#from pyspark.sql.functions import *
from pyspark.sql.types import Row

spark = SparkSession.builder.config("spark.sql.crossJoin.enabled", "true").getOrCreate()
sc = spark.sparkContext

n = 500

# create RDDs of random floats
# (normalVectorRDD takes a SparkContext, not a SparkSession)
nRow = n
nCol = 4
seed = 5
numPartitions = 32
rdd1 = RandomRDDs.normalVectorRDD(sc, nRow, nCol, numPartitions, seed)
seed = 3
rdd2 = RandomRDDs.normalVectorRDD(sc, nRow, nCol, numPartitions, seed)

# convert each vector in the RDDs to a Row
randomNumberRdd1 = rdd1.map(
    lambda x: Row(A=float(x[0]), B=float(x[1]), C=float(x[2]), D=float(x[3])))
randomNumberRdd2 = rdd2.map(
    lambda x: Row(E=float(x[0]), F=float(x[1]), G=float(x[2]), H=float(x[3])))

# create dataframes from the RDDs
schemaRandomNumberDF1 = spark.createDataFrame(randomNumberRdd1)
schemaRandomNumberDF2 = spark.createDataFrame(randomNumberRdd2)

# cache the dataframes
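# A possible continuation of the snippet above (an assumption, not part of the
# original): cache the two DataFrames and combine them with the cross join that
# the "spark.sql.crossJoin.enabled" setting allows.
schemaRandomNumberDF1.cache()
schemaRandomNumberDF2.cache()

crossedDF = schemaRandomNumberDF1.crossJoin(schemaRandomNumberDF2)
crossedDF.show(5)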
import random
from pyspark.mllib.random import RandomRDDs


def generate_random_vector(sc, path, numRows, numCols, partitions, standard_deviation):
    # shift the standard-normal samples to mean 10, scale them by the given
    # standard deviation, then append a random partition label as a fifth column
    # (assumes numCols >= 4)
    normalRDD = RandomRDDs.normalVectorRDD(sc, numRows, numCols, partitions, seed=1)\
        .map(lambda l: 10 + standard_deviation * l)\
        .map(lambda l: [l[0], l[1], l[2], l[3], random.randint(0, partitions - 1)])
    return normalRDD
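# A minimal usage sketch for generate_random_vector; the argument values are
# assumptions for illustration only, and `path` is passed as None because the
# snippet above does not use it.
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
rows = generate_random_vector(sc, path=None, numRows=1000, numCols=4,
                              partitions=8, standard_deviation=2.0)
print(rows.take(3))  # each element: [x1, x2, x3, x4, partition_label]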
""" Simple distributed implementation of the K-Means algorithm using Tensorflow. """ import tensorflow as tf import tensorframes as tfs from pyspark.mllib.random import RandomRDDs import numpy as np num_features = 4 k = 2 # TODO: does not work with 1 data = RandomRDDs.normalVectorRDD( sc, numCols=num_features, numRows=100, seed=1).map(lambda v: [v.tolist()]) df = sqlContext.createDataFrame(data).toDF("features") # For now, analysis is still required. df0 = tfs.analyze(df) init_centers = np.random.randn(k, num_features) # For debugging block = np.array(data.take(10))[::,0,::] # Find the distances first with tf.Graph().as_default() as g: points = tf.placeholder(tf.double, shape=[None, num_features], name='points') num_points = tf.shape(points)[0] #centers = tf.placeholder(tf.double, shape=[k, num_features], name='centers')
import numpy as np
from pyspark.mllib.random import RandomRDDs


def construct_hyperplanes(num_hp_arrangements, num_hp_per_arrangement, ambient_dimension):
    # draw all hyperplane normal vectors at once from a standard Gaussian
    # (sc is taken from the enclosing scope)
    num_hps = num_hp_arrangements * num_hp_per_arrangement
    all_hp_rdd = RandomRDDs.normalVectorRDD(sc, num_hps, ambient_dimension)
    return np.matrix(all_hp_rdd.collect())
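# A hedged usage sketch (assumed, not from the original source): hyperplane
# normals like these are typically used to hash points by the sign of their
# dot product with each normal. The stand-in matrix below avoids needing a
# SparkContext; with one available, construct_hyperplanes(2, 3, 3) would
# produce a matrix of the same shape.
import numpy as np

hyperplanes = np.random.randn(6, 3)           # stand-in for construct_hyperplanes(2, 3, 3)
point = np.random.randn(3)
hash_bits = (np.asarray(hyperplanes) @ point > 0).astype(int)
print(hash_bits)                              # e.g. [1 0 1 1 0 0]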
""" Simple distributed implementation of the K-Means algorithm using Tensorflow. """ import tensorflow as tf import tensorframes as tfs from pyspark.mllib.random import RandomRDDs import numpy as np num_features = 4 k = 2 # TODO: does not work with 1 data = RandomRDDs.normalVectorRDD(sc, numCols=num_features, numRows=100, seed=1).map(lambda v: [v.tolist()]) df = sqlContext.createDataFrame(data).toDF("features") # For now, analysis is still required. df0 = tfs.analyze(df) init_centers = np.random.randn(k, num_features) # For debugging block = np.array(data.take(10))[::, 0, ::] # Find the distances first with tf.Graph().as_default() as g: points = tf.placeholder(tf.double, shape=[None, num_features], name='points') num_points = tf.shape(points)[0]
def build_scenarios(self):
    # one standard-normal vector per scenario, with one column per timestep
    nb_timesteps = self.timesteps.size
    return RandomRDDs.normalVectorRDD(sc, self.nb_scenarios, nb_timesteps, seed=1)
def test_col_norms(self):
    data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
    summary = Statistics.colStats(data)
    self.assertEqual(10, len(summary.normL1()))
    self.assertEqual(10, len(summary.normL2()))
if __name__ == "__main__": if len(sys.argv) not in [1, 2]: print("Usage: random_rdd_generation", file=sys.stderr) sys.exit(-1) sc = SparkContext(appName="PythonRandomRDDGeneration") numExamples = 10000 # number of examples to generate fraction = 0.1 # fraction of data to sample # Example: RandomRDDs.normalRDD normalRDD = RandomRDDs.normalRDD(sc, numExamples) print('Generated RDD of %d examples sampled from the standard normal distribution' % normalRDD.count()) print(' First 5 samples:') for sample in normalRDD.take(5): print(' ' + str(sample)) print() # Example: RandomRDDs.normalVectorRDD normalVectorRDD = RandomRDDs.normalVectorRDD(sc, numRows=numExamples, numCols=2) print('Generated RDD of %d examples of length-2 vectors.' % normalVectorRDD.count()) print(' First 5 samples:') for sample in normalVectorRDD.take(5): print(' ' + str(sample)) print() sc.stop()
#!/usr/bin/env python
# coding: utf-8

# In[65]:

import numpy as np
from pyspark.mllib.random import RandomRDDs
from pyspark.mllib.clustering import KMeans, KMeansModel

# In[66]:

c1_v = RandomRDDs.normalVectorRDD(sc, 20, 2, numPartitions=2, seed=1).map(lambda v: np.add([1, 5], v))

# In[67]:

c1_v.stats()

# In[68]:

c2_v = RandomRDDs.normalVectorRDD(sc, 20, 2, numPartitions=2, seed=1).map(lambda v: np.add([5, 1], v))

# In[69]:

c2_v.stats()

# In[70]:

c3_v = RandomRDDs.normalVectorRDD(sc, 20, 2, numPartitions=2, seed=1).map(lambda v: np.add([4, 6], v))
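# A possible continuation of the notebook above (an assumption, not in the
# original cells): union the three clusters and fit k-means, mirroring the
# pattern of the script further below.
my_data = c1_v.union(c2_v).union(c3_v)
my_kmmodel = KMeans.train(my_data, k=3, maxIterations=20,
                          initializationMode='k-means||', seed=10)
my_kmmodel.clusterCenters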
# A script to execute k-means clustering in Spark
# to run, enter: >>> exec(open("./dokmeans.py").read())

import numpy as np
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import KMeans

# to generate random data RDDs we need this package
from pyspark.mllib.random import RandomRDDs

# let's generate random class data, adding a cluster center to random 2D points;
# use the default number of partitions, or a definite number, so that the union
# will have samples across clusters
c1_v = RandomRDDs.normalVectorRDD(sc, 20, 2, numPartitions=2, seed=1).map(lambda v: np.add([1, 5], v))
c2_v = RandomRDDs.normalVectorRDD(sc, 16, 2, numPartitions=2, seed=2).map(lambda v: np.add([5, 1], v))
c3_v = RandomRDDs.normalVectorRDD(sc, 12, 2, numPartitions=2, seed=3).map(lambda v: np.add([4, 6], v))

# concatenate two RDDs with the .union(other) function
c12 = c1_v.union(c2_v)
my_data = c12.union(c3_v)  # this now has all points, as an RDD

# note: the `runs` argument was removed in newer Spark releases
my_kmmodel = KMeans.train(my_data, k=1, maxIterations=20, runs=1,
                          initializationMode='k-means||', seed=10)

# try: help(KMeans.train) to see parameter options
# k is the number of desired clusters.
# maxIterations is the maximum number of iterations to run.
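# A small follow-up sketch (assumed, not from the original script): inspect the
# fitted model and assign each point to its nearest cluster center.
print(my_kmmodel.clusterCenters)
assignments = my_kmmodel.predict(my_data)
print(assignments.take(10))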
pts = int(30)  # number of points to be generated
k = int(3)     # number of clusters (also used as the number of partitions)
dim = int(4)   # dimension of the data

for i in range(1, 100):
    dev = int(i)
    file_name = "out_dev_" + str(i) + '.csv'
    rdd = sc.parallelize(range(0, k))
    clust_mean = rdd.map(lambda cluster: (
        cluster, random.sample(list(numpy.arange(val_min, val_max, ETAPES)), dim)))
    valeurs_vector_alea = RandomRDDs.normalVectorRDD(sc, numRows=pts, numCols=dim,
                                                     numPartitions=k, seed=1)
    # assigning a random cluster to each point
    cluster_valeur_normales_vector = valeurs_vector_alea.map(
        lambda point: (random.randint(0, k - 1), point.tolist()))
    # generate a value depending on the mean of the cluster, the standard
    # deviation and the normal value
    pts_valeur_vector = cluster_valeur_normales_vector.join(clust_mean).map(
        lambda x: (point_valeurs(x[1][1], x[1][0], dev, x[0], dim)))
    # view the result
    print(pts_valeur_vector.collect())
    # writing the point values into one csv file
    # write_into_csv(file_name, pts_valeur_vector);
    # saving the rdd using saveAsTextFile
    pts_valeur_vector.saveAsTextFile(file_name)
count_cluster = int(sys.argv[3])  # number of clusters
dimension = int(sys.argv[4])      # dimension of the data
std = int(sys.argv[5])            # standard deviation
noise_points = points * 2         # number of noise points to be generated / double the number of points
file_name_noise = sys.argv[1] + '-noise.csv'  # file name for the noise points to be generated

sc = SparkContext("local", "generator")  # spark context

# array of the clusters : clusters = [0, 1, 2]
clusters = sc.parallelize(range(0, count_cluster))

# random means of each cluster : means_cluster = [(0, [0.6, 80.9]), (1, [57.8, 20.2]), (2, [15.6, 49.9])]
means_cluster = clusters.map(lambda cluster: (
    cluster, random.sample(list(numpy.arange(MIN_MEAN_VALUE, MAX_MEAN_VALUE, STEPS)), dimension)))

# creating a random vector using normalVectorRDD
random_values_vector = RandomRDDs.normalVectorRDD(sc, numRows=points, numCols=dimension,
                                                  numPartitions=count_cluster, seed=1)

# assigning a random cluster to each point
cluster_normal_values_vector = random_values_vector.map(
    lambda point: (random.randint(0, count_cluster - 1), point.tolist()))

# generate a value depending on the mean of the cluster, the standard deviation and the normal value
# (Python 3: lambdas cannot unpack tuples, so index into the joined (cluster, (normal, mean)) pair)
points_value_vector = cluster_normal_values_vector.join(means_cluster).map(
    lambda x: (point_values(x[1][1], x[1][0], std, x[0], dimension)))
print(points_value_vector.collect())

# generate random points that represent noise points
noise_points_vector = sc.parallelize(range(0, noise_points)).map(
    lambda x: random.sample(list(numpy.arange(MIN_MEAN_VALUE, MAX_MEAN_VALUE, STEPS)), dimension)).map(
    lambda v: noise_values(v))
# noise_points_vector = noise_points_vector.map(lambda row: str(row).replace("[", "").replace("]", ""))
print(noise_points_vector.collect())