def housing_data_generator(start=0, n_points=1000000, n_jobs=4):
    # step = int(math.ceil(n_points/n_jobs))
    # Parallel(n_jobs=n_jobs)(delayed(house_data)(i, i+step) for i in range(start, n_points, step))
    from pyspark.mllib.random import RandomRDDs
    from pyspark.sql import SparkSession
    from pyspark.sql import DataFrame
    from pyspark.sql import SQLContext, Row
    import math

    scSpark = SparkSession \
        .builder \
        .appName("reading csv") \
        .getOrCreate()
    sc = scSpark.sparkContext  # RandomRDDs expects the SparkContext, not the SparkSession

    columns = ["id", "size", "loc", "rooms", "bathrooms", "year", "price"]

    # `house_size` is assumed to be defined elsewhere as a [min, max] range;
    # every generated column below is drawn uniformly from that same range.
    u = RandomRDDs.uniformRDD(sc, n_points, 2).map(lambda x: math.ceil(
        house_size[0] + (house_size[-1] - house_size[0]) * x)).zipWithIndex()
    u = scSpark.createDataFrame(u, ["size", "id"])
    for col in columns[2:]:
        v = RandomRDDs.uniformRDD(sc, n_points, 2).map(lambda x: math.ceil(
            house_size[0] + (house_size[-1] - house_size[0]) * x)).zipWithIndex()
        v = scSpark.createDataFrame(v, [col, "id"])
        u = u.join(v, "id").select("*")
    u.show()
def generateItemProfiles(R, d, seed, sparkContext, N):
    """Generate the item profiles from RDD R and store them in an RDD containing
    tuples of the form (j, vj), where vj is a random np.array of dimension d.
    The random vjs are generated using normalVectorRDD(), a function in RandomRDDs.

    Inputs are:
         - R: an RDD that contains the ratings in (user, item, rating) form
         - d: the dimension of the item profiles
         - seed: a seed to be used in generating the random vectors
         - sparkContext: a spark context
         - N: the number of partitions to be used during joins, etc.

    The return value is an RDD containing the item profiles.
    """
    # extract item ids (plain indexing: tuple unpacking in lambdas is Python 2 only)
    V = R.map(lambda inp: inp[1]).distinct(numPartitions=N)
    numItems = V.count()
    randRDD = RandomRDDs.normalVectorRDD(sparkContext, numItems, d, numPartitions=N, seed=seed)
    V = V.zipWithIndex().map(swap)
    randRDD = randRDD.zipWithIndex().map(swap)
    return V.join(randRDD, numPartitions=N).values()
def generateUserProfiles(R, d, seed, sparkContext, N):
    """Generate the user profiles from RDD R and store them in an RDD containing
    tuples of the form (i, ui), where ui is a random np.array of dimension d.
    The random uis are generated using normalVectorRDD(), a function in RandomRDDs.

    Inputs are:
         - R: an RDD that contains the ratings in (user, item, rating) form
         - d: the dimension of the user profiles
         - seed: a seed to be used in generating the random vectors
         - sparkContext: a spark context
         - N: the number of partitions to be used during joins, etc.

    The return value is an RDD containing the user profiles.
    """
    # extract user ids
    U = R.map(lambda inp: inp[0]).distinct(numPartitions=N)
    numUsers = U.count()
    randRDD = RandomRDDs.normalVectorRDD(sparkContext, numUsers, d, numPartitions=N, seed=seed)
    U = U.zipWithIndex().map(swap)
    randRDD = randRDD.zipWithIndex().map(swap)
    return U.join(randRDD, numPartitions=N).values()
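# The two profile generators above rely on a `swap` helper that is not shown in this
# excerpt. A minimal sketch of what such a helper could look like (an assumption, not
# the original definition): it turns the (value, index) pairs produced by
# zipWithIndex() into (index, value) pairs so that the join key comes first.
def swap(pair):
    value, index = pair
    return (index, value)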
def gen_hash_coeffs_1(num_buckets, num_bigrams, seed):
    coeff = [0 for j in range(num_buckets)]
    for i in range(num_buckets):
        # nRDD = RandomRDDs.normalRDD(sc, num_bigrams, seed=seed + i).map(lambda val: str(-1) if val < 0 else str(1)).collect()
        # nRDD = RandomRDDs.uniformRDD(sc, num_bigrams, 0, seed + i).map(lambda val: str(-1) if val <= 0.5 else str(1)).collect()
        # str replaced with float
        nRDD = RandomRDDs.uniformRDD(sc, num_bigrams, 0, seed + i).map(
            lambda val: float(-1) if val <= 0.5 else float(1)).collect()
        coeff[i] = nRDD
    return coeff
def test_col_norms(self):
    data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
    summary = Statistics.colStats(data)
    self.assertEqual(10, len(summary.normL1()))
    self.assertEqual(10, len(summary.normL2()))

    data2 = self.sc.parallelize(range(10)).map(lambda x: Vectors.dense(x))
    summary2 = Statistics.colStats(data2)
    self.assertEqual(array([45.0]), summary2.normL1())

    import math
    expectedNormL2 = math.sqrt(sum(map(lambda x: x * x, range(10))))
    self.assertTrue(math.fabs(summary2.normL2()[0] - expectedNormL2) < 1e-14)
def find_best_params(X, y):
    spark, sc = __start_session()

    param_size = 100
    sample_size = 500
    parallelism = sc.defaultParallelism

    train_numpy = np.concatenate((X, y[:, np.newaxis]), axis=1)
    train = sc.parallelize(train_numpy) \
        .map(lambda r: [Vectors.dense(r[0]), float(r[1])]) \
        .toDF(['features', 'label']) \
        .repartition(parallelism) \
        .cache()

    # sample hyperparameter values uniformly at random
    reg_param = RandomRDDs.uniformRDD(sc, size=param_size, seed=settings.seed) \
        .map(lambda x: 0.001 + (0.1 - 0.001) * x) \
        .collect()
    max_iter = RandomRDDs.uniformRDD(sc, size=param_size, seed=settings.seed) \
        .map(lambda x: int(5 + (20 - 5) * x)) \
        .collect()

    # create random grid
    estimator = LinearRegression(solver='normal')
    param_grid = ParamGridBuilder().addGrid(estimator.regParam, reg_param) \
        .addGrid(estimator.maxIter, max_iter) \
        .build()
    param_grid = sc.parallelize(param_grid) \
        .takeSample(withReplacement=False, num=sample_size, seed=settings.seed)

    best_params = __run_search(estimator, param_grid, train, parallelism)

    train.unpersist()
    spark.stop()

    # print results
    print('Best Params:', best_params)

    return best_params
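# `__start_session` and `__run_search` are private helpers that are not part of this
# excerpt. A minimal sketch of what they might look like (assumptions, not the original
# implementations): the session helper returns a (SparkSession, SparkContext) pair, and
# the search helper evaluates each sampled parameter map on a hold-out split and keeps
# the one with the lowest RMSE.
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator


def __start_session():
    # Hypothetical: build or reuse a session and hand back its context as well.
    spark = SparkSession.builder.appName("random-param-search").getOrCreate()
    return spark, spark.sparkContext


def __run_search(estimator, param_grid, train, parallelism):
    # Hypothetical: plain hold-out evaluation of each sampled param map.
    train_df, valid_df = train.randomSplit([0.8, 0.2], seed=42)
    evaluator = RegressionEvaluator(metricName='rmse')
    best_params, best_rmse = None, float('inf')
    for params in param_grid:
        model = estimator.fit(train_df, params)
        rmse = evaluator.evaluate(model.transform(valid_df))
        if rmse < best_rmse:
            best_params, best_rmse = params, rmse
    return best_params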
def test_col_with_different_rdds(self):
    # numpy
    data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
    summary = Statistics.colStats(data)
    self.assertEqual(1000, summary.count())

    # array
    data = self.sc.parallelize([range(10)] * 10)
    summary = Statistics.colStats(data)
    self.assertEqual(10, summary.count())

    # array
    data = self.sc.parallelize([pyarray.array("d", range(10))] * 10)
    summary = Statistics.colStats(data)
    self.assertEqual(10, summary.count())
def generator_normal_rdd(sc, n, d, s, k):
    """
    :param n: number of rows (data points)
    :param d: dimension of each data point
    :param s: standard deviation
    :param k: number of clusters
    :return: list containing one RDD of points per cluster
    """
    normal_rdds = []
    print("DATA LIST:\n")
    for cluster, mean in mean_cluster(k).items():
        normal_rdd = RandomRDDs.logNormalVectorRDD(
            sc=sc, mean=mean, std=s, numRows=int(n / k), numCols=d,
            seed=1).map(lambda x, c=cluster: (list(x), c))  # bind the cluster id at definition time
        print(normal_rdd.collect())
        normal_rdds.append(normal_rdd)
        print()
    return normal_rdds
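# `mean_cluster` is assumed to be defined elsewhere; a plausible sketch (not the
# original helper): it returns a dict mapping each cluster id to a randomly chosen mean.
import random


def mean_cluster(k, low=0.0, high=10.0):
    # Hypothetical helper: one random mean per cluster id 0..k-1.
    return {cluster: random.uniform(low, high) for cluster in range(k)}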
def main():
    if len(sys.argv) == 2:
        length = int(sys.argv[1])
    else:
        length = 10**3

    data0_1 = RandomRDDs.uniformVectorRDD(sc, length, 3) \
        .map(lambda a: a.round(3).tolist()) \
        .toDF()

    name = "random_data{}.parquet".format(333)  # random.randrange(200))
    print("using name=" + name)
    data0_1.write.parquet(name)

    read_df = spark.read.parquet(name)
    # print(f"Read {read_df.count()} records. Should be {length} records.")  # f-string variant
    print("Read {} records. Should be {} records.".format(
        read_df.count(), length))  # for python 3
""" Simple distributed implementation of the K-Means algorithm using Tensorflow. """ import tensorflow as tf import tensorframes as tfs from pyspark.mllib.random import RandomRDDs import numpy as np num_features = 4 k = 2 # TODO: does not work with 1 data = RandomRDDs.normalVectorRDD( sc, numCols=num_features, numRows=100, seed=1).map(lambda v: [v.tolist()]) df = sqlContext.createDataFrame(data).toDF("features") # For now, analysis is still required. df0 = tfs.analyze(df) init_centers = np.random.randn(k, num_features) # For debugging block = np.array(data.take(10))[::,0,::] # Find the distances first with tf.Graph().as_default() as g: points = tf.placeholder(tf.double, shape=[None, num_features], name='points') num_points = tf.shape(points)[0] #centers = tf.placeholder(tf.double, shape=[k, num_features], name='centers')
import json

from IPython.display import Javascript, HTML
from pyspark import SparkConf, SparkContext
from pyspark.mllib.random import RandomRDDs

# "Import" d3. This adds d3 v5 to the output window iframe.
HTML("<script src='https://d3js.org/d3.v5.min.js'></script>")

# Create the spark context
conf = SparkConf().setAppName("strata_distributions")
sc = SparkContext(conf=conf)

# Generate an RDD of 10000 random numbers that follow a normal distribution.
x = RandomRDDs.normalRDD(sc, 10000, seed=1)
x.take(10)

# The code below creates a histogram with 500 bins from the random number RDD.
# It is then converted to a JSON string using `json.dumps`, and the output is
# pushed to the `hist` variable in the browser using `Javascript`. This is then
# accessible as input data for d3.
Javascript("window.hist = {}".format(json.dumps(x.histogram(500)[1])))

# Create an svg element in the output window to use for the first implementation.
HTML("""
<div id='div_svg'><svg id='svg_hist' width='500' height='200'></svg></div>
""")
""" Simple distributed implementation of the K-Means algorithm using Tensorflow. """ import tensorflow as tf import tensorframes as tfs from pyspark.mllib.random import RandomRDDs import numpy as np num_features = 4 k = 2 # TODO: does not work with 1 data = RandomRDDs.normalVectorRDD(sc, numCols=num_features, numRows=100, seed=1).map(lambda v: [v.tolist()]) df = sqlContext.createDataFrame(data).toDF("features") # For now, analysis is still required. df0 = tfs.analyze(df) init_centers = np.random.randn(k, num_features) # For debugging block = np.array(data.take(10))[::, 0, ::] # Find the distances first with tf.Graph().as_default() as g: points = tf.placeholder(tf.double, shape=[None, num_features], name='points') num_points = tf.shape(points)[0]
""" Testing with Random data generation https://spark.apache.org/docs/latest/mllib-statistics.html """ from pyspark.mllib.random import RandomRDDs from pyspark import SparkContext sc = SparkContext("local", "Rubbish") # Generate a random double RDD that contains 1 million i.i.d. values drawn from the # standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. u = RandomRDDs.uniformRDD(sc, 1000000L, 10) # Apply a transform to get a random double RDD following `N(1, 4)`. v = u.map(lambda x: 1.0 + 2.0 * x) print v
def build_scenarios(self):
    nb_timesteps = self.timesteps.size
    return RandomRDDs.normalVectorRDD(sc, self.nb_scenarios, nb_timesteps, seed=1)
print(goodnessOfFitTestResults)

# Pearson's independence test on a matrix
mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0])
independenceTestResults = Statistics.chiSqTest(mat)
print(independenceTestResults)

# A contingency table can be constructed from an RDD of LabeledPoint/vector pairs.
# The resulting test returns a chi-squared test result for every feature against the label.
obs = sc.parallelize([
    LabeledPoint(1.0, [1.0, 0.0, 3.0]),
    LabeledPoint(1.0, [1.0, 2.0, 0.0]),
    LabeledPoint(1.0, [-1.0, 0.0, -0.5])
])
featureTestResults = Statistics.chiSqTest(obs)
for i, result in enumerate(featureTestResults):
    print('column {0}: \n {1}'.format(i, result))

## random data generation
from pyspark.mllib.random import RandomRDDs

# Generate a random RDD that contains a million i.i.d. values drawn from a normal
# distribution N(0, 1), distributed evenly across 10 partitions.
u = RandomRDDs.normalRDD(sc, size=1000000, numPartitions=10)
print(u.take(20))

# Apply a transformation to get a random RDD that follows a normal distribution N(1, 4).
v = u.map(lambda x: 1.0 + 2.0 * x)
print(v.take(20))
import sys

from pyspark import SparkContext
from pyspark.mllib.random import RandomRDDs
from math import hypot


def dist(p):
    return hypot(p[0] - 0.5, p[1] - 0.5)


sc = SparkContext("local", "Monte Carlo Integration Pi Approximation")

num_samples = int(sys.argv[1])
a = RandomRDDs.uniformVectorRDD(sc, num_samples, 2)
num = a.map(dist).filter(lambda d: d < 0.5).count()
print(4 * num / num_samples)
from pyspark.mllib.random import RandomRDDs
from pyspark import SparkContext
from pyspark.sql import Row, SparkSession
import pyspark.sql.functions as f
from pyspark.sql.functions import coalesce

sc = SparkContext()
spark = SparkSession(sc)

# two uniform RDDs of 100044 values each, in 2 partitions
Rdd = RandomRDDs.uniformRDD(sc, 100044, 2)
Rdd2 = RandomRDDs.uniformRDD(sc, 100044, 2)

# consumption values, sorted ascending
Rdd_cons = Rdd.map(lambda x: 102.83547008547009 + 102.85047727 * x)
Rdd_cons = Rdd_cons.sortBy(lambda x: x)

# population values, sorted ascending and offset
Rdd_pop = Rdd2.map(lambda x: 3401 + 150000 * x)
Rdd_pop = Rdd_pop.sortBy(lambda x: x)
Rdd_pop = Rdd_pop.map(lambda x: int(x + 6071639))

# month index 1..100044
mois = []
for i in range(100044):
    mois.append(i + 1)
Rdd_mois = sc.parallelize(mois, 2)

colone1 = Row("consomation")
colone2 = Row("population")
colone3 = Row("mois")

df_cons = Rdd_cons.map(colone1).toDF()
df_pop = Rdd_pop.map(colone2).toDF()
df_mois = Rdd_mois.map(colone3).toDF()

# add a shared row id so the three single-column dataframes can be joined
df_mois = df_mois.withColumn('ligne_id', f.monotonically_increasing_id())
df_pop = df_pop.withColumn('ligne_id', f.monotonically_increasing_id())
df_cons = df_cons.withColumn('ligne_id', f.monotonically_increasing_id())

df = df_mois.join(df_pop, on=["ligne_id"]).sort("ligne_id")
df = df.join(df_cons, on=["ligne_id"]).sort("ligne_id")
def generate_random_uniform_df(nrows, ncols):
    df = RandomRDDs.uniformVectorRDD(spark.sparkContext, nrows, ncols) \
        .map(lambda a: a.tolist()).toDF()
    return df
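# Hypothetical usage of the helper above (assumes an active SparkSession named `spark`,
# as the helper itself does): a 1,000-row, 5-column DataFrame of U(0, 1) values.
sample_df = generate_random_uniform_df(1000, 5)
sample_df.show(5)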
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.random import RandomRDDs
from time import time

print("########################################")
print("STARTING")
print("########################################")

sc = SparkContext(appName="speedtest-nrb")
sql_context = SQLContext(sc)

start = time()
t = time()

print("creating dataframes df and df2")
df = RandomRDDs.uniformVectorRDD(sc, 100000000, 2).map(lambda a: a.tolist()).toDF()
df2 = RandomRDDs.uniformVectorRDD(sc, 100000000, 2).map(lambda a: a.tolist()).toDF()
print(time() - t)
t = time()

print("counting df")
df.count()
print(time() - t)
t = time()

print("counting df")
df.count()
print(time() - t)
t = time()
# (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from pyspark.mllib.random import RandomRDDs from pyspark import SparkContext # $example on$ from pyspark.mllib.linalg import Matrices, Vectors from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.stat import Statistics # $example off$ if __name__ == "__main__": sc = SparkContext(appName="RandomDataGenerationExample") # Generate a random double RDD that contains 1 million i.i.d. values drawn from the # standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. u = RandomRDDs.normalRDD(sc, 1000000, 10) # Apply a transform to get a random double RDD following `N(1, 4)`. v = u.map(lambda x: 1.0 + 2.0 * x) print(v)
#!/usr/bin/env python
# coding: utf-8

# In[65]:

import numpy as np
from pyspark.mllib.random import RandomRDDs
from pyspark.mllib.clustering import KMeans, KMeansModel


# In[66]:

c1_v = RandomRDDs.normalVectorRDD(sc, 20, 2, numPartitions=2, seed=1).map(lambda v: np.add([1, 5], v))


# In[67]:

c1_v.stats()


# In[68]:

c2_v = RandomRDDs.normalVectorRDD(sc, 20, 2, numPartitions=2, seed=1).map(lambda v: np.add([5, 1], v))


# In[69]:

c2_v.stats()


# In[70]:

c3_v = RandomRDDs.normalVectorRDD(sc, 20, 2, numPartitions=2, seed=1).map(lambda v: np.add([4, 6], v))
pts = int(30)  # number of points to be generated
k = int(3)     # number of clusters (also used as the partition count)
dim = int(4)   # dimension of the data

for i in range(1, 100):
    dev = int(i)
    file_name = "out_dev_" + str(i) + '.csv'

    rdd = sc.parallelize(range(0, k))
    # random mean for each cluster
    clust_mean = rdd.map(lambda cluster: (
        cluster, random.sample(list(numpy.arange(val_min, val_max, ETAPES)), dim)))

    valeurs_vector_alea = RandomRDDs.normalVectorRDD(sc, numRows=pts, numCols=dim,
                                                     numPartitions=k, seed=1)

    # assign a random cluster to each point
    cluster_valeur_normales_vector = valeurs_vector_alea.map(
        lambda point: (random.randint(0, k - 1), point.tolist()))

    # generate a value depending on the cluster mean, the standard deviation and the normal value
    pts_valeur_vector = cluster_valeur_normales_vector.join(clust_mean).map(
        lambda x: (point_valeurs(x[1][1], x[1][0], dev, x[0], dim)))

    # inspect the result
    print(pts_valeur_vector.collect())

    # write the point values into a single csv file
    # write_into_csv(file_name, pts_valeur_vector)

    # save the rdd using saveAsTextFile
    pts_valeur_vector.saveAsTextFile(file_name)
# A script to execute k-means clustering in Spark.
# To run, enter: >>> exec(open("./dokmeans.py").read())

import numpy as np
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import KMeans

# to generate random data RDDs we need this package
from pyspark.mllib.random import RandomRDDs

# Let's generate random class data, adding a cluster center to random 2D points.
# Use the default number of partitions, or a definite number, so that the union
# will have samples across clusters.
c1_v = RandomRDDs.normalVectorRDD(sc, 20, 2, numPartitions=2, seed=1).map(lambda v: np.add([1, 5], v))
c2_v = RandomRDDs.normalVectorRDD(sc, 16, 2, numPartitions=2, seed=2).map(lambda v: np.add([5, 1], v))
c3_v = RandomRDDs.normalVectorRDD(sc, 12, 2, numPartitions=2, seed=3).map(lambda v: np.add([4, 6], v))

# concatenate RDDs with the .union(other) function
c12 = c1_v.union(c2_v)
my_data = c12.union(c3_v)  # this now has all points, as an RDD

my_kmmodel = KMeans.train(my_data, k=1, maxIterations=20,
                          initializationMode='k-means||', seed=10)

# Try help(KMeans.train) to see parameter options:
# k is the number of desired clusters.
# maxIterations is the maximum number of iterations to run.
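# A short follow-up (not part of the original script) showing how the trained model
# could be inspected and used to assign points to clusters:
print(my_kmmodel.clusterCenters)            # learned center coordinates
print(my_kmmodel.predict(my_data).take(10))  # cluster index for the first few points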
from pyspark.sql import SparkSession
from pyspark.mllib.random import RandomRDDs
from pyspark.sql.types import *
# from pyspark.sql.functions import *
from pyspark.sql.types import Row

spark = SparkSession.builder.config("spark.sql.crossJoin.enabled", "true").getOrCreate()
sc = spark.sparkContext

n = 500

# create rdds of random floats (RandomRDDs needs the SparkContext, not the SparkSession)
nRow = n
nCol = 4
seed = 5
numPartitions = 32
rdd1 = RandomRDDs.normalVectorRDD(sc, nRow, nCol, numPartitions, seed)
seed = 3
rdd2 = RandomRDDs.normalVectorRDD(sc, nRow, nCol, numPartitions, seed)

# convert each vector in the rdd to a row
randomNumberRdd1 = rdd1.map(
    lambda x: Row(A=float(x[0]), B=float(x[1]), C=float(x[2]), D=float(x[3])))
randomNumberRdd2 = rdd2.map(
    lambda x: Row(E=float(x[0]), F=float(x[1]), G=float(x[2]), H=float(x[3])))

# create dataframe from rdd
schemaRandomNumberDF1 = spark.createDataFrame(randomNumberRdd1)
schemaRandomNumberDF2 = spark.createDataFrame(randomNumberRdd2)

# cache the dataframe
def test_to_java_object_rdd(self):  # SPARK-6660
    data = RandomRDDs.uniformRDD(self.sc, 10, 5, seed=0)
    self.assertEqual(_to_java_object_rdd(data).count(), 10)
def test_col_norms(self):
    data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
    summary = Statistics.colStats(data)
    self.assertEqual(10, len(summary.normL1()))
    self.assertEqual(10, len(summary.normL2()))
def generate_csv_hdfs(spark, row, col, path, num_partition=3):
    sc = spark.sparkContext
    rdd = RandomRDDs.uniformVectorRDD(sc, row, col, num_partition)
    lines = rdd.map(toCSVLine)
    lines.saveAsTextFile(path)
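# `toCSVLine` is assumed to be defined elsewhere; a minimal sketch (not the original
# helper): it joins the components of each random vector into one comma-separated line.
def toCSVLine(vector):
    return ','.join(str(value) for value in vector)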
def test_to_java_object_rdd(self):  # SPARK-6660
    data = RandomRDDs.uniformRDD(self.sc, 10, 5, seed=0)
    self.assertEqual(_to_java_object_rdd(data).count(), 10)
def generate_spark_matrix(nrows: int, ncols: int, spark):
    df = RandomRDDs.uniformVectorRDD(spark.sparkContext, nrows, ncols) \
        .map(lambda a: a.tolist()) \
        .toDF() \
        .repartition(int(nrows / partition_factor)) \
        .persist()
    return df
import sys

from pyspark import SparkContext
from pyspark.mllib.random import RandomRDDs

if __name__ == "__main__":
    if len(sys.argv) not in [1, 2]:
        print("Usage: random_rdd_generation", file=sys.stderr)
        sys.exit(-1)

    sc = SparkContext(appName="PythonRandomRDDGeneration")

    numExamples = 10000  # number of examples to generate
    fraction = 0.1  # fraction of data to sample

    # Example: RandomRDDs.normalRDD
    normalRDD = RandomRDDs.normalRDD(sc, numExamples)
    print('Generated RDD of %d examples sampled from the standard normal distribution'
          % normalRDD.count())
    print('  First 5 samples:')
    for sample in normalRDD.take(5):
        print('    ' + str(sample))
    print()

    # Example: RandomRDDs.normalVectorRDD
    normalVectorRDD = RandomRDDs.normalVectorRDD(sc, numRows=numExamples, numCols=2)
    print('Generated RDD of %d examples of length-2 vectors.' % normalVectorRDD.count())
    print('  First 5 samples:')
    for sample in normalVectorRDD.take(5):
        print('    ' + str(sample))
    print()
# ## Generate Spark DataFrame Data

# We'll generate sample data for a multivariate linear regression with known
# coefficients and randomly generated error. Specifically:
# $$ y = \beta_0 + \sum_i (\beta_i x_i) + \epsilon \quad \forall i \in {1..3} $$
# $$ \beta_0: 4 $$
# $$ \beta_1: 6 $$
# $$ \beta_2: 2 $$
# $$ \beta_3: -1 $$

from pyspark.mllib.random import RandomRDDs
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
import pandas as pd
import numpy as np

x1 = RandomRDDs.uniformRDD(spark, 10000).map(lambda x: 6.0 * x - 2)
epsilon = RandomRDDs.normalRDD(spark, 10000).map(lambda x: 0.04 * x)


def gen_poly(x):
    # features: intercept term, x, x^2, x^3
    x0 = 1.0
    x1 = float(x[0])
    x2 = float(np.power(x1, 2))
    x3 = float(np.power(x1, 3))
    X = Vectors.dense(x0, x1, x2, x3)
    epsilon = float(x[1])
    y = 4.0 + 6.0 * x1 + 2.0 * x2 - 1 * x3 + epsilon
    return (y, X)


gen_dat = x1.zip(epsilon).map(gen_poly)
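# A possible next step (not shown in the original notebook): wrap the generated
# (y, X) pairs as LabeledPoints and fit an mllib linear regression to recover the
# coefficients. This is a sketch under that assumption, not the author's code, and
# the SGD step size may need tuning.
from pyspark.mllib.regression import LinearRegressionWithSGD

labeled = gen_dat.map(lambda row: LabeledPoint(row[0], row[1]))
model = LinearRegressionWithSGD.train(labeled, iterations=100, step=0.01, intercept=False)
print(model.weights)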
def construct_hyperplanes(num_hp_arrangements, num_hp_per_arrangement, ambient_dimension):
    num_hps = num_hp_arrangements * num_hp_per_arrangement
    all_hp_rdd = RandomRDDs.normalVectorRDD(sc, num_hps, ambient_dimension)
    return np.matrix(all_hp_rdd.collect())
def gen_poly(x):
    x1 = float(x[0])
    x2 = float(np.power(x1, 2))
    x3 = float(np.power(x1, 3))
    X = Vectors.dense(x1, x2, x3)
    epsilon = float(x[1])
    y = 4.0 + 6.0 * x1 + 2.0 * x2 - 1 * x3 + epsilon
    return (y, X)


def gen_df_ml(sc=None, B=(4, 6, 2, -1), n=10000, rng=(-2, 4), err=(0, 4)):
    sc = get_sc() if sc is None else sc
    x1 = RandomRDDs.uniformRDD(spark, 10000).map(lambda x: np.diff(rng) * x + np.min(rng))
    epsilon = RandomRDDs.normalRDD(spark, 10000).map(lambda x: err[0] + err[1] * x)
    dat_df = x1.zip(epsilon).map(gen_poly)
    return dat_df


# The sparklyr equivalent of gen_df_ml, written in R:
# gen_df_sparklyr <- function(sc=get_sc(), B=c(4,6,2,-1), n=10000, rng=c(-2,4), err=c(0,4)) {
#   dat <- gen_dat_r(B, n, rng, err)
#   return(copy_to(sc, gen_dat_r(), "df", TRUE))
# }
count_cluster = int(sys.argv[3])  # number of clusters
dimension = int(sys.argv[4])      # dimension of the data
std = int(sys.argv[5])            # standard deviation

noise_points = points * 2  # number of noise points to generate (double the number of points)
file_name_noise = sys.argv[1] + '-noise.csv'  # file name for the generated noise points

sc = SparkContext("local", "generator")  # spark context

# array of the clusters: clusters = [0, 1, 2]
clusters = sc.parallelize(range(0, count_cluster))

# random means of each cluster: means_cluster = [(0, [0.6, 80.9]), (1, [57.8, 20.2]), (2, [15.6, 49.9])]
means_cluster = clusters.map(lambda cluster: (
    cluster, random.sample(list(numpy.arange(MIN_MEAN_VALUE, MAX_MEAN_VALUE, STEPS)), dimension)))

# create a random vector using normalVectorRDD
random_values_vector = RandomRDDs.normalVectorRDD(sc, numRows=points, numCols=dimension,
                                                  numPartitions=count_cluster, seed=1)

# assign a random cluster to each point
cluster_normal_values_vector = random_values_vector.map(
    lambda point: (random.randint(0, count_cluster - 1), point.tolist()))

# generate a value depending on the cluster mean, the standard deviation and the normal value
points_value_vector = cluster_normal_values_vector.join(means_cluster).map(
    lambda x: point_values(x[1][1], x[1][0], std, x[0], dimension))
print(points_value_vector.collect())

# generate random points that represent noise
noise_points_vector = sc.parallelize(range(0, noise_points)).map(
    lambda x: random.sample(list(numpy.arange(MIN_MEAN_VALUE, MAX_MEAN_VALUE, STEPS)), dimension)).map(
    lambda v: noise_values(v))
# noise_points_vector = noise_points_vector.map(lambda row: str(row).replace("[", "").replace("]", ""))
print(noise_points_vector.collect())
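# `point_values` and `noise_values` are assumed to be defined elsewhere in the script;
# a plausible sketch of both (assumptions, not the original helpers): shift each normal
# value by its cluster mean and scale by the standard deviation, and tag noise points
# with a sentinel cluster id.
def point_values(means_value, normal_value, std, cluster, dimension):
    # Hypothetical: one coordinate per dimension, centred on the cluster mean.
    coords = [means_value[d] + std * normal_value[d] for d in range(dimension)]
    return coords + [cluster]


def noise_values(coords):
    # Hypothetical: noise points keep their random coordinates and get cluster id -1.
    return list(coords) + [-1]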