Example #1
def generateUserProfiles(R, d, seed, sparkContext, N):
    """
        Generate the user profiles from rdd R and store them in an RDD containing tuples of the form
            (i,ui)
        where ui is a random np.array of dimension d.

        The random uis are generated using normalVectorRDD(), a function in RandomRDDs.

        Inputs are:
             - R: an RDD that contains the ratings in (user, item, rating) form
             - d: the dimension of the user profiles
             - seed: a seed to be used in generating the random vectors
             - sparkContext: a spark context
             - N: the number of partitions to be used during joins, etc.

        The return value is an RDD containing the user profiles
    """
    # extract user ids
    U = R.map(lambda inp: inp[0]).distinct(numPartitions=N)
    numUsers = U.count()
    randRDD = RandomRDDs.normalVectorRDD(sparkContext,
                                         numUsers,
                                         d,
                                         numPartitions=N,
                                         seed=seed)
    U = U.zipWithIndex().map(swap)
    randRDD = randRDD.zipWithIndex().map(swap)
    return U.join(randRDD, numPartitions=N).values()
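Both this function and generateItemProfiles below rely on a small swap helper that is not shown in the excerpt; a plausible definition (an assumption, not part of the original code) is:

def swap(pair):
    # zipWithIndex() yields (value, index); swapping makes the positional index the key,
    # so the id RDD and the random-vector RDD can be joined row by row
    return (pair[1], pair[0])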
Example #2
def generateItemProfiles(R, d, seed, sparkContext, N):
    """ Generate the item profiles from rdd R and store them in an RDD containing tuples of the form
            (j,vj)
        where vj is a random np.array of dimension d.

        The random vjs are generated using normalVectorRDD(), a function in RandomRDDs.

        Inputs are:
             - R: an RDD that contains the ratings in (user, item, rating) form
             - d: the dimension of the item profiles
             - seed: a seed to be used in generating the random vectors
             - sparkContext: a spark context
             - N: the number of partitions to be used during joins, etc.

        The return value is an RDD containing the item profiles
    """
    # extract item ids
    V = R.map(lambda inp: inp[1]).distinct(numPartitions=N)
    numItems = V.count()
    randRDD = RandomRDDs.normalVectorRDD(sparkContext,
                                         numItems,
                                         d,
                                         numPartitions=N,
                                         seed=seed)
    V = V.zipWithIndex().map(swap)
    randRDD = randRDD.zipWithIndex().map(swap)
    return V.join(randRDD, numPartitions=N).values()
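A minimal usage sketch for the two generators, assuming an existing SparkContext named sc, the RandomRDDs import, and the swap helper above (the tiny ratings RDD here is illustrative):

R = sc.parallelize([(0, 10, 4.0), (0, 11, 2.0), (1, 10, 5.0)])
userProfiles = generateUserProfiles(R, d=5, seed=42, sparkContext=sc, N=2)
itemProfiles = generateItemProfiles(R, d=5, seed=43, sparkContext=sc, N=2)
# each element is an (id, np.array of length d) pair
print(userProfiles.take(1))
print(itemProfiles.take(1))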
Example #3
    def test_col_norms(self):
        data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
        summary = Statistics.colStats(data)
        self.assertEqual(10, len(summary.normL1()))
        self.assertEqual(10, len(summary.normL2()))

        data2 = self.sc.parallelize(range(10)).map(lambda x: Vectors.dense(x))
        summary2 = Statistics.colStats(data2)
        self.assertEqual(array([45.0]), summary2.normL1())
        import math
        expectedNormL2 = math.sqrt(sum(map(lambda x: x*x, range(10))))
        self.assertTrue(math.fabs(summary2.normL2()[0] - expectedNormL2) < 1e-14)
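Outside of a test harness, the same summary object exposes the full set of column statistics; a brief sketch, assuming an existing SparkContext named sc:

from pyspark.mllib.random import RandomRDDs
from pyspark.mllib.stat import Statistics

data = RandomRDDs.normalVectorRDD(sc, numRows=1000, numCols=10, numPartitions=10, seed=7)
summary = Statistics.colStats(data)
print(summary.mean())      # per-column means, close to 0 for standard normal data
print(summary.variance())  # per-column variances, close to 1
print(summary.count())     # number of rows
print(summary.normL1())    # per-column L1 norms
print(summary.normL2())    # per-column L2 norms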
Example #4
 def test_col_with_different_rdds(self):
     # numpy
     data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
     summary = Statistics.colStats(data)
     self.assertEqual(1000, summary.count())
     # array
     data = self.sc.parallelize([range(10)] * 10)
     summary = Statistics.colStats(data)
     self.assertEqual(10, summary.count())
     # array
     data = self.sc.parallelize([pyarray.array("d", range(10))] * 10)
     summary = Statistics.colStats(data)
     self.assertEqual(10, summary.count())
Example #5
from pyspark.sql import SparkSession
from pyspark.mllib.random import RandomRDDs
from pyspark.sql.types import *
#from pyspark.sql.functions import *
from pyspark.sql.types import Row

spark = SparkSession.builder.config("spark.sql.crossJoin.enabled",
                                    "true").getOrCreate()
sc = spark.sparkContext

n = 500

# create rdd of random floats
nRow = n
nCol = 4
seed = 5
numPartitions = 32

# normalVectorRDD takes a SparkContext, not a SparkSession
rdd1 = RandomRDDs.normalVectorRDD(sc, nRow, nCol, numPartitions, seed)
seed = 3
rdd2 = RandomRDDs.normalVectorRDD(sc, nRow, nCol, numPartitions, seed)

# convert each tuple in the rdd to a row
randomNumberRdd1 = rdd1.map(
    lambda x: Row(A=float(x[0]), B=float(x[1]), C=float(x[2]), D=float(x[3])))
randomNumberRdd2 = rdd2.map(
    lambda x: Row(E=float(x[0]), F=float(x[1]), G=float(x[2]), H=float(x[3])))

# create dataframe from rdd
schemaRandomNumberDF1 = spark.createDataFrame(randomNumberRdd1)
schemaRandomNumberDF2 = spark.createDataFrame(randomNumberRdd2)

# cache the dataframe
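The snippet breaks off at this comment; a plausible continuation, assuming the intent was to cache both DataFrames and then pair them using the cross join enabled above:

schemaRandomNumberDF1.cache()
schemaRandomNumberDF2.cache()

# pair every row of the first DataFrame with every row of the second
combinedDF = schemaRandomNumberDF1.crossJoin(schemaRandomNumberDF2)
combinedDF.show(5)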
Example #6
def generate_random_vector(sc, path, numRows, numCols, partitions,
                           standard_deviation):
    # shift the standard-normal samples to a mean of 10, scale them by
    # standard_deviation, then keep the first four features and append a
    # random partition label (this assumes numCols >= 4; `path` is unused
    # in this excerpt)
    normalRDD = RandomRDDs.normalVectorRDD(sc, numRows, numCols, partitions, seed=1)\
                           .map(lambda l: 10 + standard_deviation * l)\
                           .map(lambda l: [l[0], l[1], l[2], l[3], random.randint(0, partitions - 1)])
    return normalRDD
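A usage sketch, under the assumption that a SparkContext named sc and the random/RandomRDDs imports are already in scope (the argument values are illustrative):

rdd = generate_random_vector(sc, path=None, numRows=1000, numCols=4,
                             partitions=8, standard_deviation=2.0)
print(rdd.take(3))  # rows of four shifted, scaled features plus a random partition label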
Example #7
""" Simple distributed implementation of the K-Means algorithm using Tensorflow.
"""

import tensorflow as tf
import tensorframes as tfs
from pyspark.mllib.random import RandomRDDs
import numpy as np

num_features = 4
k = 2
# TODO: does not work with 1
data = RandomRDDs.normalVectorRDD(
    sc,
    numCols=num_features,
    numRows=100,
    seed=1).map(lambda v: [v.tolist()])
df = sqlContext.createDataFrame(data).toDF("features")

# For now, analysis is still required.
df0 = tfs.analyze(df)

init_centers = np.random.randn(k, num_features)

# For debugging
block = np.array(data.take(10))[::,0,::]

# Find the distances first
with tf.Graph().as_default() as g:
    points = tf.placeholder(tf.double, shape=[None, num_features], name='points')
    num_points = tf.shape(points)[0]
    #centers = tf.placeholder(tf.double, shape=[k, num_features], name='centers')
Example #8
def construct_hyperplanes(num_hp_arrangements, num_hp_per_arrangement,
                          ambient_dimension):
    num_hps = num_hp_arrangements * num_hp_per_arrangement
    all_hp_rdd = RandomRDDs.normalVectorRDD(sc, num_hps, ambient_dimension)
    return np.matrix(all_hp_rdd.collect())
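The returned matrix has one row per hyperplane normal; a small illustrative call (the parameter values are arbitrary):

# 4 arrangements of 8 hyperplanes each in a 16-dimensional ambient space
hp = construct_hyperplanes(4, 8, 16)
print(hp.shape)  # (32, 16): num_hp_arrangements * num_hp_per_arrangement rows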
Example #9
""" Simple distributed implementation of the K-Means algorithm using Tensorflow.
"""

import tensorflow as tf
import tensorframes as tfs
from pyspark.mllib.random import RandomRDDs
import numpy as np

num_features = 4
k = 2
# TODO: does not work with 1
data = RandomRDDs.normalVectorRDD(sc,
                                  numCols=num_features,
                                  numRows=100,
                                  seed=1).map(lambda v: [v.tolist()])
df = sqlContext.createDataFrame(data).toDF("features")

# For now, analysis is still required.
df0 = tfs.analyze(df)

init_centers = np.random.randn(k, num_features)

# For debugging
block = np.array(data.take(10))[::, 0, ::]

# Find the distances first
with tf.Graph().as_default() as g:
    points = tf.placeholder(tf.double,
                            shape=[None, num_features],
                            name='points')
    num_points = tf.shape(points)[0]
Example #10
 def build_scenarios(self):
     nb_timesteps = self.timesteps.size
     return RandomRDDs.normalVectorRDD(sc,
                                       self.nb_scenarios,
                                       nb_timesteps,
                                       seed=1)
Example #11
 def test_col_norms(self):
     data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
     summary = Statistics.colStats(data)
     self.assertEqual(10, len(summary.normL1()))
     self.assertEqual(10, len(summary.normL2()))
Example #12

import sys

from pyspark import SparkContext
from pyspark.mllib.random import RandomRDDs

if __name__ == "__main__":
    if len(sys.argv) not in [1, 2]:
        print("Usage: random_rdd_generation", file=sys.stderr)
        sys.exit(-1)

    sc = SparkContext(appName="PythonRandomRDDGeneration")

    numExamples = 10000  # number of examples to generate
    fraction = 0.1  # fraction of data to sample

    # Example: RandomRDDs.normalRDD
    normalRDD = RandomRDDs.normalRDD(sc, numExamples)
    print('Generated RDD of %d examples sampled from the standard normal distribution'
          % normalRDD.count())
    print('  First 5 samples:')
    for sample in normalRDD.take(5):
        print('    ' + str(sample))
    print()

    # Example: RandomRDDs.normalVectorRDD
    normalVectorRDD = RandomRDDs.normalVectorRDD(sc, numRows=numExamples, numCols=2)
    print('Generated RDD of %d examples of length-2 vectors.' % normalVectorRDD.count())
    print('  First 5 samples:')
    for sample in normalVectorRDD.take(5):
        print('    ' + str(sample))
    print()

    sc.stop()
Example #13
#!/usr/bin/env python
# coding: utf-8

# In[65]:

import numpy as np
from pyspark.mllib.random import RandomRDDs
from pyspark.mllib.clustering import KMeans, KMeansModel

# In[66]:

c1_v = RandomRDDs.normalVectorRDD(sc, 20, 2, numPartitions=2,
                                  seed=1).map(lambda v: np.add([1, 5], v))

# In[67]:

c1_v.stats()

# In[68]:

c2_v = RandomRDDs.normalVectorRDD(sc, 20, 2, numPartitions=2,
                                  seed=1).map(lambda v: np.add([5, 1], v))

# In[69]:

c2_v.stats()

# In[70]:

c3_v = RandomRDDs.normalVectorRDD(sc, 20, 2, numPartitions=2,
                                  seed=1).map(lambda v: np.add([4, 6], v))
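The notebook stops after generating the three clusters; a plausible continuation, following the same pattern as the script in the next example (the choice of k and seed here is an assumption):

# In[71]:

my_data = c1_v.union(c2_v).union(c3_v)  # all sampled points in one RDD

# In[72]:

model = KMeans.train(my_data, k=3, maxIterations=20,
                     initializationMode='k-means||', seed=10)
model.clusterCenters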
Example #14
#A script to execute kmeans clustering in spark
#to run enter: >>> exec(open("./dokmeans.py").read())

import numpy as np
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import KMeans


#to generate random data RDDs we need this package
from pyspark.mllib.random import RandomRDDs

#let's generate random class data, add in a cluster center to random 2D points

#use the default number of partitions, or use a definite number so that the union
#  will have samples across clusters
c1_v=RandomRDDs.normalVectorRDD(sc,20,2,numPartitions=2,seed=1).map(lambda v:np.add([1,5],v))
c2_v=RandomRDDs.normalVectorRDD(sc,16,2,numPartitions=2,seed=2).map(lambda v:np.add([5,1],v))
c3_v=RandomRDDs.normalVectorRDD(sc,12,2,numPartitions=2,seed=3).map(lambda v:np.add([4,6],v))

#concatenate RDDs with the .union(other) function
c12    =c1_v.union(c2_v)
my_data=c12.union(c3_v)   #this now has all points, as an RDD


my_kmmodel = KMeans.train(my_data,k=1,
               maxIterations=20,
               initializationMode='k-means||',seed=10)

#try: help(KMeans.train)  to see parameter options
#k is the number of desired clusters.
#maxIterations is the maximum number of iterations to run.

Example #15
pts = int(30)  # number of points to be generated
k = int(3)  # number of clusters (also used as the number of partitions)
dim = int(4)  # dimension of the data

for i in range(1, 100):
    dev = int(i)
    file_name = "out_dev_" + str(i) + '.csv'
    rdd = sc.parallelize(range(0, k))
    clust_mean = rdd.map(lambda cluster: (
        cluster,
        random.sample(list(numpy.arange(val_min, val_max, ETAPES)), dim)))
    valeurs_vector_alea = RandomRDDs.normalVectorRDD(sc,
                                                     numRows=pts,
                                                     numCols=dim,
                                                     numPartitions=k,
                                                     seed=1)
    # assign a random cluster to each point
    cluster_valeur_normales_vector = valeurs_vector_alea.map(
        lambda point: (random.randint(0, k - 1), point.tolist()))
    # generate a value that depends on the cluster mean, the standard deviation and the normal sample
    pts_valeur_vector = cluster_valeur_normales_vector.join(clust_mean).map(
        lambda x: (point_valeurs(x[1][1], x[1][0], dev, x[0], dim)))
    # inspect the result
    print(pts_valeur_vector.collect())
    # write the point values into a single csv file
    # write_into_csv(file_name, pts_valeur_vector);
    # saving rdd using saveAsTextFile
    pts_valeur_vector.saveAsTextFile(file_name)
Example #16
count_cluster = int(sys.argv[3]) # number of clusters
dimension = int(sys.argv[4]) # dimension of the data
std = int(sys.argv[5]) # standard deviation
noise_points = points * 2 # number of noise points to be generated / double the number of points
file_name_noise = sys.argv[1] + '-noise.csv' # file name for noise points to be generated

sc = SparkContext("local", "generator") # spark context

# array of the clusters : clusters = [0, 1, 2]
clusters = sc.parallelize(range(0, count_cluster))

# random means of each cluster : means_cluster = [ (0, [0.6, 80.9]), (1, [57.8, 20.2]), (2, [15.6, 49.9]) ]
means_cluster = clusters.map(lambda cluster : (cluster, random.sample(list(numpy.arange(MIN_MEAN_VALUE, MAX_MEAN_VALUE, STEPS)), dimension)))

# creating a random vector using normalVectorRDD
random_values_vector = RandomRDDs.normalVectorRDD(sc, numRows = points, numCols = dimension, numPartitions = count_cluster, seed = 1)

# assigning a random cluster to each point
cluster_normal_values_vector = random_values_vector.map(lambda point : (random.randint(0, count_cluster - 1), point.tolist()))

# generate a value depending on the mean of the cluster, the standard deviation and the normal value
points_value_vector = cluster_normal_values_vector.join(means_cluster).map(lambda x: point_values(x[1][1], x[1][0], std, x[0], dimension))

print(points_value_vector.collect())

# generate random points that represent noise points
noise_points_vector = sc.parallelize(range(0, noise_points)).map(lambda x : random.sample(list(numpy.arange(MIN_MEAN_VALUE, MAX_MEAN_VALUE, STEPS)), dimension)).map(lambda v: noise_values(v))
        
# noise_points_vector = noise_points_vector.map(lambda row : str(row).replace("[", "").replace("]",""))
print(noise_points_vector.collect())