Example #1
def housing_data_generator(start=0, n_points=1000000, n_jobs=4):

    # step = int(math.ceil(n_points/n_jobs))

    # Parallel(n_jobs=n_jobs)(delayed(house_data)(i, i+step) for i in range(start, n_points, step))

    from pyspark.mllib.random import RandomRDDs
    from pyspark.sql import SparkSession
    from pyspark.sql import DataFrame
    from pyspark.sql import SQLContext, Row

    import math
    scSpark = SparkSession \
        .builder \
        .appName("reading csv") \
        .getOrCreate()

    columns = ["id", "size", "loc", "rooms", "bathrooms", "year", "price"]

    u = RandomRDDs.uniformRDD(scSpark, n_points, 2).map(lambda x: math.ceil(
        house_size[0] + (house_size[-1] - house_size[0]) * x)).zipWithIndex()
    u = scSpark.createDataFrame(u, ["size", "id"])

    for col in columns[2:]:
        v = RandomRDDs.uniformRDD(
            scSpark, n_points, 2).map(lambda x: math.ceil(house_size[0] + (
                house_size[-1] - house_size[0]) * x)).zipWithIndex()
        v = scSpark.createDataFrame(v, [col, "id"])
        u = u.join(v, "id").select("*")

        u.show()
Example #2
def generateItemProfiles(R, d, seed, sparkContext, N):
    """ Generate the item profiles from rdd R and store them in an RDD containing tuples of the form
            (j,vj)
        where v is a random np.array of dimension d.

        The random vectors vj are generated using normalVectorRDD(), a function in RandomRDDs.

        Inputs are:
             - R: an RDD that contains the ratings in (user, item, rating) form
             - d: the dimension of the user profiles
             - seed: a seed to be used in generating the random vectors
             - sparkContext: a spark context
             - N: the number of partitions to be used during joins, etc.

        The return value is an RDD containing the item profiles
    """
    # extract item ids
    V = R.map(lambda inp: inp[1]).distinct(numPartitions=N)
    numItems = V.count()
    randRDD = RandomRDDs.normalVectorRDD(sparkContext,
                                         numItems,
                                         d,
                                         numPartitions=N,
                                         seed=seed)
    V = V.zipWithIndex().map(swap)
    randRDD = randRDD.zipWithIndex().map(swap)
    return V.join(randRDD, numPartitions=N).values()
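The function above (and generateUserProfiles in the next example) relies on a swap helper that is not shown. The sketch below is an assumption about its shape, inferred from how zipWithIndex() and join() are used; the toy ratings RDD in the usage comment is purely illustrative.

def swap(pair):
    # Turn the (value, index) tuples produced by zipWithIndex() into
    # (index, value) tuples so the two RDDs can be joined on the synthetic index.
    value, index = pair
    return (index, value)

# Hypothetical usage, assuming an active SparkContext named `sc`:
# ratings = sc.parallelize([(1, 10, 4.0), (2, 10, 3.5), (1, 11, 5.0)])
# itemProfiles = generateItemProfiles(ratings, d=5, seed=42, sparkContext=sc, N=4)
# itemProfiles.take(2)   # e.g. [(10, array([...])), (11, array([...]))]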
Example #3
def generateUserProfiles(R, d, seed, sparkContext, N):
    """
        Generate the user profiles from rdd R and store them in an RDD containing tuples of the form
            (i,ui)
        where u is a random np.array of dimension d.

        The random uis are generated using normalVectorRDD(), a function in RandomRDDs.

        Inputs are:
             - R: an RDD that contains the ratings in (user, item, rating) form
             - d: the dimension of the user profiles
             - seed: a seed to be used in generating the random vectors
             - sparkContext: a spark context
             - N: the number of partitions to be used during joins, etc.

        The return value is an RDD containing the user profiles
    """
    # extract user ids
    U = R.map(lambda inp: inp[0]).distinct(numPartitions=N)
    numUsers = U.count()
    randRDD = RandomRDDs.normalVectorRDD(sparkContext,
                                         numUsers,
                                         d,
                                         numPartitions=N,
                                         seed=seed)
    U = U.zipWithIndex().map(swap)
    randRDD = randRDD.zipWithIndex().map(swap)
    return U.join(randRDD, numPartitions=N).values()
Example #4
def gen_hash_coeffs_1(num_buckets, num_bigrams, seed):
    coeff = [0 for j in range(num_buckets)]
    for i in range(num_buckets):
        #nRDD = RandomRDDs.normalRDD(sc, num_bigrams, seed=seed+i).map(lambda val: str(-1) if val < 0 else str(1)).collect()
        #nRDD = RandomRDDs.uniformRDD(sc, num_bigrams, 0, seed+i).map(lambda val: str(-1) if val <= 0.5 else str(1)).collect()
        # str to float
        nRDD = RandomRDDs.uniformRDD(sc, num_bigrams, 0, seed + i).map(
            lambda val: -1.0 if val <= 0.5 else 1.0).collect()
        coeff[i] = nRDD
    return coeff
Example #5
    def test_col_norms(self):
        data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
        summary = Statistics.colStats(data)
        self.assertEqual(10, len(summary.normL1()))
        self.assertEqual(10, len(summary.normL2()))

        data2 = self.sc.parallelize(range(10)).map(lambda x: Vectors.dense(x))
        summary2 = Statistics.colStats(data2)
        self.assertEqual(array([45.0]), summary2.normL1())
        import math
        expectedNormL2 = math.sqrt(sum(map(lambda x: x*x, range(10))))
        self.assertTrue(math.fabs(summary2.normL2()[0] - expectedNormL2) < 1e-14)
Example #6
    def test_col_norms(self):
        data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
        summary = Statistics.colStats(data)
        self.assertEqual(10, len(summary.normL1()))
        self.assertEqual(10, len(summary.normL2()))

        data2 = self.sc.parallelize(range(10)).map(lambda x: Vectors.dense(x))
        summary2 = Statistics.colStats(data2)
        self.assertEqual(array([45.0]), summary2.normL1())
        import math
        expectedNormL2 = math.sqrt(sum(map(lambda x: x*x, range(10))))
        self.assertTrue(math.fabs(summary2.normL2()[0] - expectedNormL2) < 1e-14)
Example #7
def find_best_params(X, y):
    spark, sc = __start_session()

    param_size = 100
    sample_size = 500
    parallelism = sc.defaultParallelism

    train_numpy = np.concatenate((X, y[:, np.newaxis]), axis=1)
    train = sc.parallelize(train_numpy) \
              .map(lambda r: [Vectors.dense(r[0]),float(r[1])]) \
              .toDF(['features','label']) \
              .repartition(parallelism) \
              .cache()

    reg_param = RandomRDDs.uniformRDD(sc, size=param_size, seed=settings.seed) \
                          .map(lambda x: 0.001 + (0.1 - 0.001) * x) \
                          .collect()

    max_iter = RandomRDDs.uniformRDD(sc, size=param_size, seed=settings.seed) \
                          .map(lambda x: int(5 + (20 - 5) * x)) \
                          .collect()

    # create random grid
    estimator = LinearRegression(solver='normal')
    param_grid = ParamGridBuilder().addGrid(estimator.regParam, reg_param) \
                                   .addGrid(estimator.maxIter, max_iter) \
                                   .build()

    param_grid = sc.parallelize(param_grid) \
                    .takeSample(withReplacement=False, num=sample_size, seed=settings.seed)

    best_params = __run_search(estimator, param_grid, train, parallelism)

    train.unpersist()
    spark.stop()

    # print results
    print('Best Params:', best_params)

    return best_params
Example #8
 def test_col_with_different_rdds(self):
     # numpy
     data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
     summary = Statistics.colStats(data)
     self.assertEqual(1000, summary.count())
     # array
     data = self.sc.parallelize([range(10)] * 10)
     summary = Statistics.colStats(data)
     self.assertEqual(10, summary.count())
     # array
     data = self.sc.parallelize([pyarray.array("d", range(10))] * 10)
     summary = Statistics.colStats(data)
     self.assertEqual(10, summary.count())
Example #9
 def test_col_with_different_rdds(self):
     # numpy
     data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
     summary = Statistics.colStats(data)
     self.assertEqual(1000, summary.count())
     # array
     data = self.sc.parallelize([range(10)] * 10)
     summary = Statistics.colStats(data)
     self.assertEqual(10, summary.count())
     # array
     data = self.sc.parallelize([pyarray.array("d", range(10))] * 10)
     summary = Statistics.colStats(data)
     self.assertEqual(10, summary.count())
Example #10
def generator_normal_rdd(sc, n, d, s, k):
    """
    :param n: Nombre de lignes (de données)
    :param d: Dimension des données (lignes)
    :param s: Ecart type
    :param k: Clusters
    :return: Liste contenant les rdds de points associés à chaque cluster
    """
    normal_rdds = []
    print("LISTE DONNEES : \n")
    for cluster, mean in mean_cluster(k).items():
        normal_rdd = RandomRDDs.logNormalVectorRDD(
            sc=sc, mean=mean, std=s, numRows=int(n / k), numCols=d,
            seed=1).map(lambda x: (list(x), cluster))
        print(normal_rdd.collect())
        normal_rdds.append(normal_rdd)
    print()
    return normal_rdds
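generator_normal_rdd depends on a mean_cluster helper that is not part of the snippet. Below is a minimal sketch of one plausible shape for it, plus a hypothetical call; the mean range, point count and cluster count are illustrative assumptions, not values from the original code.

import random

def mean_cluster(k):
    # Hypothetical helper: map each cluster id to a (log-)mean drawn from a fixed range.
    return {cluster: random.uniform(0.0, 3.0) for cluster in range(k)}

# Hypothetical usage, assuming an active SparkContext named `sc`:
# 3 clusters of 2-D points, 300 points in total, standard deviation 0.5
# rdds = generator_normal_rdd(sc, n=300, d=2, s=0.5, k=3)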
Example #11
def main():
    if len(sys.argv) == 2:
        length = int(sys.argv[1])
    else:
        length = 10**3
    print("26")
    data0_1  = RandomRDDs.uniformVectorRDD(sc, length, 3) \
        .map(lambda a : a.round(3).tolist()) \
            .toDF()
    print("30")
    name = "random_data{}.parquet".format(333)  # random.randrange(200))
    print("using name=" + name)
    data0_1.write.parquet(name)
    print("33")

    read_df = spark.read.parquet(name)
    #    print(f"Read {read_df.count()} records. Should be {length} records.") # for python 3
    print("Read {} records. Should be {} records.".format(
        read_df.count(), length))  # for python 3
Example #12
""" Simple distributed implementation of the K-Means algorithm using Tensorflow.
"""

import tensorflow as tf
import tensorframes as tfs
from pyspark.mllib.random import RandomRDDs
import numpy as np

num_features = 4
k = 2
# TODO: does not work with 1
data = RandomRDDs.normalVectorRDD(
    sc,
    numCols=num_features,
    numRows=100,
    seed=1).map(lambda v: [v.tolist()])
df = sqlContext.createDataFrame(data).toDF("features")

# For now, analysis is still required.
df0 = tfs.analyze(df)

init_centers = np.random.randn(k, num_features)

# For debugging
block = np.array(data.take(10))[::,0,::]

# Find the distances first
with tf.Graph().as_default() as g:
    points = tf.placeholder(tf.double, shape=[None, num_features], name='points')
    num_points = tf.shape(points)[0]
    #centers = tf.placeholder(tf.double, shape=[k, num_features], name='centers')
Example #13
import json
from IPython.display import Javascript, HTML

from pyspark import SparkConf, SparkContext
from pyspark.mllib.random import RandomRDDs

# "Import" d3. This adds d3 v5 to the output window iframe.

HTML("<script src='https://d3js.org/d3.v5.min.js'></script>")

# Create the spark context

conf = SparkConf().setAppName("strata_distributions")
sc = SparkContext(conf=conf)

# Generate a 10000 random number RDD from that follows a normal
# distribution.

x = RandomRDDs.normalRDD(sc, 10000, seed=1)
x.take(10)

# The code below creates a histogram with 500 bins from the random number
# RDD. This is then converted to a JSON string using `json.dumps` and the
# output is pushed to the `hist` variable in the browser using
# `Javascript`. It is then accessible as input data for d3.

Javascript("window.hist = {}".format(json.dumps(x.histogram(500)[1])))

# Create an svg element in the output window to use for the first
# implementation

HTML("""
<div id='div_svg'><svg id='svg_hist' width='500' height='200'></svg></div>
""")
Example #14
""" Simple distributed implementation of the K-Means algorithm using Tensorflow.
"""

import tensorflow as tf
import tensorframes as tfs
from pyspark.mllib.random import RandomRDDs
import numpy as np

num_features = 4
k = 2
# TODO: does not work with 1
data = RandomRDDs.normalVectorRDD(sc,
                                  numCols=num_features,
                                  numRows=100,
                                  seed=1).map(lambda v: [v.tolist()])
df = sqlContext.createDataFrame(data).toDF("features")

# For now, analysis is still required.
df0 = tfs.analyze(df)

init_centers = np.random.randn(k, num_features)

# For debugging
block = np.array(data.take(10))[::, 0, ::]

# Find the distances first
with tf.Graph().as_default() as g:
    points = tf.placeholder(tf.double,
                            shape=[None, num_features],
                            name='points')
    num_points = tf.shape(points)[0]
Example #15
"""

Testing with Random data generation

https://spark.apache.org/docs/latest/mllib-statistics.html

"""

from pyspark.mllib.random import RandomRDDs
from pyspark import SparkContext

sc = SparkContext("local", "Rubbish")

# Generate a random double RDD that contains 1 million i.i.d. values drawn from the
# standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions.
u = RandomRDDs.normalRDD(sc, 1000000, 10)
# Apply a transform to get a random double RDD following `N(1, 4)`.
v = u.map(lambda x: 1.0 + 2.0 * x)

print(v)
Example #16
 def build_scenarios(self):
     nb_timesteps = self.timesteps.size
     return RandomRDDs.normalVectorRDD(sc,
                                       self.nb_scenarios,
                                       nb_timesteps,
                                       seed=1)
Example #17
print(goodnessOfFitTestResults)

# pearson's independence test on a matrix
mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0])
independenceTestResults = Statistics.chiSqTest(mat)
print(independenceTestResults)

# a contingency table can be constructed from an RDD of LabeledPoint/vector pairs. The resulting test returns
# Chi-squared test results for every feature against the label
obs = sc.parallelize([
    LabeledPoint(1.0, [1.0, 0.0, 3.0]),
    LabeledPoint(1.0, [1.0, 2.0, 0.0]),
    LabeledPoint(1.0, [-1.0, 0.0, -0.5])
])
featureTestResults = Statistics.chiSqTest(obs)

for i, result in enumerate(featureTestResults):
    print('column {0}: \n {1}'.format(i, result))

## random data generation
from pyspark.mllib.random import RandomRDDs

# generate a random RDD that contains a million iid values drawn from a normal distribution N(0, 1)
# distribute evenly to 10 partitions
u = RandomRDDs.normalRDD(sc, size=1000000, numPartitions=10)
print(u.take(20))

# apply a transformation to return a random RDD that follows a normal distribution N(1, 4)
v = u.map(lambda x: 1.0 + 2.0 * x)
print(v.take(20))
Example #18
import sys
from pyspark import SparkContext
from pyspark.mllib.random import RandomRDDs
from math import hypot


def dist(p):
    return hypot(p[0] - 0.5, p[1] - 0.5)


sc = SparkContext("local", "Monte Carlo Integration Pi Approximation")

num_samples = int(sys.argv[1])

a = RandomRDDs.uniformVectorRDD(sc, num_samples, 2)
num = a.map(dist).filter(lambda d: d < 0.5).count()

print(4 * num / num_samples)
Example #19
from pyspark.mllib.random import RandomRDDs
from pyspark import SparkContext
from pyspark.sql import Row, SparkSession
import pyspark.sql.functions as f
from pyspark.sql.functions import coalesce

sc = SparkContext()
spark = SparkSession(sc)

Rdd = RandomRDDs.uniformRDD(sc, 100044, 2)
Rdd2 = RandomRDDs.uniformRDD(sc, 100044, 2)
Rdd_cons = Rdd.map(lambda x: 102.83547008547009 + 102.85047727 * x)
Rdd_cons = Rdd_cons.sortBy(lambda x: x)
Rdd_pop = Rdd2.map(lambda x: 3401 + 150000 * x)
Rdd_pop = Rdd_pop.sortBy(lambda x: x)
Rdd_pop = Rdd_pop.map(lambda x: int(x + 6071639))
mois = []
for i in range(100044):
    mois.append(i + 1)
Rdd_mois = sc.parallelize(mois, 2)
colone1 = Row("consomation")
colone2 = Row("population")
colone3 = Row("mois")
df_cons = Rdd_cons.map(colone1).toDF()
df_pop = Rdd_pop.map(colone2).toDF()
df_mois = Rdd_mois.map(colone3).toDF()
df_mois = df_mois.withColumn('ligne_id', f.monotonically_increasing_id())
df_pop = df_pop.withColumn('ligne_id', f.monotonically_increasing_id())
df_cons = df_cons.withColumn('ligne_id', f.monotonically_increasing_id())
df = df_mois.join(df_pop, on=["ligne_id"]).sort("ligne_id")
df = df.join(df_cons, on=["ligne_id"]).sort("ligne_id")
Example #20
def generate_random_uniform_df(nrows, ncols):
    df = RandomRDDs.uniformVectorRDD(spark.sparkContext, nrows,
                                     ncols).map(lambda a: a.tolist()).toDF()
    return df
Example #21
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.random import RandomRDDs
from time import time

print("########################################")
print("STARTING")
print("########################################")

sc = SparkContext(appName="speedtest-nrb")
sql_context = SQLContext(sc)
start = time()

t = time()
print("creating dataframes df and df2")
df = RandomRDDs.uniformVectorRDD(sc, 100000000,
                                 2).map(lambda a: a.tolist()).toDF()
df2 = RandomRDDs.uniformVectorRDD(sc, 100000000,
                                  2).map(lambda a: a.tolist()).toDF()
print(time() - t)

t = time()
print("counting df")
df.count()
print(time() - t)

t = time()
print("counting df")
df.count()
print(time() - t)

t = time()
Example #22
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from pyspark.mllib.random import RandomRDDs

from pyspark import SparkContext
# $example on$
from pyspark.mllib.linalg import Matrices, Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="RandomDataGenerationExample")

    # Generate a random double RDD that contains 1 million i.i.d. values drawn from the
    # standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions.
    u = RandomRDDs.normalRDD(sc, 1000000, 10)
    # Apply a transform to get a random double RDD following `N(1, 4)`.
    v = u.map(lambda x: 1.0 + 2.0 * x)
    print(v)
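As a quick sanity check of the transform described in the comments, the snippet below (not part of the original example) verifies that v has a mean close to 1 and a standard deviation close to 2, i.e. N(1, 4).

# Sanity check: sample moments of the transformed RDD
print(v.mean())    # expected to be close to 1.0
print(v.stdev())   # expected to be close to 2.0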
Example #23
#!/usr/bin/env python
# coding: utf-8

# In[65]:

import numpy as np
from pyspark.mllib.random import RandomRDDs
from pyspark.mllib.clustering import KMeans, KMeansModel

# In[66]:

c1_v = RandomRDDs.normalVectorRDD(sc, 20, 2, numPartitions=2,
                                  seed=1).map(lambda v: np.add([1, 5], v))

# In[67]:

c1_v.stats()

# In[68]:

c2_v = RandomRDDs.normalVectorRDD(sc, 20, 2, numPartitions=2,
                                  seed=1).map(lambda v: np.add([5, 1], v))

# In[69]:

c2_v.stats()

# In[70]:

c3_v = RandomRDDs.normalVectorRDD(sc, 20, 2, numPartitions=2,
                                  seed=1).map(lambda v: np.add([4, 6], v))

pts = int(30)  # number of pts to be generated
k = int(3)  # number of clusters
dim = int(4)  # dim of the data

for i in range(1, 100):
    dev = int(i)
    file_name = "out_dev_" + str(i) + '.csv'
    rdd = sc.parallelize(range(0, k))
    # NOTE: `random`, `val_min`, `val_max` and `ETAPES` are assumed to be defined elsewhere
    clust_mean = rdd.map(lambda cluster: (
        cluster,
        random.sample(list(np.arange(val_min, val_max, ETAPES)), dim)))
    valeurs_vector_alea = RandomRDDs.normalVectorRDD(sc,
                                                     numRows=pts,
                                                     numCols=dim,
                                                     numPartitions=k,
                                                     seed=1)
    # assigning a random cluster to each point
    cluster_valeur_normales_vector = valeurs_vector_alea.map(
        lambda point: (random.randint(0, k - 1), point.tolist()))
    # generate a value depending on the mean of the cluster, the standard deviation and the normal value
    pts_valeur_vector = cluster_valeur_normales_vector.join(clust_mean).map(
        lambda x: point_valeurs(x[1][1], x[1][0], dev, x[0], dim))
    # view the result
    print(pts_valeur_vector.collect())
    # write the point values to a single csv file
    # write_into_csv(file_name, pts_valeur_vector);
    # saving rdd using saveAsTextFile
    pts_valeur_vector.saveAsTextFile(file_name)
Example #25
#A script to execute kmeans clustering in spark
#to run enter: >>> exec(open("./dokmeans.py").read())

import numpy as np
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import KMeans


#generate random data RDD we need this package
from pyspark.mllib.random import RandomRDDs

#let's generate random class data, add in a cluster center to random 2D points

#use default num of partitions, or use a definite number to make it so that the union
#  will have samples across clusters
c1_v = RandomRDDs.normalVectorRDD(sc, 20, 2, numPartitions=2, seed=1).map(lambda v: np.add([1, 5], v))
c2_v = RandomRDDs.normalVectorRDD(sc, 16, 2, numPartitions=2, seed=2).map(lambda v: np.add([5, 1], v))
c3_v = RandomRDDs.normalVectorRDD(sc, 12, 2, numPartitions=2, seed=3).map(lambda v: np.add([4, 6], v))

#concatenate 2 RDDs with  .union(other) function
c12 = c1_v.union(c2_v)
my_data = c12.union(c3_v)   #this now has all points, as RDD


my_kmmodel = KMeans.train(my_data, k=1,
               maxIterations=20, runs=1,
               initializationMode='k-means||', seed=10)

#try: help(KMeans.train)  to see parameter options
#k is the number of desired clusters.
#maxIterations is the maximum number of iterations to run.
Example #26
from pyspark.sql import SparkSession
from pyspark.mllib.random import RandomRDDs
from pyspark.sql.types import *
#from pyspark.sql.functions import *
from pyspark.sql.types import Row
spark = SparkSession.builder.config("spark.sql.crossJoin.enabled",
                                    "true").getOrCreate()
sc = spark.sparkContext

n = 500

# create rdd of random floats
nRow = n
nCol = 4
seed = 5
numPartitions = 32

# RandomRDDs expects a SparkContext, not a SparkSession
rdd1 = RandomRDDs.normalVectorRDD(sc, nRow, nCol, numPartitions, seed)
seed = 3
rdd2 = RandomRDDs.normalVectorRDD(sc, nRow, nCol, numPartitions, seed)

# convert each tuple in the rdd to a row
randomNumberRdd1 = rdd1.map(
    lambda x: Row(A=float(x[0]), B=float(x[1]), C=float(x[2]), D=float(x[3])))
randomNumberRdd2 = rdd2.map(
    lambda x: Row(E=float(x[0]), F=float(x[1]), G=float(x[2]), H=float(x[3])))

# create dataframe from rdd
schemaRandomNumberDF1 = spark.createDataFrame(randomNumberRdd1)
schemaRandomNumberDF2 = spark.createDataFrame(randomNumberRdd2)

# cache the dataframe
Example #27
 def test_to_java_object_rdd(self):  # SPARK-6660
     data = RandomRDDs.uniformRDD(self.sc, 10, 5, seed=0)
     self.assertEqual(_to_java_object_rdd(data).count(), 10)
Example #28
 def test_col_norms(self):
     data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
     summary = Statistics.colStats(data)
     self.assertEqual(10, len(summary.normL1()))
     self.assertEqual(10, len(summary.normL2()))
Example #29
def generate_csv_hdfs(spark, row, col, path, num_partition=3):
    sc = spark.sparkContext
    rdd = RandomRDDs.uniformVectorRDD(sc, row, col, num_partition)
    lines = rdd.map(toCSVLine)
    lines.saveAsTextFile(path)
Example #30
 def test_to_java_object_rdd(self):  # SPARK-6660
     data = RandomRDDs.uniformRDD(self.sc, 10, 5, seed=0)
     self.assertEqual(_to_java_object_rdd(data).count(), 10)
Example #31
def generate_spark_matrix(nrows: int, ncols: int, spark):
    df = RandomRDDs.uniformVectorRDD(spark.sparkContext, nrows, ncols).map(lambda a: a.tolist())\
        .toDF()\
        .repartition(int(nrows/partition_factor))\
        .persist()
    return df
Example #32
import sys

from pyspark import SparkContext
from pyspark.mllib.random import RandomRDDs


if __name__ == "__main__":
    if len(sys.argv) not in [1, 2]:
        print("Usage: random_rdd_generation", file=sys.stderr)
        sys.exit(-1)

    sc = SparkContext(appName="PythonRandomRDDGeneration")

    numExamples = 10000  # number of examples to generate
    fraction = 0.1  # fraction of data to sample

    # Example: RandomRDDs.normalRDD
    normalRDD = RandomRDDs.normalRDD(sc, numExamples)
    print('Generated RDD of %d examples sampled from the standard normal distribution'
          % normalRDD.count())
    print('  First 5 samples:')
    for sample in normalRDD.take(5):
        print('    ' + str(sample))
    print()

    # Example: RandomRDDs.normalVectorRDD
    normalVectorRDD = RandomRDDs.normalVectorRDD(sc, numRows=numExamples, numCols=2)
    print('Generated RDD of %d examples of length-2 vectors.' % normalVectorRDD.count())
    print('  First 5 samples:')
    for sample in normalVectorRDD.take(5):
        print('    ' + str(sample))
    print()
Example #33
# ## Generate Spark DataFrame Data
# We'll generate sample data for a multivariate linear regression with known coefficients and randomly generated error (the features $x_1, x_2, x_3$ are powers of a single uniform draw). Specifically:
# $$ y = \beta_0 + \sum_{i=1}^{3} \beta_i x_i + \epsilon $$
# $$ \beta_0: 4 $$
# $$ \beta_1: 6 $$
# $$ \beta_2: 2 $$
# $$ \beta_3: -1 $$

from pyspark.mllib.random import RandomRDDs
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
import pandas as pd
import numpy as np

# RandomRDDs expects a SparkContext; `spark` here is assumed to be a SparkSession
x1 = RandomRDDs.uniformRDD(spark.sparkContext, 10000).map(lambda x: 6.0 * x - 2)
epsilon = RandomRDDs.normalRDD(spark.sparkContext, 10000).map(lambda x: 0.04 * x)


def gen_poly(x):
    x0 = 1.0
    x1 = float(x[0])
    x2 = float(np.power(x1, 2))
    x3 = float(np.power(x1, 3))
    X = Vectors.dense(x0, x1, x2, x3)
    epsilon = float(x[1])
    y = 4.0 + 6.0 * x1 + 2.0 * x2 - 1 * x3 + epsilon
    return (y, X)


gen_dat = x1.zip(epsilon).map(gen_poly)
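Since the coefficients are known, one way to check the generated data is to fit a linear model and compare the estimates. The sketch below is not part of the original notebook; it assumes an active SparkSession named spark and uses pyspark.ml.regression.LinearRegression on the gen_dat RDD built above.

from pyspark.ml.linalg import Vectors as MLVectors
from pyspark.ml.regression import LinearRegression

# Drop the explicit intercept column (x0 = 1) and let the model fit the intercept.
train_df = gen_dat.map(
    lambda p: (float(p[0]), MLVectors.dense(list(p[1].toArray())[1:]))
).toDF(["label", "features"])

model = LinearRegression(labelCol="label", featuresCol="features").fit(train_df)
print(model.intercept)      # expected to be close to 4.0
print(model.coefficients)   # expected to be close to [6.0, 2.0, -1.0]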
Example #34
def construct_hyperplanes(num_hp_arrangements, num_hp_per_arrangement,
                          ambient_dimension):
    num_hps = num_hp_arrangements * num_hp_per_arrangement
    all_hp_rdd = RandomRDDs.normalVectorRDD(sc, num_hps, ambient_dimension)
    return np.matrix(all_hp_rdd.collect())
Example #35
def gen_poly(x):
    x1 = float(x[0])
    x2 = float(np.power(x1,2))
    x3 = float(np.power(x1,3))
    X  = Vectors.dense(x1,x2,x3)  
    epsilon = float(x[1])
    y  = 4.0 + 6.0*x1 + 2.0*x2 - 1*x3 + epsilon
    return(y,X)

def gen_df_ml(sc=None,
              B=(4,6,2,-1),
              n=10000,
              rng=(-2,4),
              err=(0,4)):
  sc = get_sc() if sc is None else sc
  # RandomRDDs expects a SparkContext; use the resolved `sc` and the requested size `n`
  x1      = RandomRDDs.uniformRDD(sc, n).map(lambda x: float(np.diff(rng)[0] * x + np.min(rng)))
  epsilon = RandomRDDs.normalRDD(sc, n).map(lambda x: err[0] + err[1] * x)
  dat_df = x1.zip(epsilon).map(gen_poly)
  return dat_df



gen_df_sparklyr <- function(sc=get_sc(),
                      B=c(4,6,2,-1),
                      n=10000,
                      rng=c(-2,4),
                      err=c(0,4)) {
  dat<-gen_dat_r(B,n,rng,err)
  return(copy_to(sc, gen_dat_r(),"df",TRUE))
}
Example #36
count_cluster = int(sys.argv[3]) # number of clusters
dimension = int(sys.argv[4]) # dimension of the data
std = int(sys.argv[5]) # standard deviation
noise_points = points * 2 # number of noise points to be generated / double the number of points
file_name_noise = sys.argv[1] + '-noise.csv' # file name for noise points to be generated

sc = SparkContext("local", "generator") # spark context

# array of the clusters : clusters = [0, 1, 2]
clusters = sc.parallelize(range(0, count_cluster))

# random means of each cluster : means_cluster = [ (0, [0.6, 80.9]), (1, [57.8, 20.2]), (2, [15.6, 49.9]) ]
means_cluster = clusters.map(lambda cluster: (cluster, random.sample(list(numpy.arange(MIN_MEAN_VALUE, MAX_MEAN_VALUE, STEPS)), dimension)))

# creating random vectors using normalVectorRDD
random_values_vector = RandomRDDs.normalVectorRDD(sc, numRows=points, numCols=dimension, numPartitions=count_cluster, seed=1)

# assigning a random cluster to each point
cluster_normal_values_vector = random_values_vector.map(lambda point: (random.randint(0, count_cluster - 1), point.tolist()))

# generate a value depending on the mean of the cluster, the standard deviation and the normal value
points_value_vector = cluster_normal_values_vector.join(means_cluster).map(lambda x: point_values(x[1][1], x[1][0], std, x[0], dimension))

print(points_value_vector.collect())

# generate random points that represent noise points
noise_points_vector = sc.parallelize(range(0, noise_points)).map(lambda x: random.sample(list(numpy.arange(MIN_MEAN_VALUE, MAX_MEAN_VALUE, STEPS)), dimension)).map(lambda v: noise_values(v))
        
# noise_points_vector = noise_points_vector.map(lambda row : str(row).replace("[", "").replace("]",""))
print(noise_points_vector.collect())
Example #37
 def test_col_norms(self):
     data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
     summary = Statistics.colStats(data)
     self.assertEqual(10, len(summary.normL1()))
     self.assertEqual(10, len(summary.normL2()))