Example No. 1
import sys

from pyspark import SparkContext
from pyspark.mllib.random import RandomRDDs


if __name__ == "__main__":
    if len(sys.argv) not in [1, 2]:
        print("Usage: random_rdd_generation", file=sys.stderr)
        sys.exit(-1)

    sc = SparkContext(appName="PythonRandomRDDGeneration")

    numExamples = 10000  # number of examples to generate
    fraction = 0.1  # fraction of data to sample

    # Example: RandomRDDs.normalRDD
    normalRDD = RandomRDDs.normalRDD(sc, numExamples)
    print('Generated RDD of %d examples sampled from the standard normal distribution'
          % normalRDD.count())
    print('  First 5 samples:')
    for sample in normalRDD.take(5):
        print('    ' + str(sample))
    print()

    # Example: RandomRDDs.normalVectorRDD
    normalVectorRDD = RandomRDDs.normalVectorRDD(sc, numRows=numExamples, numCols=2)
    print('Generated RDD of %d examples of length-2 vectors.' % normalVectorRDD.count())
    print('  First 5 samples:')
    for sample in normalVectorRDD.take(5):
        print('    ' + str(sample))
    print()
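    # `fraction` is declared above but never used in this fragment; a sketch
    # (not in the original) of the sampling step it presumably anticipated:
    sampled = normalRDD.sample(withReplacement=False, fraction=fraction)
    print('Sampled %d of %d examples' % (sampled.count(), numExamples))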
Example No. 2
import json

from IPython.display import Javascript, HTML
from pyspark import SparkConf, SparkContext
from pyspark.mllib.random import RandomRDDs

# "Import" d3. This adds d3 v5 to the output window iframe.

HTML("<script src='https://d3js.org/d3.v5.min.js'></script>")

# Create the spark context

conf = SparkConf().setAppName("strata_distributions")
sc = SparkContext(conf=conf)

# Generate an RDD of 10000 random numbers drawn from the standard normal
# distribution.

x = RandomRDDs.normalRDD(sc, 10000, seed=1)
x.take(10)

# The code below creates a histogram with 500 bins from the random number
# RDD. The bin counts are then converted to a JSON string using `json.dumps`
# and pushed to the `hist` variable in the browser via `Javascript`, where
# they are accessible as input data for d3.

Javascript("window.hist = {}".format(json.dumps(x.histogram(500)[1])))

# Create an svg element in the output window to use for the first
# implementation.

HTML("""
<div id='div_svg'><svg id='svg_hist' width='500' height='200'></svg></div>
""")
Example No. 3
from pyspark.mllib.linalg import Matrices, Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics

# pearson's goodness-of-fit test on an observed frequency vector (the vector
# below is a placeholder; the fragment started after this point)
goodnessOfFitTestResults = Statistics.chiSqTest(Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25))
print(goodnessOfFitTestResults)

# pearson's independence test on a matrix
mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0])
independenceTestResults = Statistics.chiSqTest(mat)
print(independenceTestResults)

# a contingency table can be constructed from an RDD of LabeledPoints (label/feature-vector
# pairs). The resulting test returns Chi-squared test results for every feature against the label
obs = sc.parallelize([
    LabeledPoint(1.0, [1.0, 0.0, 3.0]),
    LabeledPoint(1.0, [1.0, 2.0, 0.0]),
    LabeledPoint(1.0, [-1.0, 0.0, -0.5])
])
featureTestResults = Statistics.chiSqTest(obs)

for i, result in enumerate(featureTestResults):
    print('column {0}: \n {1}'.format(i, result))

## random data generation
from pyspark.mllib.random import RandomRDDs

# generate a random RDD that contains a million i.i.d. values drawn from the standard
# normal distribution N(0, 1), spread evenly across 10 partitions
u = RandomRDDs.normalRDD(sc, size=1000000, numPartitions=10)
print(u.take(20))

# apply a transformation to get a random RDD that follows the normal distribution N(1, 4)
v = u.map(lambda x: 1.0 + 2.0 * x)
print(v.take(20))
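
# sanity check (an addition, not in the original): for N(1, 4) the sample mean
# should be close to 1 and the sample standard deviation close to 2
print(v.mean())
print(v.stdev())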
Example No. 4
import numpy as np

from pyspark.mllib.linalg import Vectors
from pyspark.mllib.random import RandomRDDs


def gen_poly(x):
    x1 = float(x[0])
    x2 = float(np.power(x1, 2))
    x3 = float(np.power(x1, 3))
    X = Vectors.dense(x1, x2, x3)
    epsilon = float(x[1])
    y = 4.0 + 6.0 * x1 + 2.0 * x2 - 1.0 * x3 + epsilon
    return (y, X)

def gen_df_ml(sc=None,
              B=(4, 6, 2, -1),
              n=10000,
              rng=(-2, 4),
              err=(0, 4)):
    # note: the coefficients B are currently hard-coded inside gen_poly
    sc = get_sc() if sc is None else sc
    # map U(0, 1) draws onto the range rng, and N(0, 1) draws onto N(err[0], err[1]**2)
    x1 = RandomRDDs.uniformRDD(sc, n).map(lambda x: float(np.diff(rng)[0]) * x + np.min(rng))
    epsilon = RandomRDDs.normalRDD(sc, n).map(lambda x: err[0] + err[1] * x)
    dat_df = x1.zip(epsilon).map(gen_poly)
    return dat_df
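
# Usage sketch (an addition): wrap the generated (y, X) pairs as LabeledPoints
# for pyspark.mllib's regression API.
from pyspark.mllib.regression import LabeledPoint

labeled = gen_df_ml().map(lambda p: LabeledPoint(p[0], p[1]))
print(labeled.take(3))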



gen_df_sparklyr <- function(sc=get_sc(),
                            B=c(4,6,2,-1),
                            n=10000,
                            rng=c(-2,4),
                            err=c(0,4)) {
  dat <- gen_dat_r(B, n, rng, err)
  return(copy_to(sc, dat, "df", overwrite = TRUE))
}

Example No. 5
# ## Generate Spark DataFrame Data
# We'll generate sample data for a multivariate linear regression with known coefficients and randomly generated error. Specifically:
# $$ y = \beta_0 + \sum_{i=1}^{3} \beta_i x_i + \epsilon $$
# $$ \beta_0 = 4, \quad \beta_1 = 6, \quad \beta_2 = 2, \quad \beta_3 = -1 $$

from pyspark.mllib.random import RandomRDDs
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
import pandas as pd
import numpy as np

# RandomRDDs expects a SparkContext, not a SparkSession
x1 = RandomRDDs.uniformRDD(spark.sparkContext, 10000).map(lambda x: 6.0 * x - 2)  # U(-2, 4)
epsilon = RandomRDDs.normalRDD(spark.sparkContext, 10000).map(lambda x: 0.04 * x)  # N(0, 0.04**2)


def gen_poly(x):
    x0 = 1.0
    x1 = float(x[0])
    x2 = float(np.power(x1, 2))
    x3 = float(np.power(x1, 3))
    X = Vectors.dense(x0, x1, x2, x3)
    epsilon = float(x[1])
    y = 4.0 + 6.0 * x1 + 2.0 * x2 - 1 * x3 + epsilon
    return (y, X)


gen_dat = x1.zip(epsilon).map(gen_poly)
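
# Inspection sketch (an addition): pandas is imported above but unused in this
# fragment, so pull a few generated rows into a DataFrame to eyeball them.
sample_df = pd.DataFrame(gen_dat.take(5), columns=['y', 'X'])
print(sample_df)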
Example No. 6
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from pyspark.mllib.random import RandomRDDs

from pyspark import SparkContext
# $example on$
from pyspark.mllib.linalg import Matrices, Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="RandomDataGenerationExample")

    # Generate a random double RDD that contains 1 million i.i.d. values drawn from the
    # standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions.
    u = RandomRDDs.normalRDD(sc, 1000000, 10)
    # Apply a transform to get a random double RDD following `N(1, 4)`.
    v = u.map(lambda x: 1.0 + 2.0 * x)
    # materialize a few values (printing the RDD itself only shows its lineage)
    print(v.take(5))

Example No. 7
# In[2]:

import pyspark
import numpy as np
import pandas as pd
from pyspark import SQLContext
from pyspark.mllib.random import RandomRDDs
from pyspark.sql.functions import udf

# In[3]:
# define sample dataset size n
n = 10000

# test the speed of the RDD random generator (note: np.round on N(0, 1) draws
# yields mostly -1.0, 0.0 and 1.0, not a strict 0/1 binary)
gender_1 = RandomRDDs.normalRDD(sc, n, seed=1).map(lambda x: np.round(x))
gender_1.count()

# In[4]:

# test the speed of numpy random generator
gender_2 = np.random.randint(0, 2, size=n)
len(gender_2)

# For this size n, numpy is faster than the Spark RDD generator. Use numpy to
# build the arrays, then convert them to a pyspark dataframe.
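
# A minimal timing sketch (an addition, standard library only) to back up the
# comparison above:

# In[5]:

import time

start = time.perf_counter()
RandomRDDs.normalRDD(sc, n, seed=1).map(lambda x: np.round(x)).count()
print('RDD generator: %.3fs' % (time.perf_counter() - start))

start = time.perf_counter()
len(np.random.randint(0, 2, size=n))
print('numpy generator: %.3fs' % (time.perf_counter() - start))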

# In[6]:

# create numpy arrays
biologic = np.random.randint(0, 2, size=n)
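
# Conversion sketch (an addition): the fragment ends before the conversion
# step; one plausible way to finish it, assuming a SparkSession named `spark`:
pdf = pd.DataFrame({'gender': gender_2, 'biologic': biologic})
sdf = spark.createDataFrame(pdf)
sdf.show(5)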
Example No. 8
# Benchmarking Setup - Configure here for the benchmark size required
# sizes = [100, 1000, 10000, 50000, 100000, 500000, 1000000]
sizes = [100, 1000]

import numpy as np

from pyspark import SparkConf, SparkContext
from pyspark.mllib.random import RandomRDDs

conf = SparkConf().setAppName("SVD-Datagen") \
    .set("spark.executor.cores", 4) \
    .set("spark.executor.instances", 2)

sc = SparkContext.getOrCreate(conf=conf)

# TODO - configure the S3 bucket here
s3_bucket = ""

for size in sizes:

    # Step 1 - Generating Data: draw size * size standard normal values and
    # reshape them into a dense square matrix (a square matrix is assumed
    # here, since the loop below sets the diagonal of A)
    input_rdd = RandomRDDs.normalRDD(sc, size * size)
    input_array = np.array(input_rdd.collect()).reshape(size, size)

    # The diagonal values of A = 1:  A[i,i] = 1
    for j in range(size):
        input_array[j][j] = 1
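
    # Step 2 sketch (an addition): a plausible next step is a truncated SVD
    # via a distributed RowMatrix; the choice of k=5 is an assumption.
    from pyspark.mllib.linalg.distributed import RowMatrix

    rows = sc.parallelize(input_array.tolist())
    mat = RowMatrix(rows)
    svd = mat.computeSVD(5, computeU=True)
    print(size, svd.s)  # singular values for this benchmark size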