from pyspark import SparkContext
from pyspark.mllib.random import RandomRDDs


def _print_first_five(rdd):
    """Print the samples heading, the first five elements of *rdd*, then a blank line."""
    print(' First 5 samples:')
    for element in rdd.take(5):
        print(' ' + str(element))
    print()


if __name__ == "__main__":
    # Only a bare invocation or a single extra argument is accepted; anything
    # else prints the usage line to stderr and exits with status -1.
    if not 1 <= len(sys.argv) <= 2:
        print("Usage: random_rdd_generation", file=sys.stderr)
        sys.exit(-1)

    sc = SparkContext(appName="PythonRandomRDDGeneration")

    numExamples = 10000  # number of examples to generate
    fraction = 0.1  # fraction of data to sample

    # Example: RandomRDDs.normalRDD
    normalRDD = RandomRDDs.normalRDD(sc, numExamples)
    normalCount = normalRDD.count()
    print('Generated RDD of %d examples sampled from the standard normal distribution'
          % normalCount)
    _print_first_five(normalRDD)

    # Example: RandomRDDs.normalVectorRDD
    normalVectorRDD = RandomRDDs.normalVectorRDD(sc, numRows=numExamples, numCols=2)
    vectorCount = normalVectorRDD.count()
    print('Generated RDD of %d examples of length-2 vectors.' % vectorCount)
    _print_first_five(normalVectorRDD)
import json
from IPython.display import Javascript, HTML

# "Import" d3. This adds d3 v5 to the output window iframe.
HTML("<script src='https://d3js.org/d3.v5.min.js'></script>")

# Create the Spark context.
conf = SparkConf().setAppName("strata_distributions")
sc = SparkContext(conf=conf)

# Generate an RDD of 10000 random numbers that follow a normal distribution
# (fixed seed, so the values are reproducible).
x = RandomRDDs.normalRDD(sc, 10000, seed=1)
x.take(10)

# Build a 500-bin histogram from the random-number RDD and keep the second
# element of the result. It is serialised to a JSON string with `json.dumps`
# and pushed to the `hist` variable in the browser via `Javascript`, where it
# is then accessible as input data for d3.
bin_counts = x.histogram(500)[1]
Javascript("window.hist = {}".format(json.dumps(bin_counts)))

# Create an svg element in the output window to use for the first
# implementation.
HTML(""" <div id='div_svg'><svg id='svg_hist' width='500' height='200'></svg></div> """)
print(goodnessOfFitTestResults)

# Pearson's independence test on a matrix: the 3x2 dense matrix below is
# passed to chiSqTest and the result is printed.
mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0])
independenceTestResults = Statistics.chiSqTest(mat)
print(independenceTestResults)

# A contingency table can be constructed from an RDD of LabeledPoint/vector
# pairs. The resulting test returns a chi-squared test result for every
# feature against the label.
labeled_rows = [
    LabeledPoint(1.0, [1.0, 0.0, 3.0]),
    LabeledPoint(1.0, [1.0, 2.0, 0.0]),
    LabeledPoint(1.0, [-1.0, 0.0, -0.5]),
]
obs = sc.parallelize(labeled_rows)
featureTestResults = Statistics.chiSqTest(obs)
for column_index, column_result in enumerate(featureTestResults):
    print('column {0}: \n {1}'.format(column_index, column_result))

## random data generation
from pyspark.mllib.random import RandomRDDs

# Generate a random RDD that contains a million i.i.d. values drawn from a
# normal distribution N(0, 1), distributed evenly over 10 partitions.
u = RandomRDDs.normalRDD(sc, size=1000000, numPartitions=10)
print(u.take(20))

# Apply a transformation (1.0 + 2.0 * value) to obtain a random RDD that
# follows a normal distribution N(1, 4).
v = u.map(lambda value: 1.0 + 2.0 * value)
print(v.take(20))
x1 = float(x[0]) x2 = float(np.power(x1,2)) x3 = float(np.power(x1,3)) X = Vectors.dense(x1,x2,x3) epsilon = float(x[1]) y = 4.0 + 6.0*x1 + 2.0*x2 - 1*x3 + epsilon return(y,X) def gen_df_ml(sc=None, B=(4,6,2,-1), n=10000, rng=(-2,4), err=(0,4)): sc=get_sc() if sc is None else sc x1 = RandomRDDs.uniformRDD(spark, 10000).map(lambda x: np.diff(rng)*x+np.min(rng)) epsilon = RandomRDDs.normalRDD(spark, 10000).map(lambda x: err[0]+err[1]*x) dat_df = x1.zip(epsilon).map(gen_poly) return(dat_df) gen_df_sparklyr <- function(sc=get_sc(), B=c(4,6,2,-1), n=10000, rng=c(-2,4), err=c(0,4)) { dat<-gen_dat_r(B,n,rng,err) return(copy_to(sc, gen_dat_r(),"df",TRUE)) }
# ## Generate Spark DataFrame Data
# We'll generate sample data for a multivariate linear regression with known coefficients and randomly generated error. Specifically:
# $$ y = \beta_0 + \sum_i (\beta_i x_i) + \epsilon \thinspace \thinspace \thinspace \thinspace \thinspace \forall i \in \{1..3\}$$
# $$ \beta_0: 4 $$
# $$ \beta_1: 6 $$
# $$ \beta_2: 2 $$
# $$ \beta_3: -1 $$
from pyspark.mllib.random import RandomRDDs
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
import pandas as pd
import numpy as np

# NOTE(review): `spark` is passed where RandomRDDs documents a SparkContext
# argument -- confirm what `spark` is bound to in this notebook.
# Each uniform sample u is mapped to 6.0 * u - 2; each normal sample is
# scaled by 0.04 to serve as the error term.
x1 = RandomRDDs.uniformRDD(spark, 10000).map(lambda u: 6.0 * u - 2)
epsilon = RandomRDDs.normalRDD(spark, 10000).map(lambda z: 0.04 * z)


def gen_poly(x):
    """Build one (response, feature-vector) pair from a (regressor, error) pair.

    ``x[0]`` is the regressor value and ``x[1]`` the error term. The feature
    vector holds a constant 1.0 followed by the regressor, its square and its
    cube; the response is 4.0 + 6.0*r + 2.0*r^2 - 1*r^3 + error.
    """
    regressor = float(x[0])
    squared = float(np.power(regressor, 2))
    cubed = float(np.power(regressor, 3))
    features = Vectors.dense(1.0, regressor, squared, cubed)
    noise = float(x[1])
    response = 4.0 + 6.0 * regressor + 2.0 * squared - 1 * cubed + noise
    return (response, features)


# Pair every regressor value with an error term and turn each pair into a
# (response, features) tuple.
gen_dat = x1.zip(epsilon).map(gen_poly)
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from pyspark.mllib.random import RandomRDDs
from pyspark import SparkContext
# $example on$
from pyspark.mllib.linalg import Matrices, Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="RandomDataGenerationExample")

    # Generate a random double RDD that contains 1 million i.i.d. values drawn
    # from the standard normal distribution `N(0, 1)`, evenly distributed in
    # 10 partitions.
    u = RandomRDDs.normalRDD(sc, size=1000000, numPartitions=10)

    # Apply a transform to get a random double RDD following `N(1, 4)`.
    v = u.map(lambda value: 1.0 + 2.0 * value)

    # NOTE(review): this prints the RDD object itself rather than any of its
    # values -- confirm whether printing samples (e.g. via `take`) was intended.
    print(v)
# In[2]:

import pyspark
import numpy as np
import pandas as pd
from pyspark import SQLContext
from pyspark.mllib.random import RandomRDDs
from pyspark.sql.functions import udf

# In[3]:

# Sample dataset size.
n = 10000

# Test the speed of the RDD random generator: draw n seeded normal samples
# and round each one with numpy.
normal_samples = RandomRDDs.normalRDD(sc, n, seed=1)
gender_1 = normal_samples.map(lambda value: np.round(value))
gender_1.count()

# In[4]:

# Test the speed of the numpy random generator: n integers drawn from {0, 1}.
gender_2 = np.random.randint(low=0, high=2, size=n)
len(gender_2)

# For the size n, numpy is faster than spark RDD. Use numpy to build arrays
# then convert to pyspark dataframe.

# In[6]:

# Create numpy arrays.
biologic = np.random.randint(low=0, high=2, size=n)
# Benchmarking setup - configure here for the benchmark size required.
#sizes=[100,1000, 10000, 50000, 100000, 500000, 1000000]
sizes = [100, 1000]

from pyspark import SparkContext, SparkConf
from pyspark.mllib.random import RandomRDDs

# Application configuration: 4 cores per executor, 2 executor instances.
conf = (
    SparkConf()
    .setAppName("SVD-Datagen")
    .set("spark.executor.cores", 4)
    .set("spark.executor.instances", 2)
)
sc = SparkContext.getOrCreate(conf=conf)

# TODO - configure the S3 bucket here
s3_bucket = ""

for size in sizes:
    # Step 1 - Generating Data
    input = RandomRDDs.normalRDD(sc, size)

    # The diagonal values of A = 1: A[i,i] = 1
    # NOTE(review): `input_array` is not assigned anywhere in the visible
    # code -- confirm it is defined before this loop runs.
    for j in range(size):
        input_array[j][j] = 1