def housing_data_generator(start=0, n_points=1000000, n_jobs=4):
    # Earlier joblib-based approach, kept for reference:
    # step = int(math.ceil(n_points / n_jobs))
    # Parallel(n_jobs=n_jobs)(delayed(house_data)(i, i + step) for i in range(start, n_points, step))
    from pyspark.mllib.random import RandomRDDs
    from pyspark.sql import SparkSession
    import math

    scSpark = SparkSession \
        .builder \
        .appName("reading csv") \
        .getOrCreate()
    sc = scSpark.sparkContext  # RandomRDDs expects a SparkContext, not a SparkSession

    columns = ["id", "size", "loc", "rooms", "bathrooms", "year", "price"]
    # house_size is assumed to be a (min, max) range defined by the caller.
    u = RandomRDDs.uniformRDD(sc, n_points, 2).map(
        lambda x: math.ceil(house_size[0] + (house_size[-1] - house_size[0]) * x)
    ).zipWithIndex()
    u = scSpark.createDataFrame(u, ["size", "id"])

    for col in columns[2:]:
        v = RandomRDDs.uniformRDD(sc, n_points, 2).map(
            lambda x: math.ceil(house_size[0] + (house_size[-1] - house_size[0]) * x)
        ).zipWithIndex()
        v = scSpark.createDataFrame(v, [col, "id"])
        u = u.join(v, "id").select("*")
    u.show()
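# A minimal usage sketch (not in the original source). The generator reads
# `house_size` as a global (min, max) range, so it must be defined first; the
# values below are illustrative assumptions only.
house_size = (50, 500)

# Small sample so the final show() stays quick on a local session.
housing_data_generator(n_points=1000)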
def gen_hash_coeffs_1(num_buckets, num_bigrams, seed):
    # Expects RandomRDDs (pyspark.mllib.random) and a SparkContext `sc` in scope.
    coeff = [0 for j in range(num_buckets)]
    for i in range(num_buckets):
        # Earlier variants, kept for reference:
        # nRDD = RandomRDDs.normalRDD(sc, num_bigrams, seed=seed + i).map(lambda val: str(-1) if val < 0 else str(1)).collect()
        # nRDD = RandomRDDs.uniformRDD(sc, num_bigrams, 0, seed + i).map(lambda val: str(-1) if val <= 0.5 else str(1)).collect()
        # Use floats instead of strings for the +/-1 coefficients.
        nRDD = RandomRDDs.uniformRDD(sc, num_bigrams, 0, seed + i).map(
            lambda val: float(-1) if val <= 0.5 else float(1)).collect()
        coeff[i] = nRDD
    return coeff
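# A minimal usage sketch (not in the original source): the helper reads `sc`
# and `RandomRDDs` as globals, so both are set up here. The bucket and bigram
# counts are arbitrary example values.
from pyspark import SparkContext
from pyspark.mllib.random import RandomRDDs

sc = SparkContext.getOrCreate()

# Each of the 4 buckets gets a list of 16 random +/-1.0 coefficients.
coeffs = gen_hash_coeffs_1(num_buckets=4, num_bigrams=16, seed=42)
print(len(coeffs), len(coeffs[0]))  # expect: 4 16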
def find_best_params(X, y):
    spark, sc = __start_session()
    param_size = 100
    sample_size = 500
    parallelism = sc.defaultParallelism

    # Stack features and label into one array, then build a features/label DataFrame.
    train_numpy = np.concatenate((X, y[:, np.newaxis]), axis=1)
    train = sc.parallelize(train_numpy) \
        .map(lambda r: [Vectors.dense(r[:-1]), float(r[-1])]) \
        .toDF(['features', 'label']) \
        .repartition(parallelism) \
        .cache()

    reg_param = RandomRDDs.uniformRDD(sc, size=param_size, seed=settings.seed) \
        .map(lambda x: 0.001 + (0.1 - 0.001) * x) \
        .collect()
    max_iter = RandomRDDs.uniformRDD(sc, size=param_size, seed=settings.seed) \
        .map(lambda x: int(5 + (20 - 5) * x)) \
        .collect()

    # create random grid
    estimator = LinearRegression(solver='normal')
    param_grid = ParamGridBuilder().addGrid(estimator.regParam, reg_param) \
        .addGrid(estimator.maxIter, max_iter) \
        .build()
    param_grid = sc.parallelize(param_grid) \
        .takeSample(withReplacement=False, num=sample_size, seed=settings.seed)

    best_params = __run_search(estimator, param_grid, train, parallelism)

    train.unpersist()
    spark.stop()

    # print results
    print('Best Params:', best_params)
    return best_params
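# A minimal usage sketch (not in the original source): it assumes the module's
# private helpers __start_session / __run_search and settings.seed exist as in
# the snippet above, and just feeds synthetic single-feature NumPy data.
import numpy as np

rng = np.random.RandomState(0)
X = rng.uniform(0.0, 10.0, size=(200, 1))            # one feature column
y = 3.0 * X[:, 0] + rng.normal(0.0, 1.0, size=200)   # noisy linear target

best = find_best_params(X, y)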
def test_to_java_object_rdd(self):
    # SPARK-6660
    data = RandomRDDs.uniformRDD(self.sc, 10, 5, seed=0)
    self.assertEqual(_to_java_object_rdd(data).count(), 10)
def gen_poly(x):
    x1 = float(x[0])
    x2 = float(np.power(x1, 2))
    x3 = float(np.power(x1, 3))
    X = Vectors.dense(x1, x2, x3)
    epsilon = float(x[1])
    y = 4.0 + 6.0 * x1 + 2.0 * x2 - 1 * x3 + epsilon
    return (y, X)


def gen_df_ml(sc=None, B=(4, 6, 2, -1), n=10000, rng=(-2, 4), err=(0, 4)):
    # Coefficients B are currently hard-coded inside gen_poly.
    sc = get_sc() if sc is None else sc
    # RandomRDDs expects the SparkContext obtained above; n controls the sample size.
    x1 = RandomRDDs.uniformRDD(sc, n).map(lambda x: float(np.diff(rng)[0]) * x + np.min(rng))
    epsilon = RandomRDDs.normalRDD(sc, n).map(lambda x: err[0] + err[1] * x)
    dat_df = x1.zip(epsilon).map(gen_poly)
    return dat_df


# sparklyr (R) counterpart:
gen_df_sparklyr <- function(sc = get_sc(), B = c(4, 6, 2, -1), n = 10000,
                            rng = c(-2, 4), err = c(0, 4)) {
  dat <- gen_dat_r(B, n, rng, err)
  return(copy_to(sc, dat, "df", overwrite = TRUE))
}
# ## Generate Spark DataFrame Data
# We'll generate sample data for a multivariate linear regression with known
# coefficients and randomly generated error. Specifically:
# $$ y = \beta_0 + \sum_{i=1}^{3} \beta_i x_i + \epsilon $$
# $$ \beta_0 = 4, \quad \beta_1 = 6, \quad \beta_2 = 2, \quad \beta_3 = -1 $$
# where $x_2 = x_1^2$ and $x_3 = x_1^3$, so the model is a cubic polynomial in a single input.

from pyspark.mllib.random import RandomRDDs
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
import pandas as pd
import numpy as np

# RandomRDDs needs the SparkContext backing the active SparkSession.
sc = spark.sparkContext

x1 = RandomRDDs.uniformRDD(sc, 10000).map(lambda x: 6.0 * x - 2)
epsilon = RandomRDDs.normalRDD(sc, 10000).map(lambda x: 0.04 * x)


def gen_poly(x):
    x0 = 1.0
    x1 = float(x[0])
    x2 = float(np.power(x1, 2))
    x3 = float(np.power(x1, 3))
    X = Vectors.dense(x0, x1, x2, x3)
    epsilon = float(x[1])
    y = 4.0 + 6.0 * x1 + 2.0 * x2 - 1 * x3 + epsilon
    return (y, X)


gen_dat = x1.zip(epsilon).map(gen_poly)
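# A follow-up sketch (not in the original notebook): the (y, X) tuples can be
# wrapped as LabeledPoints, which the cell already imports, and fit with
# MLlib's LinearRegressionWithSGD (deprecated in Spark 2.x). The step size and
# iteration count below are arbitrary assumptions.
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD

labeled = gen_dat.map(lambda p: LabeledPoint(p[0], p[1]))

# Intercept stays disabled because x0 = 1.0 is already part of the feature vector.
model = LinearRegressionWithSGD.train(labeled, iterations=100, step=0.01)
print(model.weights)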
from pyspark.mllib.random import RandomRDDs
from pyspark import SparkContext
from pyspark.sql import Row, SparkSession
import pyspark.sql.functions as f
from pyspark.sql.functions import coalesce

sc = SparkContext()
spark = SparkSession(sc)

Rdd = RandomRDDs.uniformRDD(sc, 100044, 2)
Rdd2 = RandomRDDs.uniformRDD(sc, 100044, 2)

Rdd_cons = Rdd.map(lambda x: 102.83547008547009 + 102.85047727 * x)
Rdd_cons = Rdd_cons.sortBy(lambda x: x)

Rdd_pop = Rdd2.map(lambda x: 3401 + 150000 * x)
Rdd_pop = Rdd_pop.sortBy(lambda x: x)
Rdd_pop = Rdd_pop.map(lambda x: int(x + 6071639))

mois = []
for i in range(100044):
    mois.append(i + 1)
Rdd_mois = sc.parallelize(mois, 2)

colone1 = Row("consomation")
colone2 = Row("population")
colone3 = Row("mois")

df_cons = Rdd_cons.map(colone1).toDF()
df_pop = Rdd_pop.map(colone2).toDF()
df_mois = Rdd_mois.map(colone3).toDF()

df_mois = df_mois.withColumn('ligne_id', f.monotonically_increasing_id())
df_pop = df_pop.withColumn('ligne_id', f.monotonically_increasing_id())
df_cons = df_cons.withColumn('ligne_id', f.monotonically_increasing_id())

df = df_mois.join(df_pop, on=["ligne_id"]).sort("ligne_id")
df = df.join(df_cons, on=["ligne_id"]).sort("ligne_id")
""" Testing with Random data generation https://spark.apache.org/docs/latest/mllib-statistics.html """ from pyspark.mllib.random import RandomRDDs from pyspark import SparkContext sc = SparkContext("local", "Rubbish") # Generate a random double RDD that contains 1 million i.i.d. values drawn from the # standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. u = RandomRDDs.uniformRDD(sc, 1000000L, 10) # Apply a transform to get a random double RDD following `N(1, 4)`. v = u.map(lambda x: 1.0 + 2.0 * x) print v
import random
import numpy as np


def genTS(k, startk, tslen, s):
    random.seed(s)
    # Integer division so the sequence lengths stay valid under Python 3.
    freecoefs = [random.random() for i in range(tslen // 2 - 1 - k)]
    w = [1 for i in range(k)]
    bw = (tslen * tslen / 2 - sum(freecoefs)) / sum(w)
    coefs = np.sqrt(np.concatenate((freecoefs[:startk], bw * np.array(w), freecoefs[startk:])))
    coefs_all = np.concatenate(([0.], coefs, [(random.random() - 0.5) * 2], coefs[::-1]))
    angles = np.array([(random.random() - 0.5) * 2 * np.pi for i in range(tslen // 2 - 1)])
    angles_all = np.concatenate(([0.], angles, [0.], -angles[::-1]))
    tsft = [r * np.exp(1j * fi) for r, fi in zip(coefs_all, angles_all)]
    ts = np.fft.ifft(tsft).real + np.array([random.gauss(0, 1) / 4 for i in range(tslen)])
    return ts


from pyspark import SparkConf, SparkContext
from pyspark.mllib.random import RandomRDDs

conf = SparkConf()
sc = SparkContext(conf=conf)

# tsnum, k, startk, tslen and outfile are expected to be defined by the driver script.
RandomRDDs.uniformRDD(sc, tsnum).zipWithIndex() \
    .map(lambda ri: (ri[1], genTS(k, startk, tslen, int(ri[0] * 1000) + (ri[1] + 1) * 1000))) \
    .map(lambda its: str(its[0]) + ',' + ','.join(["%.5f" % x for x in its[1]])) \
    .saveAsTextFile(outfile)