Example No. 1
def housing_data_generator(start=0, n_points=1000000, n_jobs=4):

    # step = int(math.ceil(n_points/n_jobs))

    # Parallel(n_jobs=n_jobs)(delayed(house_data)(i, i+step) for i in range(start, n_points, step))

    import math

    from pyspark.mllib.random import RandomRDDs
    from pyspark.sql import SparkSession

    scSpark = SparkSession \
        .builder \
        .appName("reading csv") \
        .getOrCreate()
    # RandomRDDs expects a SparkContext, not a SparkSession.
    sc = scSpark.sparkContext

    columns = ["id", "size", "loc", "rooms", "bathrooms", "year", "price"]

    # `house_size` is assumed to be a (min, max) range defined at module level;
    # each uniform draw in [0, 1] is rescaled into that range.
    u = RandomRDDs.uniformRDD(sc, n_points, 2).map(lambda x: math.ceil(
        house_size[0] + (house_size[-1] - house_size[0]) * x)).zipWithIndex()
    u = scSpark.createDataFrame(u, ["size", "id"])

    # Generate the remaining columns the same way (all reuse the house_size range)
    # and join each one onto the growing DataFrame via the zipWithIndex id.
    for col in columns[2:]:
        v = RandomRDDs.uniformRDD(
            sc, n_points, 2).map(lambda x: math.ceil(house_size[0] + (
                house_size[-1] - house_size[0]) * x)).zipWithIndex()
        v = scSpark.createDataFrame(v, [col, "id"])
        u = u.join(v, "id").select("*")

    u.show()
    return u
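
# Usage sketch (added, not part of the original snippet): `house_size` is a
# hypothetical (min, max) range that the generator expects at module level.
house_size = (50, 400)

houses = housing_data_generator(n_points=1000)
houses.printSchema()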
Example No. 2
def gen_hash_coeffs_1(num_buckets, num_bigrams, seed):
    # Builds one row of +/-1 hash coefficients per bucket; `sc` (a SparkContext)
    # and RandomRDDs are expected to be available as module-level names.
    coeff = [0 for j in range(num_buckets)]
    for i in range(num_buckets):
        #nRDD= RandomRDDs.normalRDD(sc, num_bigrams, seed=seed+i).map(lambda val: str(-1) if val < 0 else str(1)).collect()
        #nRDD= RandomRDDs.uniformRDD(sc, num_bigrams,0,seed+i).map(lambda val: str(-1) if val <= 0.5 else str(1)).collect()
        # str -> float: map each uniform draw to -1.0 or +1.0
        nRDD = RandomRDDs.uniformRDD(sc, num_bigrams, 0, seed + i).map(
            lambda val: float(-1) if val <= 0.5 else float(1)).collect()
        coeff[i] = nRDD
    return coeff
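
# Usage sketch (added): define the module-level names the function relies on,
# then build a small num_buckets x num_bigrams matrix of +/-1 coefficients.
from pyspark import SparkContext
from pyspark.mllib.random import RandomRDDs

sc = SparkContext.getOrCreate()
coeffs = gen_hash_coeffs_1(num_buckets=4, num_bigrams=8, seed=42)
print(len(coeffs), len(coeffs[0]))  # 4 8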
Example No. 3
def find_best_params(X, y):
    # __start_session, __run_search and settings.seed are assumed to be defined
    # in the same module, alongside imports of np, Vectors, LinearRegression and
    # ParamGridBuilder.
    spark, sc = __start_session()

    param_size = 100
    sample_size = 500
    parallelism = sc.defaultParallelism

    # Pack features and label into one array so each RDD row carries both;
    # the last column is the label, everything before it is the feature vector.
    train_numpy = np.concatenate((X, y[:, np.newaxis]), axis=1)
    train = sc.parallelize(train_numpy) \
              .map(lambda r: [Vectors.dense(r[:-1]), float(r[-1])]) \
              .toDF(['features', 'label']) \
              .repartition(parallelism) \
              .cache()

    reg_param = RandomRDDs.uniformRDD(sc, size=param_size, seed=settings.seed) \
                          .map(lambda x: 0.001 + (0.1 - 0.001) * x) \
                          .collect()

    max_iter = RandomRDDs.uniformRDD(sc, size=param_size, seed=settings.seed) \
                          .map(lambda x: int(5 + (20 - 5) * x)) \
                          .collect()

    # create random grid
    estimator = LinearRegression(solver='normal')
    param_grid = ParamGridBuilder().addGrid(estimator.regParam, reg_param) \
                                   .addGrid(estimator.maxIter, max_iter) \
                                   .build()

    param_grid = sc.parallelize(param_grid) \
                    .takeSample(withReplacement=False, num=sample_size, seed=settings.seed)

    best_params = __run_search(estimator, param_grid, train, parallelism)

    train.unpersist()
    spark.stop()

    # print results
    print('Best Params:', best_params)

    return best_params
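
# Usage sketch (added): synthetic single-feature data, run from the module where
# find_best_params and its private helpers (__start_session, __run_search, settings)
# are defined.
import numpy as np

X = np.random.rand(200, 1)
y = 3.0 * X[:, 0] + np.random.normal(scale=0.1, size=200)
best = find_best_params(X, y)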
Example No. 4
 def test_to_java_object_rdd(self):  # SPARK-6660
     data = RandomRDDs.uniformRDD(self.sc, 10, 5, seed=0)
     self.assertEqual(_to_java_object_rdd(data).count(), 10)
Example No. 5
 def test_to_java_object_rdd(self):  # SPARK-6660
     data = RandomRDDs.uniformRDD(self.sc, 10, 5, seed=0)
     self.assertEqual(_to_java_object_rdd(data).count(), 10)
Example No. 6
def gen_poly(x):
    x1 = float(x[0])
    x2 = float(np.power(x1,2))
    x3 = float(np.power(x1,3))
    X  = Vectors.dense(x1,x2,x3)  
    epsilon = float(x[1])
    y  = 4.0 + 6.0*x1 + 2.0*x2 - 1*x3 + epsilon
    return(y,X)

def gen_df_ml(sc=None,
              B=(4, 6, 2, -1),
              n=10000,
              rng=(-2, 4),
              err=(0, 4)):
    # `get_sc()` is assumed to return the active SparkContext; note that gen_poly
    # above hard-codes the default coefficients, so B is kept only for signature
    # parity with the R helper below.
    sc = get_sc() if sc is None else sc
    # Rescale U(0, 1) draws into `rng`, and N(0, 1) draws into N(err[0], err[1]^2).
    x1 = RandomRDDs.uniformRDD(sc, n).map(lambda x: (rng[1] - rng[0]) * x + rng[0])
    epsilon = RandomRDDs.normalRDD(sc, n).map(lambda x: err[0] + err[1] * x)
    dat_df = x1.zip(epsilon).map(gen_poly)
    return dat_df
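
# Usage sketch (added): assumes an active SparkContext `sc` and the definitions above.
sample = gen_df_ml(sc=sc, n=1000)
print(sample.take(2))  # [(y, DenseVector([x1, x1**2, x1**3])), ...]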



gen_df_sparklyr <- function(sc=get_sc(),
                      B=c(4,6,2,-1),
                      n=10000,
                      rng=c(-2,4),
                      err=c(0,4)) {
  # gen_dat_r() is assumed to be defined elsewhere and to return an R data frame.
  dat <- gen_dat_r(B, n, rng, err)
  return(copy_to(sc, dat, "df", overwrite = TRUE))
}
Example No. 7
# ## Generate Spark DataFrame Data
# We'll generate sample data for a multivariate linear regression with known coefficients and randomly generated error. Specifically:
# $$ y = \beta_0 + \sum_{i=1}^{3} \beta_i x_i + \epsilon $$
# $$ \beta_0: 4 $$
# $$ \beta_1: 6 $$
# $$ \beta_2: 2 $$
# $$ \beta_3: -1 $$

from pyspark.mllib.random import RandomRDDs
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
import pandas as pd
import numpy as np

# `sc` is the active SparkContext; RandomRDDs takes a SparkContext, not a SparkSession.
x1 = RandomRDDs.uniformRDD(sc, 10000).map(lambda x: 6.0 * x - 2)   # x1 ~ U(-2, 4)
epsilon = RandomRDDs.normalRDD(sc, 10000).map(lambda x: 0.04 * x)  # noise ~ N(0, 0.04^2)


def gen_poly(x):
    x0 = 1.0
    x1 = float(x[0])
    x2 = float(np.power(x1, 2))
    x3 = float(np.power(x1, 3))
    X = Vectors.dense(x0, x1, x2, x3)
    epsilon = float(x[1])
    y = 4.0 + 6.0 * x1 + 2.0 * x2 - 1 * x3 + epsilon
    return (y, X)


gen_dat = x1.zip(epsilon).map(gen_poly)
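
# Sketch (added): the LabeledPoint import above hints at the next step; one way to
# turn the (y, X) pairs into labeled points for pyspark.mllib regression:
labeled = gen_dat.map(lambda p: LabeledPoint(p[0], p[1]))
print(labeled.take(2))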
Example No. 8
from pyspark.mllib.random import RandomRDDs
from pyspark import SparkContext
from pyspark.sql import Row, SparkSession
import pyspark.sql.functions as f
from pyspark.sql.functions import coalesce

sc = SparkContext()
spark = SparkSession(sc)

Rdd = RandomRDDs.uniformRDD(sc, 100044, 2)
Rdd2 = RandomRDDs.uniformRDD(sc, 100044, 2)
Rdd_cons = Rdd.map(lambda x: 102.83547008547009 + 102.85047727 * x)
Rdd_cons = Rdd_cons.sortBy(lambda x: x)
Rdd_pop = Rdd2.map(lambda x: 3401 + 150000 * x)
Rdd_pop = Rdd_pop.sortBy(lambda x: x)
Rdd_pop = Rdd_pop.map(lambda x: int(x + 6071639))
mois = []
for i in range(100044):
    mois.append(i + 1)
Rdd_mois = sc.parallelize(mois, 2)
colone1 = Row("consomation")
colone2 = Row("population")
colone3 = Row("mois")
df_cons = Rdd_cons.map(colone1).toDF()
df_pop = Rdd_pop.map(colone2).toDF()
df_mois = Rdd_mois.map(colone3).toDF()
# Attach a per-row id to each DataFrame and join the three on it.  Note that
# monotonically_increasing_id() depends on partitioning, so the ids only line up
# across DataFrames when each has identical per-partition row counts; zipWithIndex()
# on the underlying RDDs (sketched below) gives a deterministic key.
df_mois = df_mois.withColumn('ligne_id', f.monotonically_increasing_id())
df_pop = df_pop.withColumn('ligne_id', f.monotonically_increasing_id())
df_cons = df_cons.withColumn('ligne_id', f.monotonically_increasing_id())
df = df_mois.join(df_pop, on=["ligne_id"]).sort("ligne_id")
df = df.join(df_cons, on=["ligne_id"]).sort("ligne_id")
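
# Sketch (added): a deterministic join key built with zipWithIndex(), as an
# alternative to monotonically_increasing_id() when exact row alignment matters.
df_cons2 = Rdd_cons.zipWithIndex().toDF(["consomation", "ligne_id"])
df_pop2 = Rdd_pop.zipWithIndex().toDF(["population", "ligne_id"])
df_mois2 = Rdd_mois.zipWithIndex().toDF(["mois", "ligne_id"])
df2 = df_mois2.join(df_pop2, "ligne_id").join(df_cons2, "ligne_id").sort("ligne_id")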
"""

Testing with Random data generation

https://spark.apache.org/docs/latest/mllib-statistics.html

"""

from pyspark.mllib.random import RandomRDDs
from pyspark import SparkContext

sc = SparkContext("local", "Rubbish")

# Generate a random double RDD that contains 1 million i.i.d. values drawn from the
# uniform distribution `U(0, 1)`, evenly distributed in 10 partitions.
u = RandomRDDs.uniformRDD(sc, 1000000, 10)
# Apply a transform to get a random double RDD following `U(1, 3)`.
v = u.map(lambda x: 1.0 + 2.0 * x)

print(v.take(5))  # printing the RDD object itself would only show its repr
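
# Sketch (added): basic summary statistics for the transformed RDD, in the spirit of
# the mllib-statistics page linked above.
stats = v.stats()  # StatCounter with count, mean, stdev, min and max
print(stats.mean(), stats.stdev())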
Example No. 10
import random

import numpy as np


def genTS(k, startk, tslen, s):
    """Generate one synthetic time series of length `tslen` via an inverse FFT."""
    random.seed(s)

    # Random spectral magnitudes, with k "boosted" frequencies starting at index startk.
    freecoefs = [random.random() for i in range(tslen // 2 - 1 - k)]
    w = [1 for i in range(k)]
    bw = (tslen * tslen // 2 - sum(freecoefs)) / sum(w)

    coefs = np.sqrt(np.concatenate((freecoefs[:startk], bw * np.array(w), freecoefs[startk:])))
    coefs_all = np.concatenate(([0.], coefs, [(random.random() - 0.5) * 2], coefs[::-1]))

    # Random phases, mirrored so that the inverse FFT is (essentially) real-valued.
    angles = np.array([(random.random() - 0.5) * 2 * np.pi for i in range(tslen // 2 - 1)])
    angles_all = np.concatenate(([0.], angles, [0.], -angles[::-1]))

    tsft = [r * np.exp(1j * fi) for r, fi in zip(coefs_all, angles_all)]
    ts = np.fft.ifft(tsft).real + np.array([random.gauss(0, 1) / 4 for i in range(tslen)])
    return ts


from pyspark import SparkConf, SparkContext
from pyspark.mllib.random import RandomRDDs

conf = SparkConf()
sc = SparkContext(conf = conf)
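
# (Added) Hypothetical parameter values; the original snippet assumes these already
# exist, e.g. parsed from the command line.
tsnum = 100        # number of series to generate
k, startk = 3, 5   # number of boosted frequencies and their starting index
tslen = 64         # length of each series
outfile = "/tmp/random_ts"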

# One uniform draw per series seeds genTS; each series is written out as a CSV line.
RandomRDDs.uniformRDD(sc, tsnum).zipWithIndex() \
    .map(lambda ri: (ri[1], genTS(k, startk, tslen, int(ri[0] * 1000) + (ri[1] + 1) * 1000))) \
    .map(lambda its: str(its[0]) + ',' + ','.join(["%.5f" % x for x in its[1]])) \
    .saveAsTextFile(outfile)