import sys

from pyspark.sql import SparkSession
from pyspark.mllib.random import RandomRDDs

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext


def main():
    if len(sys.argv) == 2:
        length = int(sys.argv[1])  # argv entries are strings; cast before use
    else:
        length = 10**3
    # 3-column DataFrame of uniform [0, 1) doubles, rounded to 3 decimals
    data0_1 = RandomRDDs.uniformVectorRDD(sc, length, 3) \
        .map(lambda a: a.round(3).tolist()) \
        .toDF()
    name = "random_data{}.parquet".format(333)  # random.randrange(200))
    print("using name=" + name)
    data0_1.write.parquet(name)
    read_df = spark.read.parquet(name)
    print("Read {} records. Should be {} records.".format(read_df.count(), length))


if __name__ == "__main__":
    main()
from pyspark.mllib.random import RandomRDDs


def toCSVLine(vec):
    # assumed helper (defined elsewhere in the original): one CSV line per vector
    return ",".join(str(x) for x in vec)


def generate_csv_hdfs(spark, row, col, path, num_partition=3):
    sc = spark.sparkContext
    rdd = RandomRDDs.uniformVectorRDD(sc, row, col, num_partition)
    lines = rdd.map(toCSVLine)
    lines.saveAsTextFile(path)
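# A minimal usage sketch, assuming a local SparkSession; the output path is
# hypothetical, and saveAsTextFile fails if it already exists.
if __name__ == "__main__":
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("random-csv-demo").getOrCreate()
    generate_csv_hdfs(spark, 1000, 5, "hdfs:///tmp/random_uniform_csv")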
import sys
from math import hypot

from pyspark import SparkContext
from pyspark.mllib.random import RandomRDDs


def dist(p):
    # distance from the centre (0.5, 0.5) of the unit square
    return hypot(p[0] - 0.5, p[1] - 0.5)


sc = SparkContext("local", "Monte Carlo Integration Pi Approximation")
num_samples = int(sys.argv[1])
a = RandomRDDs.uniformVectorRDD(sc, num_samples, 2)
# points within 0.5 of the centre fill a circle of area pi / 4, so
# num / num_samples -> pi / 4 and 4 * num / num_samples -> pi
num = a.map(dist).filter(lambda d: d < 0.5).count()
print(4 * num / num_samples)
from pyspark.mllib.random import RandomRDDs


def generate_random_uniform_df(nrows, ncols):
    # relies on a global `spark` SparkSession being in scope
    df = RandomRDDs.uniformVectorRDD(spark.sparkContext, nrows, ncols) \
        .map(lambda a: a.tolist()) \
        .toDF()
    return df
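# Usage sketch under that assumption; toDF() auto-names the columns _1.._N.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sample_df = generate_random_uniform_df(100, 4)
sample_df.show(5)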
from time import time

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.random import RandomRDDs

print("########################################")
print("STARTING")
print("########################################")

sc = SparkContext(appName="speedtest-nrb")
sql_context = SQLContext(sc)

start = time()
t = time()
print("creating dataframes df and df2")
df = RandomRDDs.uniformVectorRDD(sc, 100000000, 2).map(lambda a: a.tolist()).toDF()
df2 = RandomRDDs.uniformVectorRDD(sc, 100000000, 2).map(lambda a: a.tolist()).toDF()
print(time() - t)

t = time()
print("counting df")
df.count()
print(time() - t)

t = time()
print("counting df")
df.count()  # df is counted a second time; without persist() it is recomputed
print(time() - t)

t = time()
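# Hedged aside, not in the original fragment (which continues past the trailing
# t = time()): to separate "first computation" from "cached re-read" timings,
# persist the DataFrame before the repeated counts:
#
#   df.persist()
#   df.count()         # pays the computation cost and fills the cache
#   t = time()
#   df.count()         # served from the cached partitions
#   print(time() - t)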
from pyspark.mllib.random import RandomRDDs


def generate_spark_matrix(nrows: int, ncols: int, spark):
    # partition_factor is assumed to be a module-level constant defined elsewhere
    df = RandomRDDs.uniformVectorRDD(spark.sparkContext, nrows, ncols) \
        .map(lambda a: a.tolist()) \
        .toDF() \
        .repartition(int(nrows / partition_factor)) \
        .persist()
    return df
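# Usage sketch; partition_factor = 50 is an arbitrary stand-in for the
# module-level constant the helper expects.
partition_factor = 50

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
matrix_df = generate_spark_matrix(5000, 10, spark)
print(matrix_df.rdd.getNumPartitions())  # nrows / partition_factor = 100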
import sys
from math import hypot

from pyspark import SparkContext
from pyspark.mllib.random import RandomRDDs

sc = SparkContext()

# Project Euler Problem 1: sum the multiples of 3 or 5 below 1000
print(sc.range(1000).filter(lambda candidate: candidate % 3 == 0 or candidate % 5 == 0).sum())

# Approximating Pi using Monte Carlo integration: uniform points on the unit
# square land inside the quarter circle of radius 1 with probability pi / 4
radius = 1


def dist(p):
    return hypot(p[0], p[1])


num_samples = int(sys.argv[1])
unit_square = RandomRDDs.uniformVectorRDD(sc, num_samples, 2)
hit = unit_square.map(dist).filter(lambda d: d < radius).count()
fraction = hit / num_samples
print(fraction * (2 * radius) ** 2)