Example #1
def main():
    if len(sys.argv) == 2:
        length = int(sys.argv[1])  # row count taken from the command line
    else:
        length = 10**3
    print("26")  # debug marker
    data0_1 = RandomRDDs.uniformVectorRDD(sc, length, 3) \
        .map(lambda a: a.round(3).tolist()) \
        .toDF()
    print("30")
    name = "random_data{}.parquet".format(333)  # random.randrange(200))
    print("using name=" + name)
    data0_1.write.parquet(name)
    print("33")

    read_df = spark.read.parquet(name)
    # print(f"Read {read_df.count()} records. Should be {length} records.")  # f-string form needs Python 3.6+
    print("Read {} records. Should be {} records.".format(
        read_df.count(), length))
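
The snippet above assumes that sc (a SparkContext) and spark (a SparkSession) already exist at module level. A minimal setup sketch, with an illustrative application name:

import sys
from pyspark.sql import SparkSession
from pyspark.mllib.random import RandomRDDs

# Hypothetical setup: the excerpt above defines neither sc nor spark.
spark = SparkSession.builder.appName("random-parquet-example").getOrCreate()
sc = spark.sparkContext

if __name__ == "__main__":
    main()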
Example #2
def generate_csv_hdfs(spark, row, col, path, num_partition=3):
    # Generate a row x col matrix of uniform random values and save it to HDFS as CSV text files.
    sc = spark.sparkContext
    rdd = RandomRDDs.uniformVectorRDD(sc, row, col, num_partition)
    lines = rdd.map(toCSVLine)
    lines.saveAsTextFile(path)
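
toCSVLine is not defined in this excerpt. A minimal sketch of such a helper, assuming each record is a vector of floats:

def toCSVLine(vec):
    # Hypothetical helper: render one random vector as a comma-separated line.
    return ",".join(str(x) for x in vec)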
Example #3
File: pi.py  Project: yzyz/spark-warmups
import sys
from pyspark import SparkContext
from pyspark.mllib.random import RandomRDDs
from math import hypot


def dist(p):
    # Distance from the point p (sampled in the unit square) to the square's center (0.5, 0.5).
    return hypot(p[0] - 0.5, p[1] - 0.5)


sc = SparkContext("local", "Monte Carlo Integration Pi Approximation")

num_samples = int(sys.argv[1])

# Sample num_samples points uniformly from the unit square and count those inside the
# inscribed circle of radius 0.5; that fraction approaches pi/4, so times 4 it approximates pi.
a = RandomRDDs.uniformVectorRDD(sc, num_samples, 2)
num = a.map(dist).filter(lambda d: d < 0.5).count()

print(4 * num / num_samples)
Example #4
def generate_random_uniform_df(nrows, ncols):
    # Assumes `spark` is an active SparkSession in the enclosing scope.
    df = RandomRDDs.uniformVectorRDD(spark.sparkContext, nrows, ncols) \
        .map(lambda a: a.tolist()) \
        .toDF()
    return df
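
A usage sketch for the helper above, assuming an active SparkSession bound to the name spark (the sizes are illustrative):

random_df = generate_random_uniform_df(1000, 4)  # 1000 rows, 4 columns of uniform values in [0, 1)
random_df.show(5)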
Example #5
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.random import RandomRDDs
from time import time

print("########################################")
print("STARTING")
print("########################################")

sc = SparkContext(appName="speedtest-nrb")
sql_context = SQLContext(sc)
start = time()

t = time()
print("creating dataframes df and df2")
# Two DataFrames of 100 million random 2-column rows each; creation is lazy, so this
# step is fast and the real work shows up in the count() timings below.
df = RandomRDDs.uniformVectorRDD(sc, 100000000,
                                 2).map(lambda a: a.tolist()).toDF()
df2 = RandomRDDs.uniformVectorRDD(sc, 100000000,
                                  2).map(lambda a: a.tolist()).toDF()
print(time() - t)

t = time()
print("counting df")
df.count()
print(time() - t)

t = time()
print("counting df again")
df.count()
print(time() - t)

t = time()
Example #6
def generate_spark_matrix(nrows: int, ncols: int, spark):
    # Assumes `partition_factor` is defined in the enclosing scope; it sets the
    # approximate number of rows per partition.
    df = RandomRDDs.uniformVectorRDD(spark.sparkContext, nrows, ncols) \
        .map(lambda a: a.tolist()) \
        .toDF() \
        .repartition(int(nrows / partition_factor)) \
        .persist()
    return df
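
partition_factor is not defined in this excerpt. A usage sketch, assuming it is a module-level constant giving the target number of rows per partition (the value and sizes are illustrative):

partition_factor = 100000  # hypothetical: aim for roughly 100k rows per partition

matrix_df = generate_spark_matrix(1000000, 8, spark)  # assumes an existing SparkSession named spark
print(matrix_df.rdd.getNumPartitions())  # -> 10 partitions with these numbers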
Example #7
from pyspark import SparkContext
from pyspark.mllib.random import RandomRDDs
from math import hypot
import sys

sc = SparkContext()

# Project Euler Problem 1: sum the multiples of 3 or 5 below 1000.

print(sc.range(1000).filter(lambda candidate: candidate % 3 == 0 or candidate % 5 == 0).sum())

# Approximating Pi using Monte Carlo integration

radius = 1


def dist(p):
    # Distance from the sampled point to the origin.
    return hypot(p[0], p[1])


num_samples = int(sys.argv[1])

# Points are uniform in the unit square [0, 1)^2, which contains one quarter of the
# circle of radius 1 centred at the origin, so hit / num_samples tends to pi / 4.
unit_square = RandomRDDs.uniformVectorRDD(sc, num_samples, 2)
hit = unit_square.map(dist).filter(lambda d: d < radius).count()
fraction = hit / num_samples

print(fraction * (2 * radius) ** 2)  # fraction ~ pi/4, times (2*radius)**2 = 4, gives pi