import json

from pyspark import SparkContext


def SearchTiles_and_Factorize(n):
    global globalmergedtiles
    global globalcoordinates
    global factors_accum
    global spcon
    spcon = SparkContext("local[4]", "Spark_TileSearch_Optimized")
    if persisted_tiles == True:
        tileintervalsf = open("/home/shrinivaasanka/Krishna_iResearch_OpenSource/GitHub/asfer-github-code/cpp-src/miscellaneous/DiscreteHyperbolicFactorizationUpperbound_TileSearch_Optimized.tileintervals", "r")
        tileintervalslist = tileintervalsf.read().split("\n")
        # print("tileintervalslist=", tileintervalslist)
        tileintervalslist_accum = spcon.accumulator(tileintervalslist, VectorAccumulatorParam())
        paralleltileintervals = spcon.parallelize(tileintervalslist)
        paralleltileintervals.foreach(tilesearch)
    else:
        factorsfile = open("DiscreteHyperbolicFactorizationUpperbound_TileSearch_Optimized.factors", "w")
        hardy_ramanujan_ray_shooting_queries(n)
        hardy_ramanujan_prime_number_theorem_ray_shooting_queries(n)
        baker_harman_pintz_ray_shooting_queries(n)
        cramer_ray_shooting_queries(n)
        zhang_ray_shooting_queries(n)
        factors_accum = spcon.accumulator(factors_of_n, FactorsAccumulatorParam())
        # spcon.parallelize(xrange(1, n)).foreach(tilesearch_nonpersistent)
        spcon.parallelize(spcon.range(1, n).collect()).foreach(tilesearch_nonpersistent)
        print("factors_accum.value = ", factors_accum.value)
        factors = []
        factordict = {}
        for f in factors_accum.value:
            factors += f
        factordict[n] = factors
        json.dump(factordict, factorsfile)
        return factors
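# VectorAccumulatorParam and FactorsAccumulatorParam are used above but not
# defined in this snippet. A minimal sketch of such a custom accumulator,
# assuming the accumulated value is a list of factor lists merged by
# concatenation; the actual definitions in the source repository may differ.
from pyspark.accumulators import AccumulatorParam


class FactorsAccumulatorParam(AccumulatorParam):
    def zero(self, initialValue):
        # every task starts from an empty list of factors
        return []

    def addInPlace(self, v1, v2):
        # merge the partial factor lists reported by the workers
        return v1 + v2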
from __future__ import print_function

import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

if __name__ == "__main__":
    sc = SparkContext(appName="PythonStreamingNetworkWordCount")
    rdd = sc.range(1, 1000)
    counts = rdd.map(lambda i: i * 2)
    counts.saveAsTextFile("s3://uryyyyyyy-sandbox/py.log")
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import StructType, StructField, StringType, LongType

# setup spark context
sc = SparkContext("local", "data_processor")
sqlC = SQLContext(sc)

# create dummy data frames
rdd1 = sc.range(0, 10000000).map(lambda x: ("key " + str(x), x)).repartition(100)
rdd2 = sc.range(0, 10000).map(lambda x: ("key " + str(x), x)).repartition(10)

# Define schemas (the second column holds integers, so use LongType)
schema = StructType([
    StructField("Id", StringType(), True),
    StructField("Packsize", LongType(), True)
])
schema2 = StructType([
    StructField("Id2", StringType(), True),
    StructField("Packsize", LongType(), True)
])

df1 = sqlC.createDataFrame(rdd1, schema)
df2 = sqlC.createDataFrame(rdd2, schema2)

print(df1.rdd.getNumPartitions())
print(df2.rdd.getNumPartitions())
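# The two frames above are only inspected for partition counts. A hedged
# illustration of one plausible next step (not part of the original snippet):
# join them on the key columns, broadcasting the small frame so the large one
# is not shuffled.
from pyspark.sql.functions import broadcast

joined = df1.join(broadcast(df2), df1.Id == df2.Id2)
joined.explain()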
import os

from pyspark import SparkContext

os.environ['SPARK_HOME'] = 'F:/hadoop/spark-2.3.2-bin-hadoop2.7'
os.environ['PYSPARK_PYTHON'] = 'D:/ProgramData/Anaconda3/envs/tfColne/python.exe'

if __name__ == '__main__':
    sc = SparkContext('local', 'test')

    test_data = sc.range(0, 100)
    td = test_data.map(lambda d: (d % 5, d))
    ts = td.reduceByKey(lambda a, b: a + b)

    dt = sc.range(10)
    dm = dt.map(lambda d: (d % 5, d))
    dg = dm.reduceByKey(lambda a, b: (a, b))
    ds = dg.sortByKey(ascending=False)

    print(ts.collect())
    print(ds.collect())
    print(ts.join(ds).collect())

    broad = sc.parallelize([1, 8, 2, 4])
    acc = sc.accumulator(0)
    broad.foreach(lambda x: acc.add(x))
    print(acc.value)
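    # Note: despite its name, `broad` above is an ordinary parallelized RDD,
    # not a broadcast variable. For comparison, a hedged sketch of an actual
    # broadcast variable; the lookup table is an illustration only.
    lookup = sc.broadcast({0: 'a', 1: 'b', 2: 'c', 3: 'd', 4: 'e'})
    labelled = sc.range(0, 100).map(lambda d: (lookup.value[d % 5], d))
    print(labelled.take(5))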
from pyspark.sql import functions as F

# add a constant column
df = changedTypedf.withColumn("Literal", F.lit(0))

# derived columns; attribute and bracket access are equivalent
df.withColumn("meta", df.setosa_dbl * 2).show()
df.withColumn("meta", df['setosa_dbl'] * 2).show()

# Spark DataFrames do not support item assignment the way pandas does
# (changedTypedf['mittens'] = ... raises TypeError); use withColumn instead
changedTypedf = changedTypedf.withColumn("mittens", changedTypedf['setosa_dbl'] * 10)

# a DataFrame has no .sum() method; aggregate through agg()
changedTypedf.agg(F.sum('setosa_dbl')).show()

df_header.take(2)
df_header.head(4)

# Displays the content of the DataFrame to stdout
df.show()

# .assign is pandas-only; convert to pandas first for pandas-style operations
result_pdf = df.select("*").toPandas()
result_pdf.dtypes

# write out as Parquet
df.write.parquet("output/proto.parquet")

# Delta Lake write; this assumes spark_context is actually a SparkSession,
# since SparkContext.range returns an RDD, which has no .write attribute
data = spark_context.range(0, 5)
data.write.format("delta").save("/tmp/delta-table")
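# Hedged follow-up: read back what was written above, assuming `spark` is the
# active SparkSession and the Delta Lake package is on the classpath. This is
# an illustration, not part of the original snippet.
parquet_df = spark.read.parquet("output/proto.parquet")
parquet_df.show()

delta_df = spark.read.format("delta").load("/tmp/delta-table")
delta_df.show()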
import sys
from math import sqrt

from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

# ... (snippet begins mid-expression in the original)
#     int(x[0]) - 1, 1.0))
degrees = upper_entries.map(lambda entry: (entry.i, entry.value)).reduceByKey(
    lambda a, b: a + b)
W = CoordinateMatrix(upper_entries.union(lower_entries), numCols=N, numRows=N)

# XXX:
laplacian = sys.argv[1]
if laplacian == 'unnormalized':
    entries = degrees.map(lambda x: MatrixEntry(x[0], x[0], x[1]))
    D = CoordinateMatrix(entries, numCols=N, numRows=N)
    L = D.toBlockMatrix().subtract(W.toBlockMatrix()).toCoordinateMatrix()
elif laplacian == 'normalized':
    entries = degrees.map(lambda x: MatrixEntry(x[0], x[0], 1 / x[1]))
    D_inv = CoordinateMatrix(entries, numCols=N, numRows=N).toBlockMatrix()
    I = CoordinateMatrix(sc.range(N).map(lambda i: MatrixEntry(i, i, 1.0)),
                         numCols=N, numRows=N).toBlockMatrix()
    L = I.subtract(D_inv.multiply(W.toBlockMatrix())).toCoordinateMatrix()
elif laplacian == 'symmetric':
    entries = degrees.map(lambda x: MatrixEntry(x[0], x[0], 1 / sqrt(x[1])))
    D_invsq = CoordinateMatrix(entries, numCols=N, numRows=N).toBlockMatrix()
    I = CoordinateMatrix(sc.range(N).map(lambda i: MatrixEntry(i, i, 1.0)),
                         numCols=N, numRows=N)
    tmp = D_invsq.multiply(W.toBlockMatrix()).multiply(D_invsq)
    L = I.toBlockMatrix().subtract(tmp).toCoordinateMatrix()
else:
    raise ValueError('Unknown type of Laplacian.')

## SVD, and transform from dense matrix to dataframe.
svd = L.toRowMatrix().computeSVD(k=K, computeU=False)
V = svd.V.toArray().tolist()
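# The construction of upper_entries and lower_entries is cut off above. A
# hypothetical sketch of how they might be built, assuming a text file of
# 1-indexed edge pairs and an unweighted (1.0) affinity; the file name and
# format are assumptions, not taken from the original.
edges = sc.textFile("edges.txt").map(lambda line: line.split())
upper_entries = edges.map(lambda x: MatrixEntry(int(x[0]) - 1, int(x[1]) - 1, 1.0))
lower_entries = upper_entries.map(lambda e: MatrixEntry(e.j, e.i, e.value))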
import pprint as pp
from time import time

nexecutors = 4
print('Cannot determine number of executors, using default value {}'.format(
    nexecutors))
print('\n')


def prox_method(x):
    from mod import spark_method
    return spark_method(x)


start = time()
results = sc.range(nexecutors, numSlices=nexecutors).map(prox_method).collect()

print('Results:')
pp.pprint(results)
print('Result length: {}'.format(len(results)))
print('Duration: {:.2f} s'.format(time() - start))

# print('\nUnique results:')
# print(set(results))

# nums = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 20])
# print(nums.collect())
# sumAndCount = nums.map(lambda x: (x, 1)).fold((0, 0), (lambda x, y: (x[0] + y[0], x[1] + y[1])))
# print(sumAndCount)
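# The fallback message above suggests an earlier attempt to detect the
# executor count. A hedged sketch of one way to do that; note that
# spark.executor.instances is only present when explicitly configured,
# hence the default.
conf_value = sc.getConf().get('spark.executor.instances', '')
if conf_value:
    nexecutors = int(conf_value)
else:
    nexecutors = 4
    print('Cannot determine number of executors, using default value {}'.format(nexecutors))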
import sys
from math import hypot

from pyspark import SparkContext
from pyspark.mllib.random import RandomRDDs

sc = SparkContext()

# Project Euler Problem 1: sum of the multiples of 3 or 5 below 1000
print(sc.range(1000)
        .filter(lambda candidate: candidate % 3 == 0 or candidate % 5 == 0)
        .sum())

# Approximating Pi using Monte Carlo integration
radius = 1


def dist(p):
    return hypot(p[0], p[1])


num_samples = int(sys.argv[1])
unit_square = RandomRDDs.uniformVectorRDD(sc, num_samples, 2)
hit = unit_square.map(dist).filter(lambda d: d < radius).count()
fraction = hit / num_samples
# the quarter circle covers ~pi/4 of the unit square, so scaling the hit
# fraction by the square's area (2*radius)**2 approximates pi
print(fraction * (2 * radius) ** 2)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
from operator import add
from random import random

from pyspark import SparkContext


def f(_):
    x = random() * 2 - 1
    y = random() * 2 - 1
    return 1 if x**2 + y**2 < 1 else 0


if __name__ == "__main__":
    """
    Usage: pi [partitions]
    """
    sc = SparkContext(appName="PySpark_Pi")
    partitions = int(sys.argv[1]) if len(sys.argv) > 1 else 2
    n = 100000 * partitions

    # sc.range already returns an RDD, so it must not be re-parallelized;
    # pass the partition count via numSlices instead
    count = sc.range(1, n + 1, numSlices=partitions).map(f).reduce(add)
    print("Pi is roughly ", 4.0 * count / n)

    sc.stop()
import math as m
import os
import socket

from pyspark import SparkConf, SparkContext

# The plastic number, $\rho$, is the unique real solution to
# the cubic equation $x^3 = x + 1$.
# We use it for generating a quasi-random sequence in 2D for
# the initial set of complex numbers later.
rho = ((9 + m.sqrt(69)) / 18)**(1 / 3) + ((9 - m.sqrt(69)) / 18)**(1 / 3)

# Obtain the Spark Context
sc = SparkContext(conf=SparkConf())
print(sc)

# Print configurations.
print(sc.getConf().getAll())

# Create an RDD with 64 elements.
n_rdd = sc.range(1, 2**6 + 1)

# Print how the RDD gets mapped.
print(
    n_rdd.map(lambda x: (socket.gethostname(), os.getppid(), os.getpid())).
    distinct().collect())

# Mark for cache() in memory
points_rdd = n_rdd.map(
    lambda n: (n / rho % 1) / 10 + (n / (rho * rho) % 1) / 10 * 1j).cache()

# Print the total number of partitions
print(points_rdd.getNumPartitions())

# Print the number of elements in each partition
print(points_rdd.glom().map(len).collect())
""" Purpose: Date created: 2020-04-19 Contributor(s): Mark M. """ from __future__ import print_function from pyspark import SparkContext # from pyspark.sql import SparkSession # spark = SparkSession.builder.appName("test1").getOrCreate() sc = SparkContext(appName="matrices1") rdd = sc.parallelize([ 1, 2, ]) sorted(rdd.cartesian(rdd).collect()) n = 10 rng = sc.range(1, n + 1) sum_ = rng.sum() print(f"The sum of the numbers from 1 to 10 is: {sum_}") sc.stop()