def test_hash_serializer(self):
    hash(NoOpSerializer())
    hash(UTF8Deserializer())
    hash(CPickleSerializer())
    hash(MarshalSerializer())
    hash(AutoSerializer())
    hash(BatchedSerializer(CPickleSerializer()))
    hash(AutoBatchedSerializer(MarshalSerializer()))
    hash(PairDeserializer(NoOpSerializer(), UTF8Deserializer()))
    hash(CartesianDeserializer(NoOpSerializer(), UTF8Deserializer()))
    hash(CompressedSerializer(CPickleSerializer()))
    hash(FlattenedValuesSerializer(CPickleSerializer()))
def __init__(self):
    # Setup PySpark. This is needed until PySpark becomes available on PyPI,
    # after which we can simply add it to requirements.txt.
    _setup_pyspark()
    from pyspark.conf import SparkConf
    from pyspark.context import SparkContext
    from pyspark.serializers import MarshalSerializer

    # Create a temporary .zip lib file for Metis, which will be copied over to
    # Spark workers so they can unpickle Metis functions and objects.
    metis_lib_file = tempfile.NamedTemporaryFile(suffix='.zip', delete=False)
    metis_lib_file.close()
    _copy_lib_for_spark_workers(metis_lib_file.name)

    # Also ship the Metis lib file so worker nodes can deserialize Metis
    # internal data structures.
    conf = SparkConf()
    conf.setMaster(app.config['SPARK_MASTER'])
    conf.setAppName('chronology:metis')
    parallelism = int(app.config.get('SPARK_PARALLELISM', 0))
    if parallelism:
        conf.set('spark.default.parallelism', parallelism)
    self.context = SparkContext(conf=conf,
                                pyFiles=[metis_lib_file.name],
                                serializer=MarshalSerializer())

    # Delete temporary Metis lib file.
    os.unlink(metis_lib_file.name)

    # We'll use this to parallelize fetching events in KronosSource.
    # The default of 8 is from:
    # https://spark.apache.org/docs/latest/configuration.html
    self.parallelism = parallelism or 8
def test_zip_with_different_serializers(self):
    a = self.sc.parallelize(range(5))
    b = self.sc.parallelize(range(100, 105))
    self.assertEqual(a.zip(b).collect(),
                     [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)])
    a = a._reserialize(BatchedSerializer(PickleSerializer(), 2))
    b = b._reserialize(MarshalSerializer())
    self.assertEqual(a.zip(b).collect(),
                     [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)])
def spark_main():
    from pyspark import SparkContext, SparkConf
    from pyspark.serializers import MarshalSerializer

    prefix = 'bfs'
    conf = SparkConf().setAppName("CSP").setMaster("local[*]")
    sc = SparkContext(conf=conf, serializer=MarshalSerializer())
    sc.setLogLevel("WARN")

    # Write the fourth step; if it already exists, that's fine!
    #node_to_file(sc, sc.parallelize(SEP_4), '%s/dim-4/final.visited' % (prefix,))
    #discover_prefs_spark(sc, prefix, 5)
    discover_prefs_spark(sc, prefix, 6)
def test_zip_with_different_serializers(self):
    a = self.sc.parallelize(range(5))
    b = self.sc.parallelize(range(100, 105))
    self.assertEqual(a.zip(b).collect(),
                     [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)])
    a = a._reserialize(BatchedSerializer(CPickleSerializer(), 2))
    b = b._reserialize(MarshalSerializer())
    self.assertEqual(a.zip(b).collect(),
                     [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)])

    # regression test for SPARK-4841
    path = os.path.join(SPARK_HOME, "python/test_support/hello/hello.txt")
    t = self.sc.textFile(path)
    cnt = t.count()
    self.assertEqual(cnt, t.zip(t).count())
    rdd = t.map(str)
    self.assertEqual(cnt, t.zip(rdd).count())

    # regression test for bug in _reserializer()
    self.assertEqual(cnt, t.zip(rdd).count())
from pyspark import SparkContext
from pyspark.serializers import MarshalSerializer
import sys
import csv

inputfile = sys.argv[1]
outputfile = sys.argv[2]

sc = SparkContext('local', 'task1', serializer=MarshalSerializer())
infile = sc.textFile(inputfile)

m1 = infile.map(lambda x: x.split(','))  # split lines
m2 = m1.filter(lambda x: 'userId' not in x)  # remove header row
m3 = m2.map(lambda x: (int(x[1]), (float(x[2]), 1.0)))  # key -> (value, count)
m4 = m3.reduceByKey(
    lambda x, y: (x[0] + y[0], x[1] + y[1]))  # key -> (total value, total count)

# final output: map x -> (x, avg_rating(x))
m5 = m4.map(lambda x: (x[0], x[1][0] / x[1][1])).sortByKey(ascending=True).collect()

# Open in text mode with newline='' so csv.writer works on Python 3
# (the original 'wb' mode only works on Python 2).
with open(outputfile, 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["movieId", "rating_avg"])
    for key, value in m5:
        writer.writerow([key, value])
from pyspark.context import SparkContext
from pyspark.serializers import MarshalSerializer

sc = SparkContext("local", "serialization app", serializer=MarshalSerializer())
print(sc.parallelize(list(range(1000))).map(lambda x: 2 * x).take(10))
sc.stop()
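The snippet above opts into MarshalSerializer purely for speed. A minimal standalone sketch of the trade-off involved (an illustration added here, not taken from any of the snippets): Python's marshal module, which backs MarshalSerializer, only handles built-in types, while pickle, which backs the default serializer, also accepts custom objects.

# Illustration only (assumed example, not part of the original snippets):
# why MarshalSerializer supports fewer types than the default pickle-based one.
import marshal
import pickle


class Point:
    def __init__(self, x, y):
        self.x, self.y = x, y


print(len(marshal.dumps([1, "a", (2.0, 3)])))  # built-in types marshal fine
print(len(pickle.dumps(Point(1, 2))))          # pickle handles custom classes
try:
    marshal.dumps(Point(1, 2))                 # marshal does not
except ValueError as exc:
    print("marshal rejected the custom object:", exc)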
import sys

from pyspark import SparkConf, SparkContext
from pyspark.serializers import MarshalSerializer


def main(sc):
    # NOTE: parallelizing a string yields an RDD with one element per character.
    file = sc.parallelize("a,b,c,c,d,c,f,t,t")
    print(file.glom().collect())
    file = file.repartition(3)
    print(file.toDebugString())

    # NOTE: SparkContext has no dumps() method and my_object is undefined,
    # so this block always falls through to the except branch.
    try:
        sc.dumps(my_object)
    except:
        print("Unable to serialize the object", sys.exc_info()[0])

    print(file.getNumPartitions())
    rdd0 = file.flatMap(lambda x: x.split(","))
    rdd1 = rdd0.map(lambda w: (w, 1))
    rdd2 = rdd1.reduceByKey(lambda a, b: a + b)
    print(rdd2.toDebugString())
    print(rdd2.collect())

    d = rdd2.glom().collect()
    f = open("E:/b.txt", 'a')
    f.write(str(d) + '\n')


if __name__ == "__main__":
    conf = SparkConf().setAppName("wordcount") \
        .setMaster("local")
    sc = SparkContext(conf=conf, serializer=MarshalSerializer())
    main(sc)
def setup_spark_context(self):
    # http://spark.apache.org/docs/latest/configuration.html
    conf = SparkConf().setAll((
        ("spark.python.profile", "true" if self.args.profile else "false"),

        # Protect against memory leaks (which we seem to have at the moment)
        ("spark.python.worker.reuse", "true" if config["ENV"] in ("ci", ) else "false"),

        ("spark.ui.enabled", "false" if config["ENV"] in ("ci", ) else "true"),
        ("spark.task.maxFailures", "10"),
        ("spark.locality.wait", "10s"),
        ("spark.locality.wait.node", "10s"),
        ("spark.locality.wait.process", "10s"),
        ("spark.locality.wait.rack", "10s"),
        ("spark.sql.warehouse.dir", "/tmp/spark-warehouse"),

        # http://deploymentzone.com/2015/12/20/s3a-on-spark-on-aws-ec2/
        # https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html
        ("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
        ("spark.hadoop.fs.s3a.access.key", os.getenv("AWS_ACCESS_KEY_ID", "")),
        ("spark.hadoop.fs.s3a.secret.key", os.getenv("AWS_SECRET_ACCESS_KEY", "")),
        ("spark.hadoop.fs.s3a.buffer.dir", "/tmp"),
        ("spark.hadoop.fs.s3a.connection.maximum", "100"),
        ("spark.hadoop.fs.s3a.endpoint", "s3-external-1.amazonaws.com"),  # us-east-1
        # ("spark.hadoop.fs.s3a.fast.upload", "true"),  # Buffer directly from memory to S3

        ("spark.sql.parquet.mergeSchema", "false"),
        ("spark.sql.parquet.cacheMetadata", "true"),
        ("spark.sql.parquet.compression.codec", "gzip"),  # snappy, lzo
        ("spark.hadoop.parquet.enable.summary-metadata", "false"),
        ("spark.hadoop.parquet.metadata.read.parallelism", "100"),
        ("spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version", "2"),
        ("spark.hadoop.mapreduce.fileoutputcommitter.marksuccessfuljobs", "true"),
        ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
        ("spark.speculation", "false"),
        # ("spark.sql.parquet.enableVectorizedReader", "false")

        # TODO https://groups.google.com/forum/#!topic/spark-users/YnAlw7dVdQA ?
        # set("spark.akka.frameSize", "128")
    ))

    executor_environment = {"IS_SPARK_EXECUTOR": "1"}
    if config["ENV"] == "prod":
        executor_environment.update({
            "PYTHONPATH": "/cosr/back",
            "PYSPARK_PYTHON": "/cosr/back/venv/bin/python",
            "LD_LIBRARY_PATH": "/usr/local/lib"
        })

    from pyspark.serializers import MarshalSerializer

    sc = SparkContext(
        appName=self.name,
        conf=conf,
        environment=executor_environment,
        serializer=MarshalSerializer()
    )
    sqlc = SQLContext(sc)

    if config["ENV"] != "prod":
        sc.parallelize(list(range(4)), 4).foreach(setup_spark_worker)

    return sc, sqlc