Example #1
    def test_hash_serializer(self):
        hash(NoOpSerializer())
        hash(UTF8Deserializer())
        hash(CPickleSerializer())
        hash(MarshalSerializer())
        hash(AutoSerializer())
        hash(BatchedSerializer(CPickleSerializer()))
        hash(AutoBatchedSerializer(MarshalSerializer()))
        hash(PairDeserializer(NoOpSerializer(), UTF8Deserializer()))
        hash(CartesianDeserializer(NoOpSerializer(), UTF8Deserializer()))
        hash(CompressedSerializer(CPickleSerializer()))
        hash(FlattenedValuesSerializer(CPickleSerializer()))
Example #2
    def __init__(self):
        # Set up PySpark. This is needed until PySpark becomes available on PyPI,
        # after which we can simply add it to requirements.txt.
        _setup_pyspark()
        from pyspark.conf import SparkConf
        from pyspark.context import SparkContext
        from pyspark.serializers import MarshalSerializer

        # Create a temporary .zip lib file for Metis, which will be copied over to
        # Spark workers so they can unpickle Metis functions and objects.
        metis_lib_file = tempfile.NamedTemporaryFile(suffix='.zip',
                                                     delete=False)
        metis_lib_file.close()
        _copy_lib_for_spark_workers(metis_lib_file.name)

        # Also ship the Metis lib file so worker nodes can deserialize Metis
        # internal data structures.
        conf = SparkConf()
        conf.setMaster(app.config['SPARK_MASTER'])
        conf.setAppName('chronology:metis')
        parallelism = int(app.config.get('SPARK_PARALLELISM', 0))
        if parallelism:
            conf.set('spark.default.parallelism', parallelism)
        self.context = SparkContext(conf=conf,
                                    pyFiles=[metis_lib_file.name],
                                    serializer=MarshalSerializer())

        # Delete temporary Metis lib file.
        os.unlink(metis_lib_file.name)

        # We'll use this to parallelize fetching events in KronosSource.
        # The default of 8 is from:
        # https://spark.apache.org/docs/latest/configuration.html
        self.parallelism = parallelism or 8
Example #3
    def test_zip_with_different_serializers(self):
        a = self.sc.parallelize(range(5))
        b = self.sc.parallelize(range(100, 105))
        self.assertEqual(a.zip(b).collect(), [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)])
        a = a._reserialize(BatchedSerializer(PickleSerializer(), 2))
        b = b._reserialize(MarshalSerializer())
        self.assertEqual(a.zip(b).collect(), [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)])
Example #4
def spark_main():
    from pyspark import SparkContext, SparkConf

    from pyspark.serializers import MarshalSerializer

    prefix = 'bfs'

    conf = SparkConf().setAppName("CSP").setMaster("local[*]")
    sc = SparkContext(conf=conf, serializer=MarshalSerializer())
    sc.setLogLevel("WARN")

    # Write the fourth step; if it already exists, that's fine!
    #node_to_file(sc, sc.parallelize(SEP_4), '%s/dim-4/final.visited' % (prefix,))
    #discover_prefs_spark(sc, prefix, 5)
    discover_prefs_spark(sc, prefix, 6)
Example #5
    def test_zip_with_different_serializers(self):
        a = self.sc.parallelize(range(5))
        b = self.sc.parallelize(range(100, 105))
        self.assertEqual(a.zip(b).collect(), [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)])
        a = a._reserialize(BatchedSerializer(CPickleSerializer(), 2))
        b = b._reserialize(MarshalSerializer())
        self.assertEqual(a.zip(b).collect(), [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)])
        # regression test for SPARK-4841
        path = os.path.join(SPARK_HOME, "python/test_support/hello/hello.txt")
        t = self.sc.textFile(path)
        cnt = t.count()
        self.assertEqual(cnt, t.zip(t).count())
        rdd = t.map(str)
        self.assertEqual(cnt, t.zip(rdd).count())
        # regression test for bug in _reserializer()
        self.assertEqual(cnt, t.zip(rdd).count())
Example #6
from pyspark import SparkContext
from pyspark.serializers import MarshalSerializer
import sys
import csv

inputfile = sys.argv[1]
outputfile = sys.argv[2]

sc = SparkContext('local', 'task1', serializer=MarshalSerializer())

infile = sc.textFile(inputfile)

m1 = infile.map(lambda x: x.split(','))  # split lines
m2 = m1.filter(lambda x: 'userId' not in x)  # remove headers
m3 = m2.map(lambda x: (int(x[1]),
                       (float(x[2]), 1.0)))  # store key -> (value, count)
m4 = m3.reduceByKey(
    lambda x, y:
    (x[0] + y[0], x[1] + y[1]))  # reduce to key -> (total value, total count)

# final output
m5 = m4.map(lambda x: (x[0], (x[1][0] / x[1][1]))).sortByKey(
    ascending=True).collect()  # map x -> (x, avg_rating(x))

with open(outputfile, 'w', newline='') as csv_file:  # text mode for csv.writer under Python 3
    writer = csv.writer(csv_file)
    writer.writerow(["movieId", "rating_avg"])
    for key, value in m5:
        writer.writerow([key, value])
Example #7
from pyspark.context import SparkContext
from pyspark.serializers import MarshalSerializer
sc = SparkContext("local", "serialization app", serializer=MarshalSerializer())
print(sc.parallelize(list(range(1000))).map(lambda x: 2 * x).take(10))
sc.stop()
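
This minimal example shows the pattern used throughout: the context is created with `serializer=MarshalSerializer()`, so RDD data is shipped with `marshal` instead of pickle, which is faster but supports only built-in types. Below is a small driver-side sketch of that trade-off; it needs no cluster and only assumes the `dumps`/`loads` methods that `pyspark.serializers` exposes on these classes. `Point` is an illustrative stand-in for any user-defined object, not something from the examples above, and `CPickleSerializer` is the current name of what older releases call `PickleSerializer`.

from pyspark.serializers import CPickleSerializer, MarshalSerializer


class Point:  # illustrative user-defined type, not part of the original examples
    def __init__(self, x, y):
        self.x, self.y = x, y


marshal_ser = MarshalSerializer()
pickle_ser = CPickleSerializer()

# Built-in types round-trip through marshal without trouble.
data = [1, 2.5, "text", (1, 2), {"key": [3, 4]}]
assert marshal_ser.loads(marshal_ser.dumps(data)) == data

# A user-defined instance pickles fine but is rejected by marshal.
point = Point(1, 2)
print(pickle_ser.loads(pickle_ser.dumps(point)).x)  # -> 1
try:
    marshal_ser.dumps(point)
except ValueError as err:
    print("MarshalSerializer cannot serialize it:", err)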
Example #8
import sys

from pyspark import SparkConf, SparkContext
from pyspark.serializers import MarshalSerializer


def main(sc):
    file = sc.parallelize("a,b,c,c,d,c,f,t,t")
    print(file.glom().collect())
    file = file.repartition(3)
    print(file.toDebugString())
    # Demonstrate a serialization failure: marshal cannot handle function objects.
    my_object = main
    try:
        MarshalSerializer().dumps(my_object)
    except Exception:
        print("Unable to serialize the object", sys.exc_info()[0])
    print(file.getNumPartitions())
    rdd0 = file.flatMap(lambda x: x.split(","))
    rdd1 = rdd0.map(lambda w: (w, 1))
    rdd2 = rdd1.reduceByKey(lambda a, b: a + b)
    print(rdd2.toDebugString())
    print(rdd2.collect())
    d = rdd2.glom().collect()
    f = open("E:/b.txt", 'a')
    f.write(str(d) + '\n')


if __name__ == "__main__":
    conf = SparkConf().setAppName("wordcount")\
        .setMaster("local")
    sc = SparkContext(conf=conf, serializer=MarshalSerializer())
    main(sc)
Example #9
    def setup_spark_context(self):

        # http://spark.apache.org/docs/latest/configuration.html
        conf = SparkConf().setAll((
            ("spark.python.profile", "true" if self.args.profile else "false"),

            # Protect against memory leaks (which we seem to have at the moment)
            ("spark.python.worker.reuse",
             "true" if config["ENV"] in ("ci", ) else "false"),
            ("spark.ui.enabled",
             "false" if config["ENV"] in ("ci", ) else "true"),
            ("spark.task.maxFailures", "10"),
            ("spark.locality.wait", "10s"),
            ("spark.locality.wait.node", "10s"),
            ("spark.locality.wait.process", "10s"),
            ("spark.locality.wait.rack", "10s"),
            ("spark.sql.warehouse.dir", "/tmp/spark-warehouse"),

            # http://deploymentzone.com/2015/12/20/s3a-on-spark-on-aws-ec2/
            # https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html
            ("spark.hadoop.fs.s3a.impl",
             "org.apache.hadoop.fs.s3a.S3AFileSystem"),
            ("spark.hadoop.fs.s3a.access.key",
             os.getenv("AWS_ACCESS_KEY_ID", "")),
            ("spark.hadoop.fs.s3a.secret.key",
             os.getenv("AWS_SECRET_ACCESS_KEY", "")),
            ("spark.hadoop.fs.s3a.buffer.dir", "/tmp"),
            ("spark.hadoop.fs.s3a.connection.maximum", "100"),
            ("spark.hadoop.fs.s3a.endpoint",
             "s3-external-1.amazonaws.com"),  # us-east-1
            # ("spark.hadoop.fs.s3a.fast.upload", "true"),  # Buffer directly from memory to S3
            ("spark.sql.parquet.mergeSchema", "false"),
            ("spark.sql.parquet.cacheMetadata", "true"),
            ("spark.sql.parquet.compression.codec", "gzip"),  # snappy, lzo
            ("spark.hadoop.parquet.enable.summary-metadata", "false"),
            ("spark.hadoop.parquet.metadata.read.parallelism", "100"),
            ("spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version",
             "2"),
            ("spark.hadoop.mapreduce.fileoutputcommitter.marksuccessfuljobs",
             "true"),
            ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
            ("spark.speculation", "false"),

            # ("spark.sql.parquet.enableVectorizedReader", "false")

            # TODO https://groups.google.com/forum/#!topic/spark-users/YnAlw7dVdQA ?
            # set("spark.akka.frameSize", "128")
        ))

        executor_environment = {"IS_SPARK_EXECUTOR": "1"}
        if config["ENV"] == "prod":
            executor_environment.update({
                "PYTHONPATH": "/cosr/back",
                "PYSPARK_PYTHON": "/cosr/back/venv/bin/python",
                "LD_LIBRARY_PATH": "/usr/local/lib"
            })

        from pyspark.serializers import MarshalSerializer

        sc = SparkContext(appName=self.name,
                          conf=conf,
                          environment=executor_environment,
                          serializer=MarshalSerializer())

        sqlc = SQLContext(sc)

        if config["ENV"] != "prod":
            sc.parallelize(list(range(4)), 4).foreach(setup_spark_worker)

        return sc, sqlc