def __init__(self, loadDefaults=True, _jvm=None):
    super(SparkConf, self).__init__()
    from pyspark.context import SparkContext
    SparkContext._ensure_initialized()
    _jvm = _jvm or SparkContext._jvm
    self._jconf = _jvm.SparkConf(loadDefaults)
def _test():
    import doctest
    from array import array
    from pyspark.context import SparkContext
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    sc = SparkContext("local[4]", "PythonTest", batchSize=2)
    globs["sc"] = sc
    globs["sqlCtx"] = SQLContext(sc)
    globs["rdd"] = sc.parallelize(
        [{"field1": 1, "field2": "row1"},
         {"field1": 2, "field2": "row2"},
         {"field1": 3, "field2": "row3"}]
    )
    globs["nestedRdd1"] = sc.parallelize(
        [{"f1": array("i", [1, 2]), "f2": {"row1": 1.0}},
         {"f1": array("i", [2, 3]), "f2": {"row2": 2.0}}]
    )
    globs["nestedRdd2"] = sc.parallelize(
        [
            {"f1": [[1, 2], [2, 3]], "f2": set([1, 2]), "f3": (1, 2)},
            {"f1": [[2, 3], [3, 4]], "f2": set([2, 3]), "f3": (2, 3)},
        ]
    )
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    globs["sc"].stop()
    if failure_count:
        exit(-1)
def _test(): import doctest import os import tempfile import py4j from pyspark.context import SparkContext from pyspark.sql import SparkSession, Row import pyspark.sql.readwriter os.chdir(os.environ["SPARK_HOME"]) globs = pyspark.sql.readwriter.__dict__.copy() sc = SparkContext("local[4]", "PythonTest") try: spark = SparkSession.builder.enableHiveSupport().getOrCreate() except py4j.protocol.Py4JError: spark = SparkSession(sc) globs["tempfile"] = tempfile globs["os"] = os globs["sc"] = sc globs["spark"] = spark globs["df"] = spark.read.parquet("python/test_support/sql/parquet_partitioned") (failure_count, test_count) = doctest.testmod( pyspark.sql.readwriter, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF, ) sc.stop() if failure_count: exit(-1)
def _test(): import doctest from array import array from pyspark.context import SparkContext globs = globals().copy() # The small batch size here ensures that we see multiple batches, # even in these small test examples: sc = SparkContext('local[4]', 'PythonTest', batchSize=2) globs['sc'] = sc globs['sqlCtx'] = SQLContext(sc) globs['rdd'] = sc.parallelize([{"field1" : 1, "field2" : "row1"}, {"field1" : 2, "field2": "row2"}, {"field1" : 3, "field2": "row3"}]) jsonStrings = ['{"field1": 1, "field2": "row1", "field3":{"field4":11}}', '{"field1" : 2, "field2": "row2", "field3":{"field4":22}}', '{"field1" : 3, "field2": "row3", "field3":{"field4":33}}'] globs['jsonStrings'] = jsonStrings globs['json'] = sc.parallelize(jsonStrings) globs['nestedRdd1'] = sc.parallelize([ {"f1" : array('i', [1, 2]), "f2" : {"row1" : 1.0}}, {"f1" : array('i', [2, 3]), "f2" : {"row2" : 2.0}}]) globs['nestedRdd2'] = sc.parallelize([ {"f1" : [[1, 2], [2, 3]], "f2" : set([1, 2]), "f3" : (1, 2)}, {"f1" : [[2, 3], [3, 4]], "f2" : set([2, 3]), "f3" : (2, 3)}]) (failure_count, test_count) = doctest.testmod(globs=globs,optionflags=doctest.ELLIPSIS) globs['sc'].stop() if failure_count: exit(-1)
def _ensure_initialized(cls): SparkContext._ensure_initialized() gw = SparkContext._gateway java_import(gw.jvm, "org.apache.spark.streaming.*") java_import(gw.jvm, "org.apache.spark.streaming.api.java.*") java_import(gw.jvm, "org.apache.spark.streaming.api.python.*") # start callback server # getattr will fallback to JVM, so we cannot test by hasattr() if "_callback_server" not in gw.__dict__ or gw._callback_server is None: gw.callback_server_parameters.eager_load = True gw.callback_server_parameters.daemonize = True gw.callback_server_parameters.daemonize_connections = True gw.callback_server_parameters.port = 0 gw.start_callback_server(gw.callback_server_parameters) cbport = gw._callback_server.server_socket.getsockname()[1] gw._callback_server.port = cbport # gateway with real port gw._python_proxy_port = gw._callback_server.port # get the GatewayServer object in JVM by ID jgws = JavaObject("GATEWAY_SERVER", gw._gateway_client) # update the port of CallbackClient with real port jgws.resetCallbackClient(jgws.getCallbackClient().getAddress(), gw._python_proxy_port) # register serializer for TransformFunction # it happens before creating SparkContext when loading from checkpointing cls._transformerSerializer = TransformFunctionSerializer( SparkContext._active_spark_context, CloudPickleSerializer(), gw)
def _test():
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    from pyspark.sql.types import StructType, StructField, IntegerType, StringType
    import pyspark.sql.dataframe

    globs = pyspark.sql.dataframe.__dict__.copy()
    sc = SparkContext("local[4]", "PythonTest")
    globs["sc"] = sc
    globs["sqlContext"] = SQLContext(sc)
    globs["df"] = sc.parallelize([(2, "Alice"), (5, "Bob")]).toDF(
        StructType([StructField("age", IntegerType()), StructField("name", StringType())])
    )
    globs["df2"] = sc.parallelize([Row(name="Tom", height=80), Row(name="Bob", height=85)]).toDF()
    globs["df4"] = sc.parallelize(
        [
            Row(name="Alice", age=10, height=80),
            Row(name="Bob", age=5, height=None),
            Row(name="Tom", age=None, height=None),
            Row(name=None, age=None, height=None),
        ]
    ).toDF()
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.dataframe,
        globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF,
    )
    globs["sc"].stop()
    if failure_count:
        exit(-1)
def _test(): import doctest from pyspark.context import SparkContext from pyspark.sql import Row, SQLContext import pyspark.sql.readwriter globs = pyspark.sql.readwriter.__dict__.copy() sc = SparkContext('local[4]', 'PythonTest') globs['sc'] = sc globs['sqlContext'] = SQLContext(sc) globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \ .toDF(StructType([StructField('age', IntegerType()), StructField('name', StringType())])) jsonStrings = [ '{"field1": 1, "field2": "row1", "field3":{"field4":11}}', '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},' '"field6":[{"field7": "row2"}]}', '{"field1" : null, "field2": "row3", ' '"field3":{"field4":33, "field5": []}}' ] globs['jsonStrings'] = jsonStrings (failure_count, test_count) = doctest.testmod( pyspark.sql.readwriter, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF) globs['sc'].stop() if failure_count: exit(-1)
def _test(): import doctest import os import tempfile import py4j from pyspark.context import SparkContext from pyspark.sql import SparkSession, Row import pyspark.sql.readwriter os.chdir(os.environ["SPARK_HOME"]) globs = pyspark.sql.readwriter.__dict__.copy() sc = SparkContext('local[4]', 'PythonTest') try: spark = SparkSession.builder.enableHiveSupport().getOrCreate() except py4j.protocol.Py4JError: spark = SparkSession(sc) globs['tempfile'] = tempfile globs['os'] = os globs['sc'] = sc globs['spark'] = spark globs['df'] = spark.read.parquet('python/test_support/sql/parquet_partitioned') globs['sdf'] = \ spark.read.format('text').stream('python/test_support/sql/streaming') (failure_count, test_count) = doctest.testmod( pyspark.sql.readwriter, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF) sc.stop() if failure_count: exit(-1)
def _test(): import os import doctest from pyspark.context import SparkContext from pyspark.sql import Row, SQLContext import pyspark.sql.context os.chdir(os.environ["SPARK_HOME"]) globs = pyspark.sql.context.__dict__.copy() sc = SparkContext("local[4]", "PythonTest") globs["sc"] = sc globs["sqlContext"] = SQLContext(sc) globs["rdd"] = rdd = sc.parallelize( [Row(field1=1, field2="row1"), Row(field1=2, field2="row2"), Row(field1=3, field2="row3")] ) globs["df"] = rdd.toDF() jsonStrings = [ '{"field1": 1, "field2": "row1", "field3":{"field4":11}}', '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},' '"field6":[{"field7": "row2"}]}', '{"field1" : null, "field2": "row3", ' '"field3":{"field4":33, "field5": []}}', ] globs["jsonStrings"] = jsonStrings globs["json"] = sc.parallelize(jsonStrings) (failure_count, test_count) = doctest.testmod( pyspark.sql.context, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE ) globs["sc"].stop() if failure_count: exit(-1)
def _test(): import doctest from pyspark.context import SparkContext from pyspark.sql import Row, SQLContext import pyspark.sql.group globs = pyspark.sql.group.__dict__.copy() sc = SparkContext('local[4]', 'PythonTest') globs['sc'] = sc globs['sqlContext'] = SQLContext(sc) globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \ .toDF(StructType([StructField('age', IntegerType()), StructField('name', StringType())])) globs['df3'] = sc.parallelize([Row(name='Alice', age=2, height=80), Row(name='Bob', age=5, height=85)]).toDF() globs['df4'] = sc.parallelize([Row(course="dotNET", year=2012, earnings=10000), Row(course="Java", year=2012, earnings=20000), Row(course="dotNET", year=2012, earnings=5000), Row(course="dotNET", year=2013, earnings=48000), Row(course="Java", year=2013, earnings=30000)]).toDF() (failure_count, test_count) = doctest.testmod( pyspark.sql.group, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF) globs['sc'].stop() if failure_count: exit(-1)
def _test(): import doctest from pyspark.context import SparkContext from pyspark.sql import Row, SQLContext import pyspark.sql.dataframe globs = pyspark.sql.dataframe.__dict__.copy() sc = SparkContext('local[4]', 'PythonTest') globs['sc'] = sc globs['sqlContext'] = SQLContext(sc) globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')])\ .toDF(StructType([StructField('age', IntegerType()), StructField('name', StringType())])) globs['df2'] = sc.parallelize([Row(name='Tom', height=80), Row(name='Bob', height=85)]).toDF() globs['df3'] = sc.parallelize([Row(name='Alice', age=2), Row(name='Bob', age=5)]).toDF() globs['df4'] = sc.parallelize([Row(name='Alice', age=10, height=80), Row(name='Bob', age=5, height=None), Row(name='Tom', age=None, height=None), Row(name=None, age=None, height=None)]).toDF() (failure_count, test_count) = doctest.testmod( pyspark.sql.dataframe, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF) globs['sc'].stop() if failure_count: exit(-1)
class SparkTestingBaseTestCase(unittest2.TestCase):
    """Basic common test case for Spark. Provides a Spark context as sc.
    For non-local mode testing you can either override sparkMaster or set
    the environment property SPARK_MASTER."""

    @classmethod
    def getMaster(cls):
        return os.getenv('SPARK_MASTER', "local[4]")

    def setUp(self):
        """Set up a basic Spark context for testing."""
        self.sc = SparkContext(self.getMaster())
        self.sql_context = HiveContext(self.sc)
        quiet_py4j()

    def tearDown(self):
        """
        Tear down the basic panda spark test case. This stops the running
        context and does a hack to prevent Akka rebinding on the same port.
        """
        self.sc.stop()
        # To avoid Akka rebinding to the same port, since it doesn't unbind
        # immediately on shutdown
        self.sc._jvm.System.clearProperty("spark.driver.port")
def _test(): import doctest from pyspark.context import SparkContext from pyspark.sql import Row, SQLContext import pyspark.sql.context globs = pyspark.sql.context.__dict__.copy() sc = SparkContext('local[4]', 'PythonTest') globs['sc'] = sc globs['sqlContext'] = SQLContext(sc) globs['rdd'] = rdd = sc.parallelize( [Row(field1=1, field2="row1"), Row(field1=2, field2="row2"), Row(field1=3, field2="row3")] ) globs['df'] = rdd.toDF() jsonStrings = [ '{"field1": 1, "field2": "row1", "field3":{"field4":11}}', '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},' '"field6":[{"field7": "row2"}]}', '{"field1" : null, "field2": "row3", ' '"field3":{"field4":33, "field5": []}}' ] globs['jsonStrings'] = jsonStrings globs['json'] = sc.parallelize(jsonStrings) (failure_count, test_count) = doctest.testmod( pyspark.sql.context, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE) globs['sc'].stop() if failure_count: exit(-1)
class PyVertexRDDTestCase(unittest.TestCase): """ Test collect, take, count, mapValues, diff, filter, mapVertexPartitions, innerJoin and leftJoin for VertexRDD """ def setUp(self): class_name = self.__class__.__name__ conf = SparkConf().set("spark.default.parallelism", 1) self.sc = SparkContext(appName=class_name, conf=conf) self.sc.setCheckpointDir("/tmp") def tearDown(self): self.sc.stop() def collect(self): vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) vertices = VertexRDD(vertexData) results = vertices.take(1) self.assertEqual(results, [(3, ("rxin", "student"))]) def take(self): vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) vertices = VertexRDD(vertexData) results = vertices.collect() self.assertEqual(results, [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) def count(self): vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) vertices = VertexRDD(vertexData) results = vertices.count() self.assertEqual(results, 2) def mapValues(self): vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) vertices = VertexRDD(vertexData) results = vertices.mapValues(lambda x: x + ":" + x) self.assertEqual(results, [(3, ("rxin:rxin", "student:student")), (7, ("jgonzal:jgonzal", "postdoc:postdoc"))]) def innerJoin(self): vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))]) vertices0 = VertexRDD(vertexData0) vertices1 = VertexRDD(vertexData1) results = vertices0.innerJoin(vertices1).collect() self.assertEqual(results, []) def leftJoin(self): vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))]) vertices0 = VertexRDD(vertexData0) vertices1 = VertexRDD(vertexData1) results = vertices0.diff(vertices1) self.assertEqual(results, 2)
class PySparkTestCase(unittest.TestCase):

    def setUp(self):
        self._old_sys_path = list(sys.path)
        class_name = self.__class__.__name__
        self.sc = SparkContext('local[4]', class_name, batchSize=2)

    def tearDown(self):
        self.sc.stop()
        sys.path = self._old_sys_path
class PySparkTestCase(unittest.TestCase):

    def setUp(self):
        self._old_sys_path = list(sys.path)
        class_name = self.__class__.__name__
        self.sc = SparkContext("local[4]", class_name, batchSize=2)

    def tearDown(self):
        self.sc.stop()
        sys.path = self._old_sys_path
        # To avoid Akka rebinding to the same port, since it doesn't unbind
        # immediately on shutdown
        self.sc._jvm.System.clearProperty("spark.driver.port")
def __init__(self, loadDefaults=True, _jvm=None):
    """
    Create a new Spark configuration.

    @param loadDefaults: whether to load values from Java system
           properties (True by default)
    @param _jvm: internal parameter used to pass a handle to the
           Java VM; does not need to be set by users
    """
    from pyspark.context import SparkContext
    SparkContext._ensure_initialized()
    _jvm = _jvm or SparkContext._jvm
    self._jconf = _jvm.SparkConf(loadDefaults)
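# A minimal usage sketch for the constructor above, assuming only the public
# pyspark.SparkConf / SparkContext API; the master and app name values below
# are illustrative, not taken from the surrounding code.
from pyspark import SparkConf, SparkContext

conf = SparkConf(loadDefaults=True)  # also picks up Java system properties
conf = conf.setMaster("local[2]").setAppName("conf-demo")
sc = SparkContext(conf=conf)
print(sc.getConf().get("spark.app.name"))  # -> "conf-demo"
sc.stop()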
def __init__(self, millis, _jvm=None):
    """
    Create a new Duration.

    @param millis: duration in milliseconds
    """
    self._millis = millis

    from pyspark.context import SparkContext
    SparkContext._ensure_initialized()
    _jvm = _jvm or SparkContext._jvm
    self._jduration = _jvm.Duration(millis)
class PySparkTestCase(unittest.TestCase):

    def setUp(self):
        class_name = self.__class__.__name__
        self.sc = SparkContext('local', class_name)
        self.sc._jvm.System.setProperty("spark.ui.showConsoleProgress", "false")
        log4j = self.sc._jvm.org.apache.log4j
        log4j.LogManager.getRootLogger().setLevel(log4j.Level.FATAL)

    def tearDown(self):
        self.sc.stop()
        # To avoid Akka rebinding to the same port, since it doesn't unbind
        # immediately on shutdown
        self.sc._jvm.System.clearProperty("spark.driver.port")
class PySparkTestCase(unittest.TestCase):

    def setUp(self):
        class_name = self.__class__.__name__
        self.sc = SparkContext('local', class_name)

    def tearDown(self):
        self.sc.stop()

    def test_should_be_able_to_word_count(self):
        rdd = self.sc.parallelize(["This is a text", "Another text", "More text", "a text"])
        result = python_word_count.wordcount(rdd)
        expected = [('a', 2), ('This', 1), ('text', 4), ('is', 1), ('Another', 1), ('More', 1)]
        # Compare sorted lists so the test does not depend on partition ordering.
        self.assertEqual(sorted(expected), sorted(result.collect()))
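# The module under test is not shown; this is a hedged sketch of what a
# `python_word_count.wordcount(rdd)` helper compatible with the test above
# might look like. The function name comes from the test; everything else is
# an assumption.
def wordcount(rdd):
    """Return an RDD of (word, count) pairs for an RDD of text lines."""
    return (rdd.flatMap(lambda line: line.split(" "))
               .map(lambda word: (word, 1))
               .reduceByKey(lambda a, b: a + b))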
def _ensure_initialized(cls): SparkContext._ensure_initialized() gw = SparkContext._gateway java_import(gw.jvm, "org.apache.spark.streaming.*") java_import(gw.jvm, "org.apache.spark.streaming.api.java.*") java_import(gw.jvm, "org.apache.spark.streaming.api.python.*") # start callback server # getattr will fallback to JVM, so we cannot test by hasattr() if "_callback_server" not in gw.__dict__ or gw._callback_server is None: gw.callback_server_parameters.eager_load = True gw.callback_server_parameters.daemonize = True gw.callback_server_parameters.daemonize_connections = True gw.callback_server_parameters.port = 0 gw.start_callback_server(gw.callback_server_parameters) cbport = gw._callback_server.server_socket.getsockname()[1] gw._callback_server.port = cbport # gateway with real port gw._python_proxy_port = gw._callback_server.port # get the GatewayServer object in JVM by ID jgws = JavaObject("GATEWAY_SERVER", gw._gateway_client) # update the port of CallbackClient with real port gw.jvm.PythonDStream.updatePythonGatewayPort(jgws, gw._python_proxy_port) _py4j_cleaner = Py4jCallbackConnectionCleaner(gw) _py4j_cleaner.start() # register serializer for TransformFunction # it happens before creating SparkContext when loading from checkpointing if cls._transformerSerializer is None: transformer_serializer = TransformFunctionSerializer() transformer_serializer.init( SparkContext._active_spark_context, CloudPickleSerializer(), gw) # SPARK-12511 streaming driver with checkpointing unable to finalize leading to OOM # There is an issue that Py4J's PythonProxyHandler.finalize blocks forever. # (https://github.com/bartdag/py4j/pull/184) # # Py4j will create a PythonProxyHandler in Java for "transformer_serializer" when # calling "registerSerializer". If we call "registerSerializer" twice, the second # PythonProxyHandler will override the first one, then the first one will be GCed and # trigger "PythonProxyHandler.finalize". To avoid that, we should not call # "registerSerializer" more than once, so that "PythonProxyHandler" in Java side won't # be GCed. # # TODO Once Py4J fixes this issue, we should upgrade Py4j to the latest version. transformer_serializer.gateway.jvm.PythonDStream.registerSerializer( transformer_serializer) cls._transformerSerializer = transformer_serializer else: cls._transformerSerializer.init( SparkContext._active_spark_context, CloudPickleSerializer(), gw)
class PyEdgeRDDTestCase(unittest.TestCase): """ Test collect, take, count, mapValues, filter and innerJoin for EdgeRDD """ def setUp(self): class_name = self.__class__.__name__ conf = SparkConf().set("spark.default.parallelism", 1) self.sc = SparkContext(appName=class_name, conf=conf) self.sc.setCheckpointDir("/tmp") def tearDown(self): self.sc.stop() # TODO def collect(self): vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) vertices = VertexRDD(vertexData) results = vertices.collect() self.assertEqual(results, [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) # TODO def take(self): vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) vertices = VertexRDD(vertexData) results = vertices.collect() self.assertEqual(results, [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) # TODO def count(self): vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) vertices = VertexRDD(vertexData) results = vertices.collect() self.assertEqual(results, 2) # TODO def mapValues(self): vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) vertices = VertexRDD(vertexData) results = vertices.collect() self.assertEqual(results, 2) # TODO def filter(self): return # TODO def innerJoin(self): vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))]) vertices0 = VertexRDD(vertexData0) vertices1 = VertexRDD(vertexData1) results = vertices0.diff(vertices1) self.assertEqual(results, 2)
def _test():
    import doctest
    from pyspark.context import SparkContext
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    sc = SparkContext('local[4]', 'PythonTest', batchSize=2)
    globs['sc'] = sc
    globs['sqlCtx'] = SQLContext(sc)
    globs['rdd'] = sc.parallelize([{"field1": 1, "field2": "row1"},
                                   {"field1": 2, "field2": "row2"},
                                   {"field1": 3, "field2": "row3"}])
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
def _ensure_initialized(cls):
    SparkContext._ensure_initialized()
    gw = SparkContext._gateway

    java_import(gw.jvm, "org.apache.spark.streaming.*")
    java_import(gw.jvm, "org.apache.spark.streaming.api.java.*")
    java_import(gw.jvm, "org.apache.spark.streaming.api.python.*")

    from pyspark.java_gateway import ensure_callback_server_started
    ensure_callback_server_started(gw)

    # register serializer for TransformFunction
    # it happens before creating SparkContext when loading from checkpointing
    cls._transformerSerializer = TransformFunctionSerializer(
        SparkContext._active_spark_context, CloudPickleSerializer(), gw)
class SparkTestingBaseTestCase(unittest2.TestCase):
    """Basic common test case for Spark. Provides a Spark context as sc.
    For non-local mode testing you can either override sparkMaster or set
    the environment property SPARK_MASTER."""

    @classmethod
    def getMaster(cls):
        return os.getenv('SPARK_MASTER', "local[4]")

    def setUp(self):
        """Set up a basic Spark context for testing."""
        self.sc = SparkContext(self.getMaster())
        quiet_py4j()

    def tearDown(self):
        """
        Tear down the basic panda spark test case. This stops the running
        context and does a hack to prevent Akka rebinding on the same port.
        """
        self.sc.stop()
        # To avoid Akka rebinding to the same port, since it doesn't unbind
        # immediately on shutdown
        self.sc._jvm.System.clearProperty("spark.driver.port")

    def assertRDDEquals(self, expected, result):
        return self.compareRDD(expected, result) == []

    def compareRDD(self, expected, result):
        expectedKeyed = expected.map(lambda x: (x, 1)) \
            .reduceByKey(lambda x, y: x + y)
        resultKeyed = result.map(lambda x: (x, 1)) \
            .reduceByKey(lambda x, y: x + y)
        return expectedKeyed.cogroup(resultKeyed) \
            .map(lambda x: tuple(map(list, x[1]))) \
            .filter(lambda x: x[0] != x[1]).take(1)

    def assertRDDEqualsWithOrder(self, expected, result):
        return self.compareRDDWithOrder(expected, result) == []

    def compareRDDWithOrder(self, expected, result):
        def indexRDD(rdd):
            return rdd.zipWithIndex().map(lambda x: (x[1], x[0]))
        indexExpected = indexRDD(expected)
        indexResult = indexRDD(result)
        return indexExpected.cogroup(indexResult) \
            .map(lambda x: tuple(map(list, x[1]))) \
            .filter(lambda x: x[0] != x[1]).take(1)
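# A hedged usage sketch for the base class above: a test that compares an
# expected RDD with a transformation result. `SparkTestingBaseTestCase`,
# `assertRDDEquals` and `self.sc` come from the class above; the test body
# itself is illustrative only.
class WordLengthTest(SparkTestingBaseTestCase):

    def test_lengths(self):
        source = self.sc.parallelize(["spark", "testing", "base"])
        expected = self.sc.parallelize([5, 7, 4])
        # Order-independent comparison via the helper defined in the base class.
        self.assertTrue(self.assertRDDEquals(expected, source.map(len)))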
def getOrCreate(self): """Gets an existing :class:`SparkSession` or, if there is no existing one, creates a new one based on the options set in this builder. This method first checks whether there is a valid global default SparkSession, and if yes, return that one. If no valid global default SparkSession exists, the method creates a new SparkSession and assigns the newly created SparkSession as the global default. >>> s1 = SparkSession.builder.config("k1", "v1").getOrCreate() >>> s1.conf.get("k1") == "v1" True In case an existing SparkSession is returned, the config options specified in this builder will be applied to the existing SparkSession. >>> s2 = SparkSession.builder.config("k2", "v2").getOrCreate() >>> s1.conf.get("k1") == s2.conf.get("k1") True >>> s1.conf.get("k2") == s2.conf.get("k2") True """ with self._lock: from pyspark.context import SparkContext from pyspark.conf import SparkConf session = SparkSession._instantiatedContext if session is None: sparkConf = SparkConf() for key, value in self._options.items(): sparkConf.set(key, value) sc = SparkContext.getOrCreate(sparkConf) session = SparkSession(sc) for key, value in self._options.items(): session.conf.set(key, value) return session
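# Hedged sketch of the behaviour described in the docstring above: a second
# builder call returns the same global default session and applies its new
# options to it. The option keys are illustrative, not part of the original.
from pyspark.sql import SparkSession

spark_a = SparkSession.builder.config("k3", "v3").getOrCreate()
spark_b = SparkSession.builder.config("k4", "v4").getOrCreate()
assert spark_a is spark_b              # same global default session
assert spark_a.conf.get("k4") == "v4"  # later options applied to it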
def test_stop_only_streaming_context(self):
    self.sc = SparkContext(master=self.master, appName=self.appName)
    self.ssc = StreamingContext(sparkContext=self.sc, duration=self.batachDuration)
    self._addInputStream(self.ssc)
    self.ssc.start()
    self.ssc.stop(False)
    self.assertEqual(len(self.sc.parallelize(range(5), 5).glom().collect()), 5)
def __init__(self): # Setup PySpark. This is needed until PySpark becomes available on PyPI, # after which we can simply add it to requirements.txt. _setup_pyspark() from pyspark.conf import SparkConf from pyspark.context import SparkContext from pyspark.serializers import MarshalSerializer # Create a temporary .zip lib file for Metis, which will be copied over to # Spark workers so they can unpickle Metis functions and objects. metis_lib_file = tempfile.NamedTemporaryFile(suffix='.zip', delete=False) metis_lib_file.close() _copy_lib_for_spark_workers(metis_lib_file.name) # Also ship the Metis lib file so worker nodes can deserialize Metis # internal data structures. conf = SparkConf() conf.setMaster(app.config['SPARK_MASTER']) conf.setAppName('chronology:metis') parallelism = int(app.config.get('SPARK_PARALLELISM', 0)) if parallelism: conf.set('spark.default.parallelism', parallelism) self.context = SparkContext(conf=conf, pyFiles=[metis_lib_file.name], serializer=MarshalSerializer()) # Delete temporary Metis lib file. os.unlink(metis_lib_file.name) # We'll use this to parallelize fetching events in KronosSource. # The default of 8 is from: # https://spark.apache.org/docs/latest/configuration.html self.parallelism = parallelism or 8
def setUp(self):
    class_name = self.__class__.__name__
    conf = SparkConf().set("spark.default.parallelism", 1)
    self.sc = SparkContext(appName=class_name, conf=conf)
    self.sc.setCheckpointDir("/tmp")
    # TODO: decrease duration to speed up tests
    self.ssc = StreamingContext(self.sc, self.duration)
def _test():
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    import pyspark.sql.dataframe

    globs = pyspark.sql.dataframe.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['sqlCtx'] = SQLContext(sc)
    globs['df'] = sc.parallelize([Row(name='Alice', age=2),
                                  Row(name='Bob', age=5)]).toDF()
    globs['df2'] = sc.parallelize([Row(name='Tom', height=80),
                                   Row(name='Bob', height=85)]).toDF()
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.dataframe, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
        # try again if port unavailable
        if check == notfound:
            port += 1
    # return the first available port
    return port

# this is the deprecated equivalent of ADD_JARS
add_files = None
if os.environ.get("ADD_FILES") is not None:
    add_files = os.environ.get("ADD_FILES").split(',')

if os.environ.get("SPARK_EXECUTOR_URI"):
    SparkContext.setSystemProperty("spark.executor.uri", os.environ["SPARK_EXECUTOR_URI"])

# setup mesos-based connection
conf = (SparkConf().setMaster(os.environ["SPARK_MASTER"]))

# set the UI port
conf.set("spark.ui.port", ui_get_available_port())

# configure docker containers as executors
conf.setSparkHome(os.environ.get("SPARK_HOME"))
conf.set("spark.mesos.executor.docker.image", "lab41/spark-mesos-dockerworker-ipython")
conf.set("spark.mesos.executor.home", "/usr/local/spark-1.4.1-bin-hadoop2.4")
conf.set("spark.executorEnv.MESOS_NATIVE_LIBRARY", "/usr/local/lib/libmesos.so")
conf.set("spark.network.timeout", "100")
parser = argparse.ArgumentParser() parser.add_argument('--JOB_DATE', dest='JOB_DATE') parser.add_argument('--S3_BUCKET', dest='S3_BUCKET') parser.add_argument('--REGION', dest='REGION') args = parser.parse_args() print(args) JOB_DATE = args.JOB_DATE S3_BUCKET = args.S3_BUCKET REGION = args.REGION READ_PATH = 'data/' + JOB_DATE S3_READ_PATH = 's3://' + S3_BUCKET + '/' + READ_PATH WRITE_PATH = 'curated/' + JOB_DATE S3_WRITE_PATH = 's3://' + S3_BUCKET + '/' + WRITE_PATH sc = SparkContext.getOrCreate() spark = SparkSession(sc) def does_s3key_exist(bucket, key, ext): s3 = boto3.resource('s3') bucket = s3.Bucket(bucket) objects = bucket.objects.all() FOUND = 0 for object in objects: if object.key.startswith(key) and object.key.endswith(ext): FOUND = 1 return FOUND if does_s3key_exist(S3_BUCKET, READ_PATH, '.csv') == 1:
import pyspark from pyspark.context import SparkContext from pyspark import SparkConf conf = SparkConf().setMaster("local") sc = SparkContext(conf=conf) sc.setLogLevel("ERROR") # Load the adjacency list file AdjList1 = sc.textFile("02AdjacencyList.txt") print AdjList1.collect() AdjList2 = AdjList1.map(lambda line: line.split(" ")) print AdjList2.collect() AdjList3 = AdjList2.map(lambda x: (int(x[0]), [int(y) for y in x[1:]])) AdjList3.persist() print AdjList3.collect() nNumOfNodes = AdjList3.count() print "Total Number of nodes" print nNumOfNodes # Initialize each page's rank; since we use mapValues, the resulting RDD will have the same partitioner as links print "Initialization" PageRankValues = AdjList3.mapValues(lambda v: 1 / float(nNumOfNodes)) print PageRankValues.collect() # Run 30 iterations print "Run 30 Iterations" for i in range(1, 30): print "Number of Iterations"
def main(): sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session spark.conf.set("spark.sql.session.timeZone", "GMT+07:00") # get dynamic frame source dyf_crm_contacts = glueContext.create_dynamic_frame.from_catalog( database='crm_native', table_name='contacts') print('dyf_crm_contacts::schema') dyf_crm_contacts.printSchema() dyf_crm_contacts = dyf_crm_contacts.resolveChoice(specs=[('_key', 'cast:long')]) try: df_flag = spark.read.parquet( "s3a://dts-odin/flag/student_status/user_profile/communication/email.parquet" ) read_from_index = df_flag.collect()[0]['flag'] print('read from index: ', read_from_index) dyf_crm_contacts = Filter.apply( frame=dyf_crm_contacts, f=lambda x: x["_key"] > read_from_index) except: print('read flag file error ') dyf_crm_contacts = dyf_crm_contacts.select_fields( ['_key', 'Id', 'Email', 'Email2']) dy_source_voxy_cache = dyf_crm_contacts.toDF() dy_source_voxy_cache = dy_source_voxy_cache.cache() dyf_crm_contacts = DynamicFrame.fromDF(dy_source_voxy_cache, glueContext, 'dyf_crm_contacts') today = date.today() d4 = today.strftime("%Y-%m-%d") print("d4 =", d4) print('the number of new contacts: ', dyf_crm_contacts.count()) if (dyf_crm_contacts.count() > 0): # print('Chay vao day nhe------------------') # print('dyf_crm_contacts::----------------') # dyf_crm_contacts.printSchema() # try: #--------------------------------------------------------------------------------------------------------------# dyf_crm_contacts = Filter.apply( frame=dyf_crm_contacts, f=lambda x: x["Id"] is not None and x["Id"] != '' and x[ "Email"] is not None and x["Email"] != '') # --------------------------------------------------------------------------------------------------------------# # --------------------------------------------------------------------------------------------------------------# dy_crm_contacts = dyf_crm_contacts.toDF() dy_crm_contacts = dy_crm_contacts.withColumn('communication_type', f.lit(2)) dy_crm_contacts = dy_crm_contacts.withColumn('is_primary', f.lit(0)) dy_crm_contacts = dy_crm_contacts.withColumn('is_deleted', f.lit(0)) dy_crm_contacts = dy_crm_contacts.withColumn('last_update_date', f.lit(d4)) dyf_crm_contacts = DynamicFrame.fromDF(dy_crm_contacts, glueContext, 'dyf_crm_contacts') dyf_crm_contacts = dyf_crm_contacts.resolveChoice( specs=[('last_update_date', 'cast:string')]) applymapping2 = ApplyMapping.apply( frame=dyf_crm_contacts, mappings=[ ("Id", "int", "user_id", "bigint"), ("communication_type", 'int', 'communication_type', 'int'), ("is_primary", 'int', 'is_primary', 'int'), ("is_deleted", 'int', 'is_deleted', 'int'), ("Email", 'string', 'comunication', 'string'), ("last_update_date", 'string', 'last_update_date', 'timestamp') ]) # # resolvechoice2 = ResolveChoice.apply( frame=applymapping2, choice="make_cols", transformation_ctx="resolvechoice2") dropnullfields6 = DropNullFields.apply( frame=resolvechoice2, transformation_ctx="dropnullfields2") print('dropnullfields6::schema') dropnullfields6.printSchema() dropnullfields6.show(5) datasink1 = glueContext.write_dynamic_frame.from_jdbc_conf( frame=dropnullfields6, catalog_connection="glue_redshift", connection_options={ "dbtable": "user_communication", "database": "dts_odin" }, redshift_tmp_dir="s3n://dts-odin/temp/user/communication/fullname/", transformation_ctx="datasink4") dyf_crm_contacts = Filter.apply( frame=dyf_crm_contacts, f=lambda x: x["Email2"] is not None and x["Email2"] != '') applymapping2 = ApplyMapping.apply( frame=dyf_crm_contacts, mappings=[ ("Id", 
"int", "user_id", "bigint"), ("communication_type", 'int', 'communication_type', 'int'), ("is_primary", 'int', 'is_primary', 'int'), ("is_deleted", 'int', 'is_deleted', 'int'), ("Email2", 'string', 'comunication', 'string'), ("last_update_date", 'string', 'last_update_date', 'timestamp') ]) # # resolvechoice2 = ResolveChoice.apply( frame=applymapping2, choice="make_cols", transformation_ctx="resolvechoice2") dropnullfields6 = DropNullFields.apply( frame=resolvechoice2, transformation_ctx="dropnullfields2") datasink1 = glueContext.write_dynamic_frame.from_jdbc_conf( frame=dropnullfields6, catalog_connection="glue_redshift", connection_options={ "dbtable": "user_communication", "database": "dts_odin" }, redshift_tmp_dir="s3n://dts-odin/temp/user/communication/fullname/", transformation_ctx="datasink4") df_datasource = dyf_crm_contacts.toDF() flag = df_datasource.agg({"_key": "max"}).collect()[0][0] flag_data = [flag] df = spark.createDataFrame(flag_data, "long").toDF('flag') df.write.parquet( "s3a://dts-odin/flag/student_status/user_profile/communication/email.parquet", mode="overwrite")
#!/usr/bin/python # -*- coding: utf-8 -*- from datetime import datetime import sys reload(sys) sys.setdefaultencoding('utf-8') import pyspark from pyspark.context import SparkContext from pyspark.sql.session import SparkSession from pyspark.sql import Row sc = SparkContext('local') spark = SparkSession(sc) bucket = spark._jsc.hadoopConfiguration().get("fs.gs.system.bucket") project = spark._jsc.hadoopConfiguration().get("fs.gs.project.id") todays_date = datetime.strftime(datetime.today(), "%Y-%m-%d-%H-%M-%S") accum = sc.accumulator(0) print "begin to map input" train_set = sc.textFile("gs://dataproc-0e3e0110-db09-4037-98cc-dc355651aba0-asia-southeast1/tensorflow/data/picfeed/train_set_combine").map(lambda row: row.split("\t")).map(lambda p: Row(uid=p[0], urlid=p[1], ts=p[2], label=p[3])) combine_uinfo = sc.textFile("gs://dataproc-0e3e0110-db09-4037-98cc-dc355651aba0-asia-southeast1/tensorflow/data/picfeed/data_files_combine_toterm_new/part-00000").map(lambda row: row.split("\t", 1)) print "finish to map input" def process_uinfo(line): if len(line) != 2: return Row(urlid=line, urlinfo="") return Row(urlid=line[0], urlinfo=line[1])
#!/usr/bin/env python
# coding: utf-8

# In[ ]:

from pyspark.context import SparkContext
from pyspark.conf import SparkConf

sc = SparkContext.getOrCreate(SparkConf())

import re

text_file = sc.textFile("./README.txt")
counts = text_file.flatMap(lambda line: line.split(" ")) \
    .map(lambda word: (re.sub("[^a-zA-Z\\d]", "", word), 1)) \
    .reduceByKey(lambda a, b: a + b)
counts.saveAsTextFile("./output_python")
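# Hedged follow-up to the word-count script above: peeking at the most frequent
# words without writing to disk. `counts` is the RDD built above; takeOrdered
# is a standard RDD action, and the limit of 10 is an arbitrary choice.
top_words = counts.takeOrdered(10, key=lambda pair: -pair[1])
for word, count in top_words:
    print(word, count)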
from awsglue.dynamicframe import DynamicFrame
from awsglue.utils import getResolvedOptions
from awsglue.transforms import *
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
import sys

args = getResolvedOptions(sys.argv, ['TempDir', 'JOB_NAME'])

conf = SparkConf()
conf.set("spark.sql.parquet.compression.codec", "snappy")
conf.set("spark.sql.parquet.writeLegacyFormat", "true")

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

input_file_path = "s3://troy-dwh-external/user_behavior/2016_funnel.csv"

df = spark.read.option("header", "true") \
    .option("inferSchema", "true") \
    .option("quote", "\"") \
    .option("escape", "\"").csv(input_file_path)
class ContextWrapper(object): def __init__(self): pass def set_context(self, java_gateway): spark_context_wrapper = java_gateway.entry_point.sparkContextWrapper() j_spark_conf = spark_context_wrapper.sparkConf() p_spark_conf = SparkConf(_jvm=java_gateway.jvm, _jconf=j_spark_conf) j_spark_context = spark_context_wrapper.javaContext() self._context = SparkContext(jsc=j_spark_context, gateway=java_gateway, conf=p_spark_conf) def set_sql_context(self, java_gateway): from pyspark.sql import SQLContext spark_context_wrapper = java_gateway.entry_point.sparkContextWrapper() self._sql_context = SQLContext( self._context, sparkSession=spark_context_wrapper.sparkSession(False), jsqlContext=spark_context_wrapper.sqlContext()) def set_hive_context(self, java_gateway): from pyspark.sql import HiveContext spark_context_wrapper = java_gateway.entry_point.sparkContextWrapper() self._hive_context = HiveContext(self._context, spark_context_wrapper.hiveContext()) def set_session(self, java_gateway): from pyspark.sql import SparkSession self._session = SparkSession.builder.config( conf=self._context.getConf()).getOrCreate() def set_hive_session(self, java_gateway): from pyspark.sql import SparkSession self._session = SparkSession.builder.config( conf=self._context.getConf()).enableHiveSupport().getOrCreate() def set_streaming_context(self, java_gateway): from pyspark.streaming import StreamingContext spark_context_wrapper = java_gateway.entry_point.sparkContextWrapper() self._streaming_context = StreamingContext( self._context, java_gateway.entry_point.sparkStreamingWrapper(). getDurationSeconds()) @property def context(self): return self._context @property def sql_context(self): return self._sql_context @property def hive_context(self): return self._hive_context @property def session(self): return self._session @property def streaming_context(self): return self._streaming_context
from pyspark.sql import DataFrame from py4j.java_collections import MapConverter if isinstance(df, DataFrame): intp.saveDFToCsv( df._jdf, path, hasheader, isOverwrite, MapConverter().convert(option, gateway._gateway_client)) else: print(str(df)) java_import(gateway.jvm, "scala.Tuple2") jsc = intp.getJavaSparkContext() jconf = intp.getSparkConf() conf = SparkConf(_jvm=gateway.jvm, _jconf=jconf) sc = SparkContext(jsc=jsc, gateway=gateway, conf=conf) sqlc = HiveContext(sc, intp.sqlContext()) sqlContext = sqlc spark = SparkSession(sc, intp.getSparkSession()) ##add pyfiles try: pyfile = sys.argv[4] pyfiles = pyfile.split(',') for i in range(len(pyfiles)): if "" != pyfiles[i]: sc.addPyFile(pyfiles[i]) except Exception as e: print("add pyfile error: " + pyfile)
from inception.imagenet_data import ImagenetData print("argv:", argv) sys.argv = argv FLAGS = tf.app.flags.FLAGS FLAGS._parse_flags() print("FLAGS:", FLAGS.__dict__['__flags']) dataset = ImagenetData(subset=FLAGS.subset) assert dataset.data_files() if tf.gfile.Exists(FLAGS.eval_dir): tf.gfile.DeleteRecursively(FLAGS.eval_dir) tf.gfile.MakeDirs(FLAGS.eval_dir) cluster_spec, server = TFNode.start_cluster_server(ctx) inception_eval.evaluate(dataset) if __name__ == '__main__': sc = SparkContext(conf=SparkConf().setAppName("grid_imagenet_eval")) num_executors = int(sc._conf.get("spark.executor.instances")) num_ps = 0 #cluster = TFCluster.reserve(sc, num_executors, num_ps, False, TFCluster.InputMode.TENSORFLOW) #cluster.start(main_fun, sys.argv) cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps, False, TFCluster.InputMode.TENSORFLOW) cluster.shutdown()
def test_failed_sparkcontext_creation(self):
    # Regression test for SPARK-1550
    self.sc.stop()
    self.assertRaises(Exception, lambda: SparkContext("an-invalid-master-name"))
    self.sc = SparkContext("local")
class PySparkStreamingTestCase(unittest.TestCase): timeout = 20 # seconds duration = 1 def setUp(self): class_name = self.__class__.__name__ conf = SparkConf().set("spark.default.parallelism", 1) self.sc = SparkContext(appName=class_name, conf=conf) self.sc.setCheckpointDir("/tmp") # TODO: decrease duration to speed up tests self.ssc = StreamingContext(self.sc, self.duration) def tearDown(self): self.ssc.stop() def wait_for(self, result, n): start_time = time.time() while len(result) < n and time.time() - start_time < self.timeout: time.sleep(0.01) if len(result) < n: print "timeout after", self.timeout def _take(self, dstream, n): """ Return the first `n` elements in the stream (will start and stop). """ results = [] def take(_, rdd): if rdd and len(results) < n: results.extend(rdd.take(n - len(results))) dstream.foreachRDD(take) self.ssc.start() self.wait_for(results, n) return results def _collect(self, dstream, n, block=True): """ Collect each RDDs into the returned list. :return: list, which will have the collected items. """ result = [] def get_output(_, rdd): if rdd and len(result) < n: r = rdd.collect() if r: result.append(r) dstream.foreachRDD(get_output) if not block: return result self.ssc.start() self.wait_for(result, n) return result def _test_func(self, input, func, expected, sort=False, input2=None): """ @param input: dataset for the test. This should be list of lists. @param func: wrapped function. This function should return PythonDStream object. @param expected: expected output for this testcase. """ if not isinstance(input[0], RDD): input = [self.sc.parallelize(d, 1) for d in input] input_stream = self.ssc.queueStream(input) if input2 and not isinstance(input2[0], RDD): input2 = [self.sc.parallelize(d, 1) for d in input2] input_stream2 = self.ssc.queueStream( input2) if input2 is not None else None # Apply test function to stream. if input2: stream = func(input_stream, input_stream2) else: stream = func(input_stream) result = self._collect(stream, len(expected)) if sort: self._sort_result_based_on_key(result) self._sort_result_based_on_key(expected) self.assertEqual(expected, result) def _sort_result_based_on_key(self, outputs): """Sort the list based on first value.""" for output in outputs: output.sort(key=lambda x: x[0])
app_name = "Log2Graph" delimiter = "\t" input_file_name = "/Users/woodie/Downloads/sfexpress_rawdata_first2500k.txt" node_info_fields = [ "id", "main_business", "oversea", "industry_lv1", "industry_lv2", "industry_lv3", "area_code", "area_desc", "area_city", "coop_month" ] transc_info_fields = ["transc_id", "ship_timestamp", "deliver_timestamp"] item_info_fields = ["item_info"] src_node_fields = ["src_" + field for field in node_info_fields] trg_node_fields = ["trg_" + field for field in node_info_fields] # Init Spark Context as running in local mode sc = SparkContext("local") # Create a basic Spark Session spark = SparkSession \ .builder \ .appName(app_name) \ .getOrCreate() # Specify properties of fields, # including field name and related data type log_fields = src_node_fields + transc_info_fields + trg_node_fields + item_info_fields # ------------------------------------------ # Pipeline of the Workflow # Load rawdata from local file system # And split each row by specific delimiter source = sc.textFile(input_file_name) \
        `features`
        """
        return self._call_java("userFactors")

    @property
    @since("1.4.0")
    def itemFactors(self):
        """
        a DataFrame that stores item factors in two columns: `id` and
        `features`
        """
        return self._call_java("itemFactors")


if __name__ == "__main__":
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import SQLContext
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    sc = SparkContext("local[2]", "ml.recommendation tests")
    sqlContext = SQLContext(sc)
    globs['sc'] = sc
    globs['sqlContext'] = sqlContext
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    sc.stop()
    if failure_count:
        exit(-1)
# skip comment if s.strip().startswith("#"): continue if final_code: final_code += "\n" + s else: final_code = s if sc is None: jsc = kernel.javaSparkContext() if jsc != None: jconf = kernel.sparkConf() conf = SparkConf(_jvm=gateway.jvm, _jconf=jconf) sc = SparkContext(jsc=jsc, gateway=gateway, conf=conf) if final_code: compiled_code = compile(final_code, "<string>", "exec") #sc.setJobGroup(jobGroup, "Spark Kernel") eval(compiled_code) state.markSuccess(code_info.codeId(), output.get()) except Py4JJavaError: excInnerError = traceback.format_exc( ) # format_tb() does not return the inner exception innerErrorStart = excInnerError.find("Py4JJavaError:") if innerErrorStart > -1: excInnerError = excInnerError[innerErrorStart:] state.markFailure(code_info.codeId(), excInnerError + str(sys.exc_info()))
#!/usr/bin/python # -*- coding: utf-8 -*- from datetime import datetime import sys reload(sys) sys.setdefaultencoding('utf-8') import pyspark from pyspark.context import SparkContext from pyspark.sql.session import SparkSession from pyspark.sql import Row from pyspark.sql.types import * from pyspark.sql.functions import array sc = SparkContext('local') spark = SparkSession(sc) print "begin to map input" fieldSchema = StructType([ StructField("label", IntegerType(), True), StructField("pdef", DoubleType(), True), StructField("pbeau", DoubleType(), True), StructField("pnum", IntegerType(), True), StructField("s_term", StringType(), True), StructField("sumclick", LongType(), True), StructField("sumshow", LongType(), True), StructField("ts", LongType(), True), StructField("uid", LongType(), True), StructField("urlid", LongType(), True), StructField("user_s_term", StringType(), True) ]) train_set_join_user_model = spark.read.csv(
print("num_images: ", num_images) print("num_labels: ", num_labels) print("samples: ", samples) if __name__ == "__main__": import argparse from pyspark.context import SparkContext from pyspark.conf import SparkConf parser = argparse.ArgumentParser() parser.add_argument("--format", help="output format", choices=["csv", "csv2", "pickle", "tf", "tfr"], default="csv") parser.add_argument("--num-partitions", help="Number of output partitions", type=int, default=10) parser.add_argument("--output", help="HDFS directory to save examples in parallelized format", default="mnist_data") parser.add_argument("--read", help="read previously saved examples", action="store_true") parser.add_argument("--verify", help="verify saved examples after writing", action="store_true") args = parser.parse_args() print("args:", args) sc = SparkContext(conf=SparkConf().setAppName("mnist_parallelize")) if not args.read: # Note: these files are inside the mnist.zip file writeMNIST(sc, "mnist/train-images-idx3-ubyte.gz", "mnist/train-labels-idx1-ubyte.gz", args.output + "/train", args.format, args.num_partitions) writeMNIST(sc, "mnist/t10k-images-idx3-ubyte.gz", "mnist/t10k-labels-idx1-ubyte.gz", args.output + "/test", args.format, args.num_partitions) if args.read or args.verify: readMNIST(sc, args.output + "/train", args.format)
'''
@Author: Matheus Barros
Date: 23/04/2021
'''

from pyspark.context import SparkContext, SparkConf
from pyspark.sql.context import SQLContext
from pyspark.sql.session import SparkSession

# PARALLELIZING WITH 2 CORES
conf = SparkConf().setAppName("rdd basic").setMaster("local[2]")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
spark = SparkSession(sc)

RDD = sc.parallelize([1, 2, 3, 4])
RDD_map = RDD.map(lambda x: x * x)
RDD_map = RDD_map.collect()
print(RDD_map)

RDD1 = sc.parallelize([1, 2, 3, 4])
RDD_filter = RDD1.filter(lambda x: x > 2)
RDD_filter = RDD_filter.collect()
print(RDD_filter)

RDD2 = sc.parallelize(["hello world", "how are you"])
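# Hedged continuation for RDD2, which the snippet above defines but never uses:
# a simple flatMap over the two sentences. Only standard RDD methods are assumed.
RDD_flatmap = RDD2.flatMap(lambda sentence: sentence.split(" "))
print(RDD_flatmap.collect())  # ['hello', 'world', 'how', 'are', 'you']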
print("Error: Default Python used is Python%s" % sys.version_info.major) print("\tSet env variable PYSPARK_PYTHON to Python2 binary and re-run it.") sys.exit(1) import os import platform import pyspark from pyspark.context import SparkContext from pyspark.storagelevel import StorageLevel # this is the equivalent of ADD_JARS add_files = os.environ.get("ADD_FILES").split(',') if os.environ.get("ADD_FILES") != None else None if os.environ.get("SPARK_EXECUTOR_URI"): SparkContext.setSystemProperty("spark.executor.uri", os.environ["SPARK_EXECUTOR_URI"]) sc = SparkContext(appName="PySparkShell", pyFiles=add_files) print("""Welcome to ____ __ / __/__ ___ _____/ /__ _\ \/ _ \/ _ `/ __/ '_/ /__ / .__/\_,_/_/ /_/\_\ version 1.0.2 /_/ """) print("Using Python version %s (%s, %s)" % ( platform.python_version(), platform.python_build()[0], platform.python_build()[1])) print("SparkContext available as sc.")
def __init__(self, nbCores=2):
    self.sc = SparkContext("local[{}]".format(nbCores))
    self.sqlContext = SQLContext(self.sc)
    self.spark = SparkSession(self.sc)
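# Hedged usage sketch for the constructor above; the enclosing class name is not
# shown in the snippet, so `LocalSparkEnv` is a hypothetical stand-in for it.
env = LocalSparkEnv(nbCores=4)
print(env.sc.parallelize(range(10)).sum())  # quick smoke test -> 45
env.spark.createDataFrame([(1, "a"), (2, "b")], ["id", "label"]).show()
env.sc.stop()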
# -*- coding: utf-8 -*-
from pyspark.context import SparkContext
import re

if __name__ == "__main__":
    spark = SparkContext("local", "dataAnalyse_floorandage")
    data = spark.textFile("./transaction/transaction_bj.txt")

    def reduceAge(str):
        if str != '未知':
            age = 2020 - int(str)
            if age < 5:
                return "0~5年"
            elif age < 15:
                return "5~15年"
            elif age < 30:
                return "15~30年"
            else:
                return "30年以上"
        else:
            return str

    # Clean the average price, transaction price and listing price fields.
    # Some listings have no average price, so the transaction year would be
    # extracted by mistake; it should be recomputed from the transaction price
    # and floor area, but that is skipped for now due to time constraints.
    def cleanData(line):
        line[4] = line[4][:3]
        line[5] = reduceAge(line[5])
        line[10] = re.findall(r"\d+", line[10])[0]
        line[11] = re.findall(r"\d+", line[11])[0]
        # if float(line[10]) < 2100:
class TestRDDFunctions(PySparkTestCase): def test_failed_sparkcontext_creation(self): # Regression test for SPARK-1550 self.sc.stop() self.assertRaises(Exception, lambda: SparkContext("an-invalid-master-name")) self.sc = SparkContext("local") def test_save_as_textfile_with_unicode(self): # Regression test for SPARK-970 x = u"\u00A1Hola, mundo!" data = self.sc.parallelize([x]) tempFile = tempfile.NamedTemporaryFile(delete=True) tempFile.close() data.saveAsTextFile(tempFile.name) raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*"))) self.assertEqual(x, unicode(raw_contents.strip(), "utf-8")) def test_save_as_textfile_with_utf8(self): x = u"\u00A1Hola, mundo!" data = self.sc.parallelize([x.encode("utf-8")]) tempFile = tempfile.NamedTemporaryFile(delete=True) tempFile.close() data.saveAsTextFile(tempFile.name) raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*"))) self.assertEqual(x, unicode(raw_contents.strip(), "utf-8")) def test_transforming_cartesian_result(self): # Regression test for SPARK-1034 rdd1 = self.sc.parallelize([1, 2]) rdd2 = self.sc.parallelize([3, 4]) cart = rdd1.cartesian(rdd2) result = cart.map(lambda (x, y): x + y).collect() def test_transforming_pickle_file(self): # Regression test for SPARK-2601 data = self.sc.parallelize(["Hello", "World!"]) tempFile = tempfile.NamedTemporaryFile(delete=True) tempFile.close() data.saveAsPickleFile(tempFile.name) pickled_file = self.sc.pickleFile(tempFile.name) pickled_file.map(lambda x: x).collect() def test_cartesian_on_textfile(self): # Regression test for path = os.path.join(SPARK_HOME, "python/test_support/hello.txt") a = self.sc.textFile(path) result = a.cartesian(a).collect() (x, y) = result[0] self.assertEqual("Hello World!", x.strip()) self.assertEqual("Hello World!", y.strip()) def test_deleting_input_files(self): # Regression test for SPARK-1025 tempFile = tempfile.NamedTemporaryFile(delete=False) tempFile.write("Hello World!") tempFile.close() data = self.sc.textFile(tempFile.name) filtered_data = data.filter(lambda x: True) self.assertEqual(1, filtered_data.count()) os.unlink(tempFile.name) self.assertRaises(Exception, lambda: filtered_data.count()) def testAggregateByKey(self): data = self.sc.parallelize([(1, 1), (1, 1), (3, 2), (5, 1), (5, 3)], 2) def seqOp(x, y): x.add(y) return x def combOp(x, y): x |= y return x sets = dict(data.aggregateByKey(set(), seqOp, combOp).collect()) self.assertEqual(3, len(sets)) self.assertEqual(set([1]), sets[1]) self.assertEqual(set([2]), sets[3]) self.assertEqual(set([1, 3]), sets[5]) def test_itemgetter(self): rdd = self.sc.parallelize([range(10)]) from operator import itemgetter self.assertEqual([1], rdd.map(itemgetter(1)).collect()) self.assertEqual([(2, 3)], rdd.map(itemgetter(2, 3)).collect()) def test_namedtuple_in_rdd(self): from collections import namedtuple Person = namedtuple("Person", "id firstName lastName") jon = Person(1, "Jon", "Doe") jane = Person(2, "Jane", "Doe") theDoes = self.sc.parallelize([jon, jane]) self.assertEquals([jon, jane], theDoes.collect()) def test_large_broadcast(self): N = 100000 data = [[float(i) for i in range(300)] for i in range(N)] bdata = self.sc.broadcast(data) # 270MB m = self.sc.parallelize(range(1), 1).map(lambda x: len(bdata.value)).sum() self.assertEquals(N, m) def test_zip_with_different_serializers(self): a = self.sc.parallelize(range(5)) b = self.sc.parallelize(range(100, 105)) self.assertEqual(a.zip(b).collect(), [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)]) a = 
a._reserialize(BatchedSerializer(PickleSerializer(), 2)) b = b._reserialize(MarshalSerializer()) self.assertEqual(a.zip(b).collect(), [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)]) def test_zip_with_different_number_of_items(self): a = self.sc.parallelize(range(5), 2) # different number of partitions b = self.sc.parallelize(range(100, 106), 3) self.assertRaises(ValueError, lambda: a.zip(b)) # different number of batched items in JVM b = self.sc.parallelize(range(100, 104), 2) self.assertRaises(Exception, lambda: a.zip(b).count()) # different number of items in one pair b = self.sc.parallelize(range(100, 106), 2) self.assertRaises(Exception, lambda: a.zip(b).count()) # same total number of items, but different distributions a = self.sc.parallelize([2, 3], 2).flatMap(range) b = self.sc.parallelize([3, 2], 2).flatMap(range) self.assertEquals(a.count(), b.count()) self.assertRaises(Exception, lambda: a.zip(b).count())
    def getStepSize(self):
        """
        Gets the value of stepSize or its default value.
        """
        return self.getOrDefault(self.stepSize)


class GBTRegressionModel(JavaModel):
    """
    Model fitted by GBTRegressor.
    """


if __name__ == "__main__":
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import SQLContext
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    sc = SparkContext("local[2]", "ml.regression tests")
    sqlContext = SQLContext(sc)
    globs['sc'] = sc
    globs['sqlContext'] = sqlContext
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    sc.stop()
    if failure_count:
        exit(-1)
for l, p in zip(labels, preds)] tf_feed.batch_results(results) print("acc: {0}".format(acc)) if sv.should_stop() or step >= args.steps: tf_feed.terminate() # Ask for all the services to stop. print("{0} stopping supervisor".format(datetime.now().isoformat())) sv.stop() if __name__ == '__main__': sc = SparkContext(conf=SparkConf().setAppName("read hive with model and save to hdfs ")) hive_context = HiveContext(sc) executors = sc._conf.get("spark.executor.instances") num_executors = int(executors) if executors is not None else 1 num_ps = 1 parser = argparse.ArgumentParser() parser.add_argument("-i", "--input", help="input hdfs path") parser.add_argument("-o", "--output", help="output hdfs path") parser.add_argument("-m", "--model", help="HDFS path to save/load model during train/inference", default="mnist_model") parser.add_argument("-tb", "--tensorboard", help="launch tensorboard process", default=False) parser.add_argument("-b", "--batch_size", help="number of records per batch", type=int, default=100) parser.add_argument("-e", "--epochs", help="number of epochs", type=int, default=1) parser.add_argument("-s", "--steps", help="maximum number of steps", type=int, default=1000) parser.add_argument("-X", "--mode", help="train|inference", default="train")
def main(): glueContext = GlueContext(SparkContext.getOrCreate()) spark = glueContext.spark_session # ETL trang thai Co hoc LS/SC; co hoc thanh cong LS/SC: ############# lay du lieu bang mdl_logsservice_in_out mdl_logsservice_in_out = glueContext.create_dynamic_frame.from_catalog( database="topicalms", table_name="mdl_logsservice_in_out_cutoff") # Chon cac truong can thiet mdl_logsservice_in_out = mdl_logsservice_in_out.select_fields([ '_key', 'id', 'userid', 'roomid', 'time_in', 'time_out', 'date_in', 'action' ]) mdl_logsservice_in_out = mdl_logsservice_in_out.resolveChoice( specs=[('_key', 'cast:long')]) df_flag_1 = spark.read.parquet( "s3://dts-odin/flag/flag_LS_LSSC_CutOff.parquet") max_key = df_flag_1.collect()[0]['flag'] print("max_key: ", max_key) # Chi lay nhung ban ghi lon hon max_key da luu, ko load full mdl_logsservice_in_out = Filter.apply(frame=mdl_logsservice_in_out, f=lambda x: x["_key"] > max_key) # data = mdl_logsservice_in_out.toDF() # data = data.cacahe() # mdl_logsservice_in_out = DynamicFrame.fromDF(data, glueContext, "mdl_logsservice_in_out") print("Count data 1: ", mdl_logsservice_in_out.count()) # mdl_logsservice_in_out.toDF().show() if (mdl_logsservice_in_out.count() > 0): try: mdl_tpebbb = glueContext.create_dynamic_frame.from_catalog( database="topicalms", table_name="mdl_tpebbb") mdl_tpebbb = mdl_tpebbb.select_fields( ['id', 'timeavailable', 'calendar_code', 'roomtype']).rename_field('id', 'room_id') mdl_tpe_calendar_teach = glueContext.create_dynamic_frame.from_catalog( database="topicalms", table_name="mdl_tpe_calendar_teach") mdl_tpe_calendar_teach = mdl_tpe_calendar_teach.select_fields( ['status', 'calendar_code', 'type_class']).rename_field('calendar_code', 'code_calendar') mdl_logsservice_room_start = glueContext.create_dynamic_frame.from_catalog( database="topicalms", table_name="mdl_logsservice_room_start") mdl_logsservice_room_start = mdl_logsservice_room_start.select_fields( ['roomid', 'timecreated']).rename_field('roomid', 'id_room') mdl_role_assignments = glueContext.create_dynamic_frame.from_catalog( database="topicalms", table_name="mdl_role_assignments") mdl_role_assignments = mdl_role_assignments.select_fields( ['userid', 'roleid']).rename_field('userid', 'user_id') # Loc du lieu mdl_tpe_calendar_teach = Filter.apply(frame=mdl_tpe_calendar_teach, f=lambda x: x["status"] >= 0) data_tpe_bbb = Filter.apply( frame=mdl_tpebbb, f=lambda x: x["roomtype"] == 'ROOM' and (x["calendar_code"] is not None and x["calendar_code"] != '')) join_calendar_teach = Join.apply(data_tpe_bbb, mdl_tpe_calendar_teach, 'calendar_code', 'code_calendar').drop_fields([ 'calendar_code', 'code_calendar' ]) data_in_out = Filter.apply( frame=mdl_logsservice_in_out, f=lambda x: x["time_out"] is not None and (x["userid"] is not None and x["userid"] != '') and (x["roomid"] is not None and x["roomid"] != '')) data_mdl_role_assignments = Filter.apply( frame=mdl_role_assignments, f=lambda x: x["roleid"] == '5' and x["user_id"] is not None) join_data_role = Join.apply(data_in_out, data_mdl_role_assignments, 'userid', 'user_id') # map ls lssc vs thong tin lop join_data_tpebbb = Join.apply(join_data_role, join_calendar_teach, 'roomid', 'room_id') mdl_logsservice_room_start = Filter.apply( frame=mdl_logsservice_room_start, f=lambda x: x["id_room"] is not None and x["id_room"] != '') df_data_roomstart = mdl_logsservice_room_start.toDF() df_data_tpebbb = join_data_tpebbb.toDF() print("Count data 222: ", df_data_tpebbb.count()) # df_data_tpebbb.show() # map ls lssc vs thong tin mo lop 
            join_bbb = df_data_tpebbb.join(
                df_data_roomstart,
                df_data_tpebbb.roomid == df_data_roomstart.id_room,
                'left_outer')
            data_bbb = DynamicFrame.fromDF(join_bbb, glueContext, "data_bbb")

            # Convert the data
            df_bbb = data_bbb.toDF()
            df_bbb = df_bbb.withColumn(
                'time_start',
                when(f.col("timecreated").isNull(),
                     df_bbb['timeavailable']).otherwise(df_bbb['timecreated']))
            df_bbb = df_bbb.withColumn(
                'timein',
                when(df_bbb.time_in < df_bbb.time_start,
                     df_bbb['time_start']).otherwise(df_bbb['time_in']))
            df_bbb = df_bbb.withColumn('time_study',
                                       when((df_bbb.time_out < df_bbb.time_in)
                                            | (df_bbb.time_out < df_bbb.time_start),
                                            f.lit(0)).otherwise(df_bbb.time_out - df_bbb.timein)) \
                .withColumn('id_time', from_unixtime(unix_timestamp(df_bbb.date_in, "yyyy-MM-dd"), "yyyyMMdd")) \
                .withColumn('date_login', from_unixtime(df_bbb.timein)) \
                .withColumn('date_logout', from_unixtime(df_bbb.time_out))
            # df_bbb.cache()

            data_lssc_bbb = DynamicFrame.fromDF(df_bbb, glueContext, "data_lssc_bbb")
            data_lssc_bbb = data_lssc_bbb.resolveChoice(specs=[('time_study', 'cast:long')])
            data_lssc_bbb.printSchema()

            # Select the fields and data types to load into the database
            applymapping = ApplyMapping.apply(
                frame=data_lssc_bbb,
                mappings=[("id", "string", "id", "bigint"),
                          ("userid", "string", "student_id", "string"),
                          ("roomid", 'string', 'room_id', 'string'),
                          ("id_time", 'string', 'date_id', 'bigint'),
                          ("date_login", "string", "time_in", "timestamp"),
                          ("date_logout", "string", "time_out", "timestamp"),
                          ("time_study", "long", "time_study", "long"),
                          ("type_class", "string", "class_type", "string"),
                          ("date_in", "string", "created_time", "timestamp"),
                          ("action", "string", "action", "string")])
            resolvechoice = ResolveChoice.apply(
                frame=applymapping,
                choice="make_cols",
                transformation_ctx="resolvechoice2")
            dropnullfields = DropNullFields.apply(
                frame=resolvechoice,
                transformation_ctx="dropnullfields")
            print("Count data: ", dropnullfields.count())

            datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(
                frame=dropnullfields,
                catalog_connection="glue_redshift",
                connection_options={
                    "dbtable": "fact_lich_su_hoc",
                    "database": "dts_odin",
                    "postactions": """call proc_insert_lssc_thanh_cong_dau_tien()"""
                },
                redshift_tmp_dir="s3n://dts-odin/topicalms/mdl_logsservice_in_out/",
                transformation_ctx="datasink5")

            df_lssc = dropnullfields.toDF()

            # Save the status-change table for the 'attended LS/SC' status
            print("Count data data_df_lsscStudy: ")
            df_lssc = df_lssc.groupby(
                'student_id', 'room_id', 'date_id', 'class_type').agg(
                f.sum('time_study').alias("measure2_tmp"),
                f.count('room_id').alias("measure1"))
            df_lssc = df_lssc.withColumn(
                'to_status_id',
                when(df_lssc.class_type == 'LS', f.lit(30)).otherwise(f.lit(31)))
            df_lssc = df_lssc.withColumn('measure2', df_lssc.measure2_tmp / 60)
            print('co_hoc_lssc schema1: ')
            df_lssc.printSchema()

            data_df_lsscStudy = DynamicFrame.fromDF(df_lssc, glueContext, "data_df_lsscStudy")
            data_df_lsscStudy = data_df_lsscStudy.resolveChoice(specs=[('measure1', 'cast:double')])
            data_df_lsscStudy = data_df_lsscStudy.resolveChoice(specs=[('measure2', 'cast:double')])
            print('co_hoc_lssc schema: ')
            data_df_lsscStudy.printSchema()

            applymappingStudy = ApplyMapping.apply(
                frame=data_df_lsscStudy,
                mappings=[("student_id", "string", "student_id", "bigint"),
                          ("date_id", "bigint", "change_status_date_id", "bigint"),
                          ("to_status_id", "int", "to_status_id", "bigint"),
                          ("measure1", 'double', 'measure1', 'double'),
                          ("measure2", 'double', 'measure2', 'double')])
            resolvechoiceStudy = ResolveChoice.apply(
                frame=applymappingStudy,
                choice="make_cols",
transformation_ctx="resolvechoiceStudy") dropnullfieldsStudy = DropNullFields.apply( frame=resolvechoiceStudy, transformation_ctx="dropnullfieldsStudy") dropnullfieldsStudy.printSchema() # dropnullfieldsStudy.toDF().show() # insert trang thai co hoc thanh cong ls hoac sc datasinkStudy = glueContext.write_dynamic_frame.from_jdbc_conf( frame=dropnullfieldsStudy, catalog_connection="glue_redshift", connection_options={ "dbtable": "temp_mapping_status", "database": "dts_odin", "postactions": """ insert into mapping_changed_status_student(student_id, change_status_date_id, to_status_id, measure1, measure2) select student_id, change_status_date_id, to_status_id, measure1, measure2 from temp_mapping_status; update mapping_changed_status_student set user_id = (select user_id from user_map where source_type = 2 and source_id = student_id) where user_id is null; DROP TABLE IF EXISTS temp_mapping_status """ }, redshift_tmp_dir= "s3n://dts-odin/topicalms/mdl_logsservice_in_out/", transformation_ctx="datasinkStudy") # luu bang chuyen trang thai co hoc thanh cong ls/sc: # Hoc thanh cong: thoi gian hoc >= 36phut df_lssc = dropnullfields.toDF() df_lssc = df_lssc.groupby( 'student_id', 'room_id', 'date_id', 'class_type').agg(f.sum('time_study').alias("sum_time_study")) df_lssc = df_lssc.where('sum_time_study >= 2160') df_lssc = df_lssc.groupby( 'student_id', 'date_id', 'class_type').agg( f.sum('sum_time_study').alias("measure2_tmp"), f.count('room_id').alias("measure1")) df_lssc = df_lssc.withColumn( 'to_status_id', when(df_lssc.class_type == 'LS', f.lit(11)).otherwise(f.lit(12))) df_lssc = df_lssc.withColumn('measure2', df_lssc.measure2_tmp / 60) data_df_lssc = DynamicFrame.fromDF(df_lssc, glueContext, "data_df_lssc") data_df_lssc = data_df_lssc.resolveChoice(specs=[('measure1', 'cast:double')]) data_df_lssc = data_df_lssc.resolveChoice(specs=[('measure2', 'cast:double')]) print('data_df_lssc schema: ') data_df_lssc.printSchema() applymappingSuccess = ApplyMapping.apply( frame=data_df_lssc, mappings=[("student_id", "string", "student_id", "bigint"), ("date_id", "bigint", "change_status_date_id", "bigint"), ("to_status_id", "int", "to_status_id", "bigint"), ("measure1", 'double', 'measure1', 'double'), ("measure2", 'double', 'measure2', 'double')]) resolvechoiceSuccess = ResolveChoice.apply( frame=applymappingSuccess, choice="make_cols", transformation_ctx="resolvechoiceSuccess") dropnullfieldsSuccess = DropNullFields.apply( frame=resolvechoiceSuccess, transformation_ctx="dropnullfieldsSuccess") dropnullfieldsSuccess.printSchema() # dropnullfieldsSuccess.toDF().show() ## insert trang thai co hoc thanh cong ls hoac sc datasinkSuccess = glueContext.write_dynamic_frame.from_jdbc_conf( frame=dropnullfieldsSuccess, catalog_connection="glue_redshift", connection_options={ "dbtable": "temp_mapping_status", "database": "dts_odin", "postactions": """ insert into mapping_changed_status_student(student_id, change_status_date_id, to_status_id, measure1, measure2) select student_id, change_status_date_id, to_status_id, measure1, measure2 from temp_mapping_status; update mapping_changed_status_student set user_id = (select user_id from user_map where source_type = 2 and source_id = student_id) where user_id is null; DROP TABLE IF EXISTS temp_mapping_status """ }, redshift_tmp_dir= "s3n://dts-odin/topicalms/mdl_logsservice_in_out/", transformation_ctx="datasinkSuccess") # data_df_lssc.printSchema() # df_lssc = data_df_lssc.toDF() # print "Count data: ", dropnullfields1.count() # dropnullfields1.toDF().show() # ghi flag # 
            # Get the max key in the data source
            datasourceTmp = mdl_logsservice_in_out.toDF()
            flag = datasourceTmp.agg({"_key": "max"}).collect()[0][0]
            flag_data = [flag]
            df = spark.createDataFrame(flag_data, "long").toDF('flag')

            # Overwrite the _key flag in S3
            df.write.parquet("s3a://dts-odin/flag/flag_LS_LSSC_CutOff.parquet", mode="overwrite")
        except Exception as e:
            print("No new data")
            print(e)
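# The flag read near the top of main() (df_flag_1.collect()[0]['flag']) assumes the
# parquet flag file already exists in S3. A minimal sketch of bootstrapping it on the
# very first run (ensure_flag is a hypothetical helper, not part of the original job;
# it reuses the createDataFrame(..., "long") pattern the job itself uses):
def ensure_flag(spark, path="s3://dts-odin/flag/flag_LS_LSSC_CutOff.parquet"):
    try:
        spark.read.parquet(path)
    except Exception:
        # No flag yet: start from key 0 so the first run loads every record
        spark.createDataFrame([0], "long").toDF('flag').write.parquet(path)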
def setUp(self):
    self._old_sys_path = list(sys.path)
    class_name = self.__class__.__name__
    self.sc = SparkContext('local[4]', class_name, batchSize=2)
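# A matching tearDown, sketched under the assumption that the setUp above belongs to a
# unittest.TestCase subclass: stop the SparkContext and restore the saved sys.path.
def tearDown(self):
    self.sc.stop()
    sys.path = self._old_sys_path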
from __future__ import print_function

from pyspark.conf import SparkConf
from pyspark.context import SparkContext

config = SparkConf()
config.setMaster("local[*]")
config.setAppName("ParallelizeJOB")

sc = SparkContext(conf=config)

dataRDD = sc.parallelize([100, 200, 300, 400])
# <class 'pyspark.rdd.RDD'>
print(type(dataRDD))
dataRDD.foreach(lambda eachElement: print(eachElement))
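# foreach() runs on the executors, so in local mode the printed elements show up in the
# same console, but on a cluster they end up in the executor logs rather than on the
# driver. A minimal alternative that prints on the driver (only sensible for small RDDs,
# since collect() pulls everything into driver memory):
for element in dataRDD.collect():
    print(element)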
    'input_table', 'output_database', 'output_table', 'output_path'
]
args = getResolvedOptions(sys.argv, params)
region = args['region']
input_database = args['input_database']
input_table = args['input_table']
output_database = args['output_database']
output_table = args['output_table']
output_path = args['output_path']

glue_context = GlueContext(SparkContext.getOrCreate())
spark = glue_context.spark_session
job = Job(glue_context)
job.init(args['JOB_NAME'], args)

# Create DynamicFrame from Data Catalog
dyf = glue_context.create_dynamic_frame.from_catalog(
    database=input_database,
    table_name=input_table,
    transformation_ctx='dyf'
)

# Resolve choice type with make_struct
dyf = ResolveChoice.apply(
    frame=dyf,
    choice='make_struct'
)
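# The snippet above stops after resolving choice types. A minimal sketch of how such a
# Glue job could finish, writing the resolved DynamicFrame to the output_path taken from
# the job arguments (the S3 sink and parquet format are assumptions, not taken from the
# original job):
glue_context.write_dynamic_frame.from_options(
    frame=dyf,
    connection_type='s3',
    connection_options={'path': output_path},
    format='parquet',
    transformation_ctx='datasink'
)
job.commit()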
from pyspark.sql import SparkSession, Row
from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import GaussianMixture

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("ChiSqSelectorExample") \
        .getOrCreate()

    rawData = spark.sparkContext.textFile("file:///home/tianlei/iris.txt")

    def f(x):
        rel = {}
        rel['features'] = Vectors.dense(float(x[0]), float(x[1]),
                                        float(x[2]), float(x[3]))
        return rel

    df = spark.sparkContext.textFile("file:///usr/local/spark/iris.txt").map(
        lambda line: line.split(',')).map(lambda p: Row(**f(p))).toDF()

    # Build a simple GaussianMixture object, set the number of clusters to 3,
    # and keep the default values for the other parameters.
    gm = GaussianMixture().setK(3).setPredictionCol(
        "Prediction").setProbabilityCol("Probability")
    gmm = gm.fit(df)

    # After transform() has processed the dataset, print it to see each sample's
    # predicted cluster and its probability distribution vector.
    result = gmm.transform(df)
    result.show(150, False)

    # Once the model is fitted, its parameters can be inspected. Unlike KMeans,
    # GMM does not return cluster centers directly; it returns the parameters of
    # each mixture component (a multivariate Gaussian distribution). In the ML
    # implementation each component is stored as a MultivariateGaussian (in the
    # org.apache.spark.ml.stat.distribution package); the weights member of
    # GaussianMixtureModel gives each component's weight, and the per-component
    # parameters (mean vector and covariance matrix) are also exposed by the model.
    for i in range(3):
        print("Component " + str(i) + " : weight is " + str(gmm.weights[i]))
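    # The comment block above describes the Scala API, where each component is a
    # MultivariateGaussian object; in PySpark the fitted model exposes the component
    # parameters as a DataFrame instead. A minimal sketch of inspecting them:
    gmm.gaussiansDF.show(truncate=False)  # one row per component: mean vector and covariance matrix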
# coding=utf-8
import sys

# pyspark cannot be found on PYTHONPATH, so add its location manually
sys.path.append('/usr/local/spark-2.1.1-bin-hadoop2.7/python')

from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

sc = SparkContext('local', 'logistic_regression')
spark = SparkSession(sc)

# Load training data
data = spark.read.format("libsvm").load("../data/mllib/sample_libsvm_data.txt")

# Split the data into train and test
splits = data.randomSplit([0.6, 0.4], 1234)
train = splits[0]
test = splits[1]

# create the trainer and set its parameters
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

# train the model
model = nb.fit(train)

# select example rows to display.
predictions = model.transform(test)
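# The MulticlassClassificationEvaluator imported above is never used in this snippet.
# A minimal sketch of how it would typically score these predictions, assuming the
# default "label" and "prediction" columns produced by NaiveBayes.transform():
predictions.show(5, truncate=False)
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))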