def parallelize(self, c, numSlices=None):
    """
    Distribute a local Python collection to form an RDD.

    >>> sc.parallelize(range(5), 5).glom().collect()
    [[0], [1], [2], [3], [4]]
    """
    numSlices = numSlices or self.defaultParallelism
    # Calling the Java parallelize() method with an ArrayList is too slow,
    # because it sends O(n) Py4J commands.  As an alternative, serialized
    # objects are written to a file and loaded through textFile().
    tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir)
    # Make sure we distribute data evenly if it's smaller than self.batchSize
    if "__len__" not in dir(c):
        c = list(c)    # Make it a list so we can compute its length
    batchSize = min(len(c) // numSlices, self._batchSize)
    if batchSize > 1:
        serializer = BatchedSerializer(self._unbatched_serializer, batchSize)
    else:
        serializer = self._unbatched_serializer
    serializer.dump_stream(c, tempFile)
    tempFile.close()
    readRDDFromFile = self._jvm.PythonRDD.readRDDFromFile
    jrdd = readRDDFromFile(self._jsc, tempFile.name, numSlices)
    return RDD(jrdd, self, serializer)
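# Not part of the snippet above: a minimal sketch of what the chosen batch size
# means on the wire. BatchedSerializer groups `batchSize` objects into one pickled
# frame on dump_stream, while load_stream still yields individual items. Assumes
# the pyspark.serializers API of this era (PickleSerializer; newer Spark versions
# call it CPickleSerializer).
import io
from pyspark.serializers import BatchedSerializer, PickleSerializer

ser = BatchedSerializer(PickleSerializer(), 2)   # 2 objects per pickled batch
buf = io.BytesIO()
ser.dump_stream(range(5), buf)                   # writes 3 frames: [0, 1], [2, 3], [4]
buf.seek(0)
assert list(ser.load_stream(buf)) == [0, 1, 2, 3, 4]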
def __init__(self, vertex_jrdd, edge_jrdd,
             partition_strategy=PartitionStrategy.EdgePartition1D):
    self._vertex_jrdd = VertexRDD(vertex_jrdd, vertex_jrdd.context,
                                  BatchedSerializer(PickleSerializer()))
    self._edge_jrdd = EdgeRDD(edge_jrdd, edge_jrdd.context,
                              BatchedSerializer(PickleSerializer()))
    self._partition_strategy = partition_strategy
    self._jsc = vertex_jrdd.context
def _open_file(self):
    dirs = _get_local_dirs("objects")
    d = dirs[id(self) % len(dirs)]
    if not os.path.exists(d):
        os.makedirs(d)
    p = os.path.join(d, str(id(self)))
    self._file = open(p, "w+b", 65536)
    self._ser = BatchedSerializer(CompressedSerializer(PickleSerializer()), 1024)
    os.unlink(p)
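# Not part of the snippet above: a POSIX-only sketch of the open-then-unlink
# pattern _open_file relies on. Removing the directory entry right after open()
# keeps the spill data reachable only through the open file object, and the disk
# space is reclaimed automatically when the file is closed or the process dies.
# The directory and payload below are hypothetical.
import os
import tempfile

d = tempfile.mkdtemp()
p = os.path.join(d, "spill")
f = open(p, "w+b", 65536)
os.unlink(p)                      # name gone, file still usable via f (POSIX behavior)
f.write(b"spilled bytes")
f.flush()
f.seek(0)
assert f.read() == b"spilled bytes"
f.close()                         # space reclaimed here; nothing left to delete
os.rmdir(d)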
def read_udfs(pickleSer, infile, eval_type):
    runner_conf = {}

    if eval_type in (PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                     PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
                     PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,
                     PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF):

        # Load conf used for pandas_udf evaluation
        num_conf = read_int(infile)
        for i in range(num_conf):
            k = utf8_deserializer.loads(infile)
            v = utf8_deserializer.loads(infile)
            runner_conf[k] = v

        # NOTE: if timezone is set here, that implies respectSessionTimeZone is True
        timezone = runner_conf.get("spark.sql.session.timeZone", None)
        ser = ArrowStreamPandasSerializer(timezone)
    else:
        ser = BatchedSerializer(PickleSerializer(), 100)

    num_udfs = read_int(infile)
    udfs = {}
    call_udf = []
    mapper_str = ""
    if eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
        # Create function like this:
        #   lambda a: f([a[0]], [a[0], a[1]])
        # We assume there is only one UDF here because grouped map doesn't
        # support combining multiple UDFs.
        assert num_udfs == 1

        # See FlatMapGroupsInPandasExec for how arg_offsets are used to
        # distinguish between grouping attributes and data attributes
        arg_offsets, udf = read_single_udf(
            pickleSer, infile, eval_type, runner_conf, udf_index=0)
        udfs['f'] = udf
        split_offset = arg_offsets[0] + 1
        arg0 = ["a[%d]" % o for o in arg_offsets[1: split_offset]]
        arg1 = ["a[%d]" % o for o in arg_offsets[split_offset:]]
        mapper_str = "lambda a: f([%s], [%s])" % (", ".join(arg0), ", ".join(arg1))
    else:
        # Create function like this:
        #   lambda a: (f0(a[0]), f1(a[1], a[2]), f2(a[3]))
        # In the special case of a single UDF this will return a single result rather
        # than a tuple of results; this is the format that the JVM side expects.
        for i in range(num_udfs):
            arg_offsets, udf = read_single_udf(
                pickleSer, infile, eval_type, runner_conf, udf_index=i)
            udfs['f%d' % i] = udf
            args = ["a[%d]" % o for o in arg_offsets]
            call_udf.append("f%d(%s)" % (i, ", ".join(args)))
        mapper_str = "lambda a: (%s)" % (", ".join(call_udf))

    mapper = eval(mapper_str, udfs)

    func = lambda _, it: map(mapper, it)

    # profiling is not supported for UDF
    return func, None, ser, ser
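# A standalone illustration (plain Python, no Spark) of the eval(mapper_str, udfs)
# trick used above: the deserialized UDFs are exposed through the globals dict
# passed to eval, and the generated lambda unpacks column offsets from the row `a`.
# The functions f0/f1 and the sample row here are hypothetical stand-ins.
udfs = {"f0": lambda x: x + 1, "f1": lambda x, y: x * y}
mapper_str = "lambda a: (f0(a[0]), f1(a[1], a[2]))"
mapper = eval(mapper_str, udfs)
assert mapper([10, 2, 3]) == (11, 6)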
def parallelize(self, c, numSlices=None):
    """
    Distribute a local Python collection to form an RDD. Using xrange
    is recommended if the input represents a range for performance.

    >>> sc.parallelize([0, 2, 3, 4, 6], 5).glom().collect()
    [[0], [2], [3], [4], [6]]
    >>> sc.parallelize(xrange(0, 6, 2), 5).glom().collect()
    [[], [0], [], [2], [4]]
    """
    numSlices = int(numSlices) if numSlices is not None else self.defaultParallelism
    if isinstance(c, xrange):
        size = len(c)
        if size == 0:
            return self.parallelize([], numSlices)
        step = c[1] - c[0] if size > 1 else 1
        start0 = c[0]

        def getStart(split):
            return start0 + int((split * size / numSlices)) * step

        def f(split, iterator):
            return xrange(getStart(split), getStart(split + 1), step)

        return self.parallelize([], numSlices).mapPartitionsWithIndex(f)

    # Make sure we distribute data evenly if it's smaller than self.batchSize
    if "__len__" not in dir(c):
        c = list(c)    # Make it a list so we can compute its length
    batchSize = max(1, min(len(c) // numSlices, self._batchSize or 1024))
    serializer = BatchedSerializer(self._unbatched_serializer, batchSize)
    jrdd = self._serialize_to_jvm(c, numSlices, serializer)
    return RDD(jrdd, self, serializer)
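# A plain-Python sketch of the range-slicing arithmetic above: each split gets the
# sub-range [getStart(split), getStart(split + 1)), so nothing is materialized on
# the driver. It reproduces the second doctest ([[], [0], [], [2], [4]]) without a
# SparkContext, using Python 3's range in place of xrange; assumes len(c) > 1.
c = range(0, 6, 2)
numSlices = 5
size, step, start0 = len(c), c[1] - c[0], c[0]

def getStart(split):
    return start0 + int(split * size / numSlices) * step

splits = [list(range(getStart(i), getStart(i + 1), step)) for i in range(numSlices)]
assert splits == [[], [0], [], [2], [4]]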
def test_zip_with_different_serializers(self):
    a = self.sc.parallelize(range(5))
    b = self.sc.parallelize(range(100, 105))
    self.assertEqual(a.zip(b).collect(),
                     [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)])
    a = a._reserialize(BatchedSerializer(PickleSerializer(), 2))
    b = b._reserialize(MarshalSerializer())
    self.assertEqual(a.zip(b).collect(),
                     [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)])
def newAPIHadoopFile(self, path, inputFormatClass, keyClass, valueClass,
                     keyConverter=None, valueConverter=None, conf=None,
                     batchSize=None):
    """
    Read a 'new API' Hadoop InputFormat with arbitrary key and value class from HDFS,
    a local file system (available on all nodes), or any Hadoop-supported file system URI.
    The mechanism is the same as for sc.sequenceFile.

    A Hadoop configuration can be passed in as a Python dict. This will be converted
    into a Configuration in Java.

    @param path: path to Hadoop file
    @param inputFormatClass: fully qualified classname of Hadoop InputFormat
           (e.g. "org.apache.hadoop.mapreduce.lib.input.TextInputFormat")
    @param keyClass: fully qualified classname of key Writable class
           (e.g. "org.apache.hadoop.io.Text")
    @param valueClass: fully qualified classname of value Writable class
           (e.g. "org.apache.hadoop.io.LongWritable")
    @param keyConverter: (None by default)
    @param valueConverter: (None by default)
    @param conf: Hadoop configuration, passed in as a dict
           (None by default)
    @param batchSize: The number of Python objects represented as a single
           Java object. (default sc._default_batch_size_for_serialized_input)
    """
    jconf = self._dictToJavaMap(conf)
    batchSize = max(1, batchSize or self._default_batch_size_for_serialized_input)
    ser = BatchedSerializer(PickleSerializer()) if (batchSize > 1) else PickleSerializer()
    jrdd = self._jvm.PythonRDD.newAPIHadoopFile(self._jsc, path, inputFormatClass, keyClass,
                                                valueClass, keyConverter, valueConverter,
                                                jconf, batchSize)
    return RDD(jrdd, self, ser)
def hadoopRDD(self, inputFormatClass, keyClass, valueClass, keyConverter=None,
              valueConverter=None, conf=None, batchSize=None):
    """
    Read an 'old' Hadoop InputFormat with arbitrary key and value class, from an arbitrary
    Hadoop configuration, which is passed in as a Python dict.
    This will be converted into a Configuration in Java.
    The mechanism is the same as for sc.sequenceFile.

    @param inputFormatClass: fully qualified classname of Hadoop InputFormat
           (e.g. "org.apache.hadoop.mapred.TextInputFormat")
    @param keyClass: fully qualified classname of key Writable class
           (e.g. "org.apache.hadoop.io.Text")
    @param valueClass: fully qualified classname of value Writable class
           (e.g. "org.apache.hadoop.io.LongWritable")
    @param keyConverter: (None by default)
    @param valueConverter: (None by default)
    @param conf: Hadoop configuration, passed in as a dict
           (None by default)
    @param batchSize: The number of Python objects represented as a single
           Java object. (default sc._default_batch_size_for_serialized_input)
    """
    jconf = self._dictToJavaMap(conf)
    batchSize = max(1, batchSize or self._default_batch_size_for_serialized_input)
    ser = BatchedSerializer(PickleSerializer()) if (batchSize > 1) else PickleSerializer()
    jrdd = self._jvm.PythonRDD.hadoopRDD(self._jsc, inputFormatClass, keyClass,
                                         valueClass, keyConverter, valueConverter,
                                         jconf, batchSize)
    return RDD(jrdd, self, ser)
def __init__(self, aggregator, memory_limit=512, serializer=None,
             localdirs=None, scale=1, partitions=59, batch=1000):
    Merger.__init__(self, aggregator)
    self.memory_limit = memory_limit
    # default serializer is only used for tests
    self.serializer = serializer or \
        BatchedSerializer(PickleSerializer(), 1024)
    self.localdirs = localdirs or _get_local_dirs(str(id(self)))
    # number of partitions when spill data into disks
    self.partitions = partitions
    # check the memory after # of items merged
    self.batch = batch
    # scale is used to scale down the hash of key for recursive hash map
    self.scale = scale
    # unpartitioned merged data
    self.data = {}
    # partitioned merged data, list of dicts
    self.pdata = []
    # number of chunks dumped into disks
    self.spills = 0
    # randomize the hash of key, id(o) is the address of o (aligned by 8)
    self._seed = id(self) + 7
def sequenceFile(self, path, keyClass=None, valueClass=None, keyConverter=None,
                 valueConverter=None, minSplits=None, batchSize=None):
    """
    Read a Hadoop SequenceFile with arbitrary key and value Writable class from HDFS,
    a local file system (available on all nodes), or any Hadoop-supported file system URI.
    The mechanism is as follows:
        1. A Java RDD is created from the SequenceFile or other InputFormat, and the key
           and value Writable classes
        2. Serialization is attempted via Pyrolite pickling
        3. If this fails, the fallback is to call 'toString' on each key and value
        4. C{PickleSerializer} is used to deserialize pickled objects on the Python side

    @param path: path to sequence file
    @param keyClass: fully qualified classname of key Writable class
           (e.g. "org.apache.hadoop.io.Text")
    @param valueClass: fully qualified classname of value Writable class
           (e.g. "org.apache.hadoop.io.LongWritable")
    @param keyConverter:
    @param valueConverter:
    @param minSplits: minimum splits in dataset
           (default min(2, sc.defaultParallelism))
    @param batchSize: The number of Python objects represented as a single
           Java object. (default sc._default_batch_size_for_serialized_input)
    """
    minSplits = minSplits or min(self.defaultParallelism, 2)
    batchSize = max(1, batchSize or self._default_batch_size_for_serialized_input)
    ser = BatchedSerializer(PickleSerializer()) if (batchSize > 1) else PickleSerializer()
    jrdd = self._jvm.PythonRDD.sequenceFile(self._jsc, path, keyClass, valueClass,
                                            keyConverter, valueConverter,
                                            minSplits, batchSize)
    return RDD(jrdd, self, ser)
def predict(self, x):
    """
    Predict the label of one or more examples.

    :param x: Data point (feature vector),
              or an RDD of data points (feature vectors).
    """
    SerDe = self._sc._jvm.SerDe
    ser = PickleSerializer()
    if isinstance(x, RDD):
        # Bulk prediction
        first = x.take(1)
        if not first:
            return self._sc.parallelize([])
        if not isinstance(first[0], Vector):
            x = x.map(_convert_to_vector)
        jPred = self._java_model.predict(x._to_java_object_rdd()).toJavaRDD()
        jpyrdd = self._sc._jvm.PythonRDD.javaToPython(jPred)
        return RDD(jpyrdd, self._sc, BatchedSerializer(ser, 1024))
    else:
        # Assume x is a single data point.
        bytes = bytearray(ser.dumps(_convert_to_vector(x)))
        vec = self._sc._jvm.SerDe.loads(bytes)
        return self._java_model.predict(vec)
def rdd(self):
    """Returns the content as an :class:`pyspark.RDD` of :class:`Row`.
    """
    if self._lazy_rdd is None:
        jrdd = self._jdf.javaToPython()
        self._lazy_rdd = RDD(jrdd, self.sql_ctx._sc,
                             BatchedSerializer(PickleSerializer()))
    return self._lazy_rdd
def read_udfs(pickleSer, infile, eval_type):
    num_udfs = read_int(infile)
    udfs = {}
    call_udf = []
    for i in range(num_udfs):
        arg_offsets, udf = read_single_udf(pickleSer, infile, eval_type)
        udfs['f%d' % i] = udf
        args = ["a[%d]" % o for o in arg_offsets]
        call_udf.append("f%d(%s)" % (i, ", ".join(args)))
    # Create function like this:
    #   lambda a: (f0(a0), f1(a1, a2), f2(a3))
    # In the special case of a single UDF this will return a single result rather
    # than a tuple of results; this is the format that the JVM side expects.
    mapper_str = "lambda a: (%s)" % (", ".join(call_udf))
    mapper = eval(mapper_str, udfs)

    func = lambda _, it: map(mapper, it)

    if eval_type in (PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                     PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF):
        timezone = utf8_deserializer.loads(infile)
        ser = ArrowStreamPandasSerializer(timezone)
    else:
        ser = BatchedSerializer(PickleSerializer(), 100)

    # profiling is not supported for UDF
    return func, None, ser, ser
def loadLabeledPoints(sc, path, minPartitions=None):
    """
    Load labeled points saved using RDD.saveAsTextFile.

    :param sc: Spark context
    :param path: file or directory path in any Hadoop-supported file system URI
    :param minPartitions: min number of partitions
    @return: labeled data stored as an RDD of LabeledPoint

    >>> from tempfile import NamedTemporaryFile
    >>> from pyspark.mllib.util import MLUtils
    >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, -1.23), (2, 4.56e-7)])), \
                    LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
    >>> tempFile = NamedTemporaryFile(delete=True)
    >>> tempFile.close()
    >>> sc.parallelize(examples, 1).saveAsTextFile(tempFile.name)
    >>> loaded = MLUtils.loadLabeledPoints(sc, tempFile.name).collect()
    >>> type(loaded[0]) == LabeledPoint
    True
    >>> print examples[0]
    (1.1,(3,[0,2],[-1.23,4.56e-07]))
    >>> type(examples[1]) == LabeledPoint
    True
    >>> print examples[1]
    (0.0,[1.01,2.02,3.03])
    """
    minPartitions = minPartitions or min(sc.defaultParallelism, 2)
    jrdd = sc._jvm.PythonMLLibAPI().loadLabeledPoints(
        sc._jsc, path, minPartitions)
    jpyrdd = sc._jvm.PythonRDD.javaToPython(jrdd)
    return RDD(jpyrdd, sc, BatchedSerializer(PickleSerializer()))
def partitionBy(self, numPartitions, partitionFunc=hash):
    """
    Return a copy of the RDD partitioned using the specified partitioner.

    >>> pairs = sc.parallelize([1, 2, 3, 4, 2, 4, 1]).map(lambda x: (x, x))
    >>> sets = pairs.partitionBy(2).glom().collect()
    >>> set(sets[0]).intersection(set(sets[1]))
    set([])
    """
    if numPartitions is None:
        numPartitions = self.ctx.defaultParallelism
    # Transferring O(n) objects to Java is too expensive.  Instead, we'll
    # form the hash buckets in Python, transferring O(numPartitions) objects
    # to Java.  Each object is a (splitNumber, [objects]) pair.
    outputSerializer = self.ctx._unbatched_serializer

    def add_shuffle_key(split, iterator):
        buckets = defaultdict(list)
        for (k, v) in iterator:
            buckets[partitionFunc(k) % numPartitions].append((k, v))
        for (split, items) in buckets.iteritems():
            yield pack_long(split)
            yield outputSerializer.dumps(items)

    keyed = PipelinedRDD(self, add_shuffle_key)
    keyed._bypass_serializer = True
    pairRDD = self.ctx._jvm.PairwiseRDD(keyed._jrdd.rdd()).asJavaPairRDD()
    partitioner = self.ctx._jvm.PythonPartitioner(numPartitions,
                                                  id(partitionFunc))
    jrdd = pairRDD.partitionBy(partitioner).values()
    rdd = RDD(jrdd, self.ctx, BatchedSerializer(outputSerializer))
    # This is required so that id(partitionFunc) remains unique, even if
    # partitionFunc is a lambda:
    rdd._partitionFunc = partitionFunc
    return rdd
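# A plain-Python sketch of what add_shuffle_key does within one parent partition:
# group the (k, v) pairs into numPartitions buckets keyed by
# partitionFunc(k) % numPartitions, so only O(numPartitions) objects cross into
# Java. The bucketize helper and sample pairs are hypothetical; the asserts rely
# on CPython's hash(int) == int for small ints.
from collections import defaultdict

def bucketize(pairs, numPartitions, partitionFunc=hash):
    buckets = defaultdict(list)
    for k, v in pairs:
        buckets[partitionFunc(k) % numPartitions].append((k, v))
    return dict(buckets)

pairs = [(x, x) for x in [1, 2, 3, 4, 2, 4, 1]]
buckets = bucketize(pairs, 2)
assert buckets[0] == [(2, 2), (4, 4), (2, 2), (4, 4)]
assert buckets[1] == [(1, 1), (3, 3), (1, 1)]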
def transformToRDD(cursor, sc, parallelism=1):
    """
    Transform a StellarCursor to a Python RDD object

    :param cursor: StellarCursor
    :param sc: SparkContext
    :param parallelism: Parallelism of RDD
    """
    # Get all data from cursor
    data = cursor.fetchall()

    # Set parallelism
    parallelism = max(1, parallelism)

    def reader_func(temp_filename):
        return sc._jvm.PythonRDD.readRDDFromFile(sc._jsc, temp_filename, parallelism)

    def createRDDServer():
        return sc._jvm.PythonParallelizeServer(sc._jsc.sc(), parallelism)

    batchSize = max(1, min(len(data) // parallelism, 1024))
    serializer = BatchedSerializer(sc._unbatched_serializer, batchSize)
    jrdd = sc._serialize_to_jvm(data, serializer, reader_func, createRDDServer)
    return RDD(jrdd, sc, serializer)
def parallelize(self, c, numSlices=None):
    """
    Distribute a local Python collection to form an RDD. Using xrange
    is recommended if the input represents a range for performance.

    >>> sc.parallelize([0, 2, 3, 4, 6], 5).glom().collect()
    [[0], [2], [3], [4], [6]]
    >>> sc.parallelize(xrange(0, 6, 2), 5).glom().collect()
    [[], [0], [], [2], [4]]
    """
    numSlices = int(numSlices) if numSlices is not None else self.defaultParallelism
    if isinstance(c, xrange):
        size = len(c)
        if size == 0:
            return self.parallelize([], numSlices)
        step = c[1] - c[0] if size > 1 else 1
        start0 = c[0]

        def getStart(split):
            return start0 + int((split * size / numSlices)) * step

        def f(split, iterator):
            return xrange(getStart(split), getStart(split + 1), step)

        return self.parallelize([], numSlices).mapPartitionsWithIndex(f)

    # Calling the Java parallelize() method with an ArrayList is too slow,
    # because it sends O(n) Py4J commands.  As an alternative, serialized
    # objects are written to a file and loaded through textFile().
    tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir)
    try:
        # Make sure we distribute data evenly if it's smaller than self.batchSize
        if "__len__" not in dir(c):
            c = list(c)    # Make it a list so we can compute its length
        batchSize = max(1, min(len(c) // numSlices, self._batchSize or 1024))  # compute the batch size
        serializer = BatchedSerializer(self._unbatched_serializer, batchSize)  # build the batched serializer
        serializer.dump_stream(c, tempFile)  # write the data into the temp file
        tempFile.close()  # close the temp file
        readRDDFromFile = self._jvm.PythonRDD.readRDDFromFile
        jrdd = readRDDFromFile(self._jsc, tempFile.name, numSlices)
    finally:
        # readRDDFromFile eagerly reads the file so we can delete right after.
        os.unlink(tempFile.name)  # remove the temp file (raises an error if the path is a directory)
    return RDD(jrdd, self, serializer)
def parallelize(self, c, numSlices=None):
    """
    Distribute a local Python collection to form an RDD. Using xrange
    is recommended if the input represents a range for performance.

    >>> sc.parallelize([0, 2, 3, 4, 6], 5).glom().collect()
    [[0], [2], [3], [4], [6]]
    >>> sc.parallelize(xrange(0, 6, 2), 5).glom().collect()
    [[], [0], [], [2], [4]]
    """
    numSlices = int(numSlices) if numSlices is not None else self.defaultParallelism
    if isinstance(c, xrange):
        size = len(c)
        if size == 0:
            return self.parallelize([], numSlices)
        step = c[1] - c[0] if size > 1 else 1
        start0 = c[0]

        def getStart(split):
            return start0 + int((split * size / numSlices)) * step

        def f(split, iterator):
            return xrange(getStart(split), getStart(split + 1), step)

        return self.parallelize([], numSlices).mapPartitionsWithIndex(f)

    # Calling the Java parallelize() method with an ArrayList is too slow,
    # because it sends O(n) Py4J commands.  As an alternative, serialized
    # objects are written to a file and loaded through textFile().
    tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir)
    try:
        # Make sure we distribute data evenly if it's smaller than self.batchSize
        if "__len__" not in dir(c):
            c = list(c)    # Make it a list so we can compute its length
        batchSize = max(1, min(len(c) // numSlices, self._batchSize or 1024))
        serializer = BatchedSerializer(self._unbatched_serializer, batchSize)
        serializer.dump_stream(c, tempFile)
        tempFile.close()
        readRDDFromFile = self._jvm.PythonRDD.readRDDFromFile
        jrdd = readRDDFromFile(self._jsc, tempFile.name, numSlices)
    finally:
        # readRDDFromFile eagerly reads the file so we can delete right after.
        os.unlink(tempFile.name)
    return RDD(jrdd, self, serializer)
def collect(self):
    """Returns all the records as a list of :class:`Row`.

    >>> df.collect()
    [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
    """
    with SCCallSiteSync(self._sc) as css:
        port = self._sc._jvm.PythonRDD.collectAndServe(self._jdf.javaToPython().rdd())
    return list(_load_from_socket(port, BatchedSerializer(PickleSerializer())))
def _open_file(self):
    dirs = _get_local_dirs("objects")
    d = dirs[id(self) % len(dirs)]
    if not os.path.exists(d):
        os.makedirs(d)
    p = os.path.join(d, str(id(self)))
    self._file = open(p, "wb+", 65536)
    self._ser = BatchedSerializer(CompressedSerializer(PickleSerializer()), 1024)
    os.unlink(p)
def _toPython(self):
    # We have to import the Row class explicitly, so that the reference the
    # Pickler holds is pyspark.sql.Row instead of __main__.Row
    from pyspark.sql import Row

    jrdd = self._jschema_rdd.javaToPython()
    # TODO: This is inefficient, we should construct the Python Row object
    # in Java land in the javaToPython function. May require a custom
    # pickle serializer in Pyrolite
    return RDD(jrdd, self._sc, BatchedSerializer(PickleSerializer())).map(lambda d: Row(d))
def take(self, num):
    """Returns the first ``num`` rows as a :class:`list` of :class:`Row`.

    >>> df.take(2)
    [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
    """
    with SCCallSiteSync(self._sc) as css:
        port = self._sc._jvm.org.apache.spark.sql.execution.EvaluatePython.takeAndServe(
            self._jdf, num)
    return list(_load_from_socket(port, BatchedSerializer(PickleSerializer())))
def parallelize(self, c, numSlices=None):
    """
    Distribute a local Python collection to form an RDD. Using xrange
    is recommended if the input represents a range for performance.

    >>> sc.parallelize([0, 2, 3, 4, 6], 5).glom().collect()
    [[0], [2], [3], [4], [6]]
    >>> sc.parallelize(xrange(0, 6, 2), 5).glom().collect()
    [[], [0], [], [2], [4]]
    """
    numSlices = int(numSlices) if numSlices is not None else self.defaultParallelism
    if isinstance(c, xrange):
        size = len(c)
        if size == 0:
            return self.parallelize([], numSlices)
        step = c[1] - c[0] if size > 1 else 1
        start0 = c[0]

        def getStart(split):
            return start0 + int((split * size / numSlices)) * step

        def f(split, iterator):
            return xrange(getStart(split), getStart(split + 1), step)

        return self.parallelize([], numSlices).mapPartitionsWithIndex(f)

    # Calling the Java parallelize() method with an ArrayList is too slow,
    # because it sends O(n) Py4J commands.  As an alternative, serialized
    # objects are written to a file and loaded through textFile().
    tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir)
    try:
        # Make sure we distribute data evenly if it's smaller than self.batchSize
        if "__len__" not in dir(c):
            c = list(c)    # Make it a list so we can compute its length
        batchSize = max(1, min(len(c) // numSlices, self._batchSize or 1024))
        serializer = BatchedSerializer(self._unbatched_serializer, batchSize)
        serializer.dump_stream(c, tempFile)
        tempFile.close()
        readRDDFromFile = self._jvm.PythonRDD.readRDDFromFile
        jrdd = readRDDFromFile(self._jsc, tempFile.name, numSlices)
    finally:
        # readRDDFromFile eagerly reads the file so we can delete right after.
        os.unlink(tempFile.name)
    return RDD(jrdd, self, serializer)
def test_hash_serializer(self):
    hash(NoOpSerializer())
    hash(UTF8Deserializer())
    hash(CPickleSerializer())
    hash(MarshalSerializer())
    hash(AutoSerializer())
    hash(BatchedSerializer(CPickleSerializer()))
    hash(AutoBatchedSerializer(MarshalSerializer()))
    hash(PairDeserializer(NoOpSerializer(), UTF8Deserializer()))
    hash(CartesianDeserializer(NoOpSerializer(), UTF8Deserializer()))
    hash(CompressedSerializer(CPickleSerializer()))
    hash(FlattenedValuesSerializer(CPickleSerializer()))
def parallelize(self, c, numSlices=None):
    """
    Distribute a local Python collection to form an RDD. Using xrange
    is recommended if the input represents a range for performance.

    >>> sc.parallelize([0, 2, 3, 4, 6], 5).glom().collect()
    [[0], [2], [3], [4], [6]]
    >>> sc.parallelize(xrange(0, 6, 2), 5).glom().collect()
    [[], [0], [], [2], [4]]
    """
    numSlices = int(numSlices) if numSlices is not None else self.defaultParallelism
    if isinstance(c, xrange):
        size = len(c)
        if size == 0:
            return self.parallelize([], numSlices)
        step = c[1] - c[0] if size > 1 else 1
        start0 = c[0]

        def getStart(split):
            return start0 + int((split * size / numSlices)) * step

        def f(split, iterator):
            # it's an empty iterator here but we need this line for triggering the
            # logic of signal handling in FramedSerializer.load_stream, for instance,
            # SpecialLengths.END_OF_DATA_SECTION in _read_with_length. Since
            # FramedSerializer.load_stream produces a generator, the control should
            # at least be in that function once. Here we do it by explicitly converting
            # the empty iterator to a list, thus make sure worker reuse takes effect.
            # See more details in SPARK-26549.
            assert len(list(iterator)) == 0
            return xrange(getStart(split), getStart(split + 1), step)

        return self.parallelize([], numSlices).mapPartitionsWithIndex(f)

    # Make sure we distribute data evenly if it's smaller than self.batchSize
    if "__len__" not in dir(c):
        c = list(c)    # Make it a list so we can compute its length
    batchSize = max(1, min(len(c) // numSlices, self._batchSize or 1024))
    serializer = BatchedSerializer(self._unbatched_serializer, batchSize)

    def reader_func(temp_filename):
        return self._jvm.PythonRDD.readRDDFromFile(self._jsc, temp_filename, numSlices)

    def createRDDServer():
        return self._jvm.PythonParallelizeServer(self._jsc.sc(), numSlices)

    jrdd = self._serialize_to_jvm(c, serializer, reader_func, createRDDServer)
    return RDD(jrdd, self, serializer)
def cassandraTable(self, keyspace, table):
    """Returns all the Rows in a Cassandra keyspace and table as an RDD.

    @param keyspace: Cassandra keyspace / schema name
    @param table: Cassandra table / column family name
    """
    # Unsure right now if we need CassandraSerializer, but likely do since
    # we'll get generic CassandraRow instances back that we'll need to
    # inspect?
    # return RDD(self._jcsc.cassandraTable(keyspace, table), self,
    #            CassandraSerializer())
    return RDD(self._jcsc.cassandraTable(keyspace, table), self,
               BatchedSerializer(PickleSerializer()))
def pickleFile(self, name, minPartitions=None):
    """
    Load an RDD previously saved using L{RDD.saveAsPickleFile} method.

    >>> tmpFile = NamedTemporaryFile(delete=True)
    >>> tmpFile.close()
    >>> sc.parallelize(range(10)).saveAsPickleFile(tmpFile.name, 5)
    >>> sorted(sc.pickleFile(tmpFile.name, 3).collect())
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    """
    minPartitions = minPartitions or self.defaultMinPartitions
    return RDD(self._jsc.objectFile(name, minPartitions), self,
               BatchedSerializer(PickleSerializer()))
def collect(self):
    """Return a list that contains all of the rows.

    Each object in the list is a Row, the fields can be accessed as
    attributes.

    >>> df.collect()
    [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
    """
    with SCCallSiteSync(self._sc) as css:
        port = self._sc._jvm.PythonRDD.collectAndServe(self._jdf.javaToPython().rdd())
    rs = list(_load_from_socket(port, BatchedSerializer(PickleSerializer())))
    cls = _create_cls(self.schema)
    return [cls(r) for r in rs]
def rdd(self):
    """Returns the content as an :class:`pyspark.RDD` of :class:`Row`.
    """
    if self._lazy_rdd is None:
        jrdd = self._jdf.javaToPython()
        rdd = RDD(jrdd, self.sql_ctx._sc, BatchedSerializer(PickleSerializer()))
        schema = self.schema

        def applySchema(it):
            cls = _create_cls(schema)
            return map(cls, it)

        self._lazy_rdd = rdd.mapPartitions(applySchema)
    return self._lazy_rdd
def test_zip_with_different_serializers(self):
    a = self.sc.parallelize(range(5))
    b = self.sc.parallelize(range(100, 105))
    self.assertEqual(a.zip(b).collect(),
                     [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)])
    a = a._reserialize(BatchedSerializer(CPickleSerializer(), 2))
    b = b._reserialize(MarshalSerializer())
    self.assertEqual(a.zip(b).collect(),
                     [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)])
    # regression test for SPARK-4841
    path = os.path.join(SPARK_HOME, "python/test_support/hello/hello.txt")
    t = self.sc.textFile(path)
    cnt = t.count()
    self.assertEqual(cnt, t.zip(t).count())
    rdd = t.map(str)
    self.assertEqual(cnt, t.zip(rdd).count())
    # regression test for bug in _reserializer()
    self.assertEqual(cnt, t.zip(rdd).count())
def rdd(self):
    """
    Return the content of the :class:`DataFrame` as an :class:`RDD`
    of :class:`Row` s.
    """
    if not hasattr(self, '_lazy_rdd'):
        jrdd = self._jdf.javaToPython()
        rdd = RDD(jrdd, self.sql_ctx._sc, BatchedSerializer(PickleSerializer()))
        schema = self.schema

        def applySchema(it):
            cls = _create_cls(schema)
            return itertools.imap(cls, it)

        self._lazy_rdd = rdd.mapPartitions(applySchema)
    return self._lazy_rdd
def read_udfs(pickleSer, infile):
    num_udfs = read_int(infile)
    udfs = {}
    call_udf = []
    for i in range(num_udfs):
        arg_offsets, udf = read_single_udf(pickleSer, infile)
        udfs['f%d' % i] = udf
        args = ["a[%d]" % o for o in arg_offsets]
        call_udf.append("f%d(%s)" % (i, ", ".join(args)))
    # Create function like this:
    #   lambda a: (f0(a0), f1(a1, a2), f2(a3))
    # In the special case of a single UDF this will return a single result rather
    # than a tuple of results; this is the format that the JVM side expects.
    mapper_str = "lambda a: (%s)" % (", ".join(call_udf))
    mapper = eval(mapper_str, udfs)

    func = lambda _, it: map(mapper, it)

    ser = BatchedSerializer(PickleSerializer(), 100)

    # profiling is not supported for UDF
    return func, None, ser, ser
class ExternalList(object):
    """
    ExternalList can hold many items which cannot all be kept in memory
    at the same time.

    >>> l = ExternalList(list(range(100)))
    >>> len(l)
    100
    >>> l.append(10)
    >>> len(l)
    101
    >>> for i in range(20240):
    ...     l.append(i)
    >>> len(l)
    20341
    >>> import pickle
    >>> l2 = pickle.loads(pickle.dumps(l))
    >>> len(l2)
    20341
    >>> list(l2)[100]
    10
    """
    LIMIT = 10240

    def __init__(self, values):
        self.values = values
        self.count = len(values)
        self._file = None
        self._ser = None

    def __getstate__(self):
        if self._file is not None:
            self._file.flush()
            with os.fdopen(os.dup(self._file.fileno()), "rb") as f:
                f.seek(0)
                serialized = f.read()
        else:
            serialized = b''
        return self.values, self.count, serialized

    def __setstate__(self, item):
        self.values, self.count, serialized = item
        if serialized:
            self._open_file()
            self._file.write(serialized)
        else:
            self._file = None
            self._ser = None

    def __iter__(self):
        if self._file is not None:
            self._file.flush()
            # read all items from disks first
            with os.fdopen(os.dup(self._file.fileno()), 'rb') as f:
                f.seek(0)
                for v in self._ser.load_stream(f):
                    yield v

        for v in self.values:
            yield v

    def __len__(self):
        return self.count

    def append(self, value):
        self.values.append(value)
        self.count += 1
        # dump them into disk if the key is huge
        if len(self.values) >= self.LIMIT:
            self._spill()

    def _open_file(self):
        dirs = _get_local_dirs("objects")
        d = dirs[id(self) % len(dirs)]
        if not os.path.exists(d):
            os.makedirs(d)
        p = os.path.join(d, str(id(self)))
        self._file = open(p, "wb+", 65536)
        self._ser = BatchedSerializer(CompressedSerializer(PickleSerializer()), 1024)
        os.unlink(p)

    def __del__(self):
        if self._file:
            self._file.close()
            self._file = None

    def _spill(self):
        """ dump the values into disk """
        global MemoryBytesSpilled, DiskBytesSpilled
        if self._file is None:
            self._open_file()

        used_memory = get_used_memory()
        pos = self._file.tell()
        self._ser.dump_stream(self.values, self._file)
        self.values = []
        gc.collect()
        DiskBytesSpilled += self._file.tell() - pos
        MemoryBytesSpilled += (used_memory - get_used_memory()) << 20