def _collect_as_arrow(self):
    """
    Returns all records as a list of Arrow record batches; pyarrow must be
    installed and available on both the driver and worker Python environments.

    .. note:: Experimental.
    """
    from pyspark.sql.dataframe import DataFrame

    assert isinstance(self, DataFrame)

    with SCCallSiteSync(self._sc):
        port, auth_secret, jsocket_auth_server = self._jdf.collectAsArrowToPython()

    # Collect list of un-ordered batches, where the last element is a list of
    # correct order indices
    try:
        results = list(_load_from_socket((port, auth_secret), ArrowCollectSerializer()))
    finally:
        # Join serving thread and raise any exceptions from collectAsArrowToPython
        jsocket_auth_server.getResult()

    # Separate RecordBatches from the batch order indices in results
    batches = results[:-1]
    batch_order = results[-1]

    # Re-order the batch list using the correct order
    return [batches[i] for i in batch_order]
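# A minimal usage sketch (not part of the snippet above): `_collect_as_arrow` is an
# internal method, so this is illustrative only. It assumes a pyspark DataFrame `df`
# with Arrow execution enabled; the returned batches can be stitched into a single
# pyarrow Table for downstream processing.
import pyarrow as pa

batches = df._collect_as_arrow()
if batches:
    table = pa.Table.from_batches(batches)
    pdf = table.to_pandas()  # roughly what df.toPandas() does when Arrow is enabled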
def collect(self):
    """
    Return a list that contains all of the elements in this RDD.
    """
    with SCCallSiteSync(self.ctx) as css:
        bytesInJava = self.jedge_rdd.collect().iterator()
    return list(self._collect_iterator_through_file(bytesInJava))
def _collect_as_arrow(self, split_batches: bool = False) -> List["pa.RecordBatch"]:
    """
    Returns all records as a list of Arrow record batches; pyarrow must be
    installed and available on both the driver and worker Python environments.
    This is an experimental feature.

    :param split_batches: split batches such that each column is in its own
        allocation, so that the selfDestruct optimization is effective;
        default False.

    .. note:: Experimental.
    """
    from pyspark.sql.dataframe import DataFrame

    assert isinstance(self, DataFrame)

    with SCCallSiteSync(self._sc):
        (
            port,
            auth_secret,
            jsocket_auth_server,
        ) = self._jdf.collectAsArrowToPython()

    # Collect list of un-ordered batches, where the last element is a list of
    # correct order indices
    try:
        batch_stream = _load_from_socket((port, auth_secret), ArrowCollectSerializer())
        if split_batches:
            # When spark.sql.execution.arrow.pyspark.selfDestruct.enabled, ensure
            # each column in each record batch is contained in its own allocation.
            # Otherwise, selfDestruct does nothing; it frees each column as it is
            # converted, but each column will actually be a list of slices of record
            # batches, and so no memory is actually freed until all columns are
            # converted.
            import pyarrow as pa

            results = []
            for batch_or_indices in batch_stream:
                if isinstance(batch_or_indices, pa.RecordBatch):
                    batch_or_indices = pa.RecordBatch.from_arrays(
                        [
                            # This call actually reallocates the array
                            pa.concat_arrays([array])
                            for array in batch_or_indices
                        ],
                        schema=batch_or_indices.schema,
                    )
                results.append(batch_or_indices)
        else:
            results = list(batch_stream)
    finally:
        # Join serving thread and raise any exceptions from collectAsArrowToPython
        jsocket_auth_server.getResult()

    # Separate RecordBatches from the batch order indices in results
    batches = results[:-1]
    batch_order = results[-1]

    # Re-order the batch list using the correct order
    return [batches[i] for i in batch_order]
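# A hedged sketch of why `split_batches=True` matters (assumes a pyspark DataFrame
# `df`; `_collect_as_arrow` is internal, so this is illustrative only): pyarrow's
# self-destructing conversion frees each column as it is converted to pandas, which
# only reclaims memory when each column owns its own allocation.
import pyarrow as pa

batches = df._collect_as_arrow(split_batches=True)
table = pa.Table.from_batches(batches)
del batches  # drop extra references so self_destruct can actually free memory
pdf = table.to_pandas(self_destruct=True, split_blocks=True, use_threads=False)
del table  # the table is unusable after a self-destructing conversion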
def collect(self):
    """Returns all the records as a list of :class:`Row`.

    >>> df.collect()
    [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
    """
    with SCCallSiteSync(self._sc) as css:
        port = self._sc._jvm.PythonRDD.collectAndServe(self._jdf.javaToPython().rdd())
    return list(_load_from_socket(port, BatchedSerializer(PickleSerializer())))
def take(self, num):
    """Returns the first ``num`` rows as a :class:`list` of :class:`Row`.

    >>> df.take(2)
    [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
    """
    with SCCallSiteSync(self._sc) as css:
        port = self._sc._jvm.org.apache.spark.sql.execution.EvaluatePython.takeAndServe(
            self._jdf, num)
    return list(_load_from_socket(port, BatchedSerializer(PickleSerializer())))
def collect(self):
    """Return a list that contains all of the rows.

    Each object in the list is a Row; the fields can be accessed as attributes.

    >>> df.collect()
    [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
    """
    with SCCallSiteSync(self._sc) as css:
        port = self._sc._jvm.PythonRDD.collectAndServe(self._jdf.javaToPython().rdd())
    rs = list(_load_from_socket(port, BatchedSerializer(PickleSerializer())))
    cls = _create_cls(self.schema)
    return [cls(r) for r in rs]
def collect(self):
    """Return a list that contains all of the rows.

    Each object in the list is a Row; the fields can be accessed as attributes.

    >>> df.collect()
    [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
    """
    with SCCallSiteSync(self._sc) as css:
        bytesInJava = self._jdf.javaToPython().collect().iterator()
    tempFile = NamedTemporaryFile(delete=False, dir=self._sc._temp_dir)
    tempFile.close()
    self._sc._writeToFile(bytesInJava, tempFile.name)
    # Read the data into Python and deserialize it:
    with open(tempFile.name, 'rb') as tempFile:
        rs = list(BatchedSerializer(PickleSerializer()).load_stream(tempFile))
    os.unlink(tempFile.name)
    cls = _create_cls(self.schema)
    return [cls(r) for r in rs]
def partitionBy(self, numPartitions, partitionFunc=portable_hash):
    """
    Return a copy of the DStream partitioned using the specified partitioner.
    """
    if numPartitions is None:
        numPartitions = self.ctx._defaultReducePartitions()

    # Transferring O(n) objects to Java is too expensive. Instead, we'll
    # form the hash buckets in Python, transferring O(numPartitions) objects
    # to Java. Each object is a (splitNumber, [objects]) pair.
    outputSerializer = self.ctx._unbatched_serializer

    limit = _parse_memory(self.ctx._conf.get("spark.python.worker.memory", "512m")) / 2

    def add_shuffle_key(split, iterator):
        buckets = defaultdict(list)
        c, batch = 0, min(10 * numPartitions, 1000)

        for k, v in iterator:
            buckets[partitionFunc(k) % numPartitions].append((k, v))
            c += 1

            # check used memory and avg size of chunk of objects
            if (c % 1000 == 0 and get_used_memory() > limit) or c > batch:
                n, size = len(buckets), 0
                for split in list(buckets.keys()):
                    yield pack_long(split)
                    d = outputSerializer.dumps(buckets[split])
                    del buckets[split]
                    yield d
                    size += len(d)

                avg = (size // n) >> 20
                # let 1M < avg < 10M
                if avg < 1:
                    batch *= 1.5
                elif avg > 10:
                    batch = max(batch / 1.5, 1)
                c = 0

        for split, items in buckets.items():
            yield pack_long(split)
            yield outputSerializer.dumps(items)

    keyed = self._mapPartitionsWithIndex(add_shuffle_key)
    keyed._bypass_serializer = True
    with SCCallSiteSync(self.ctx) as css:
        partitioner = self.ctx._jvm.PythonPartitioner(numPartitions, id(partitionFunc))
        jdstream = self.ctx._jvm.PythonPairwiseDStream(
            keyed._jdstream.dstream(), partitioner).asJavaDStream()
    dstream = DStream(jdstream, self._ssc, BatchedSerializer(outputSerializer))
    # This is required so that id(partitionFunc) remains unique, even if
    # partitionFunc is a lambda:
    dstream._partitionFunc = partitionFunc
    return dstream
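# A standalone sketch of the bucketing idea above, runnable outside Spark: keys are
# hashed into num_partitions buckets so that only O(numPartitions) objects cross the
# Python/Java boundary. Python's built-in `hash` stands in for `portable_hash` here
# (an assumption for illustration; Spark's version is stable across worker processes).
from collections import defaultdict

def bucket_pairs(pairs, num_partitions, partition_func=hash):
    """Group (k, v) pairs into hash buckets, yielding (split, [pairs]) chunks."""
    buckets = defaultdict(list)
    for k, v in pairs:
        buckets[partition_func(k) % num_partitions].append((k, v))
    for split, items in buckets.items():
        yield split, items

for split, items in bucket_pairs([("a", 1), ("b", 2), ("a", 3)], 2):
    print(split, items)  # one chunk per non-empty bucket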