Example No. 1
    def _collect_as_arrow(self):
        """
        Returns all records as a list of Arrow RecordBatches; pyarrow must be installed
        and available on both the driver and worker Python environments.

        .. note:: Experimental.
        """
        from pyspark.sql.dataframe import DataFrame

        assert isinstance(self, DataFrame)

        with SCCallSiteSync(self._sc):
            port, auth_secret, jsocket_auth_server = self._jdf.collectAsArrowToPython()

        # Collect list of un-ordered batches where last element is a list of correct order indices
        try:
            results = list(
                _load_from_socket((port, auth_secret),
                                  ArrowCollectSerializer()))
        finally:
            # Join serving thread and raise any exceptions from collectAsArrowToPython
            jsocket_auth_server.getResult()

        # Separate RecordBatches from batch order indices in results
        batches = results[:-1]
        batch_order = results[-1]

        # Re-order the batch list using the correct order
        return [batches[i] for i in batch_order]
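
For context, a caller typically stitches the ordered batches back into a single pyarrow Table. The sketch below is not part of the example; it assumes `df` is a DataFrame exposing `_collect_as_arrow()` and that pyarrow (and pandas, for the last step) is installed:

    import pyarrow as pa

    batches = df._collect_as_arrow()             # ordered list of pa.RecordBatch
    if batches:
        table = pa.Table.from_batches(batches)   # combine the batches into one Table
        pdf = table.to_pandas()                  # optional conversion to pandas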
Example No. 2
    def collect(self):
        """
        Return a list that contains all of the elements in this RDD.
        """
        with SCCallSiteSync(self.ctx) as css:
            bytesInJava = self.jedge_rdd.collect().iterator()
        return list(self._collect_iterator_through_file(bytesInJava))
Example No. 3
    def _collect_as_arrow(self, split_batches: bool = False) -> List["pa.RecordBatch"]:
        """
        Returns all records as a list of Arrow RecordBatches; pyarrow must be installed
        and available on both the driver and worker Python environments.
        This is an experimental feature.

        :param split_batches: split batches such that each column is in its own allocation, so
            that the selfDestruct optimization is effective; default False.

        .. note:: Experimental.
        """
        from pyspark.sql.dataframe import DataFrame

        assert isinstance(self, DataFrame)

        with SCCallSiteSync(self._sc):
            (
                port,
                auth_secret,
                jsocket_auth_server,
            ) = self._jdf.collectAsArrowToPython()

        # Collect list of un-ordered batches where last element is a list of correct order indices
        try:
            batch_stream = _load_from_socket((port, auth_secret), ArrowCollectSerializer())
            if split_batches:
                # When spark.sql.execution.arrow.pyspark.selfDestruct.enabled, ensure
                # each column in each record batch is contained in its own allocation.
                # Otherwise, selfDestruct does nothing; it frees each column as it is
                # converted, but each column will actually be a list of slices of record
                # batches, and so no memory is actually freed until all columns are
                # converted.
                import pyarrow as pa

                results = []
                for batch_or_indices in batch_stream:
                    if isinstance(batch_or_indices, pa.RecordBatch):
                        batch_or_indices = pa.RecordBatch.from_arrays(
                            [
                                # This call actually reallocates the array
                                pa.concat_arrays([array])
                                for array in batch_or_indices
                            ],
                            schema=batch_or_indices.schema,
                        )
                    results.append(batch_or_indices)
            else:
                results = list(batch_stream)
        finally:
            # Join serving thread and raise any exceptions from collectAsArrowToPython
            jsocket_auth_server.getResult()

        # Separate RecordBatches from batch order indices in results
        batches = results[:-1]
        batch_order = results[-1]

        # Re-order the batch list using the correct order
        return [batches[i] for i in batch_order]
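
A possible caller-side sketch (an assumption, not part of the example): with split_batches=True, the resulting Table can be converted using pyarrow's self-destruct options so Arrow memory is released column by column. This assumes pyarrow and pandas are available and that `df` exposes the method above:

    import pyarrow as pa

    batches = df._collect_as_arrow(split_batches=True)
    table = pa.Table.from_batches(batches)
    del batches  # drop extra references so self_destruct can actually free buffers
    pdf = table.to_pandas(split_blocks=True, self_destruct=True, use_threads=False)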
Example No. 4
    def collect(self):
        """Returns all the records as a list of :class:`Row`.

        >>> df.collect()
        [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
        """
        with SCCallSiteSync(self._sc) as css:
            port = self._sc._jvm.PythonRDD.collectAndServe(self._jdf.javaToPython().rdd())
        return list(_load_from_socket(port, BatchedSerializer(PickleSerializer())))
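
A minimal usage sketch for the doctest above, assuming an active SparkSession bound to the hypothetical name `spark`:

    df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])
    rows = df.collect()               # all rows are pulled to the driver as Row objects
    ages = [row.age for row in rows]  # Row fields are accessible as attributes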
Example No. 5
    def take(self, num):
        """Returns the first ``num`` rows as a :class:`list` of :class:`Row`.

        >>> df.take(2)
        [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
        """
        with SCCallSiteSync(self._sc) as css:
            port = self._sc._jvm.org.apache.spark.sql.execution.EvaluatePython.takeAndServe(
                self._jdf, num)
        return list(_load_from_socket(port, BatchedSerializer(PickleSerializer())))
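
A short usage sketch, assuming the same hypothetical `df` as in the previous example; unlike collect(), only the requested number of rows is served to the driver:

    first_two = df.take(2)    # e.g. [Row(age=2, name='Alice'), Row(age=5, name='Bob')]
    names = [row.name for row in first_two]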
Example No. 6
    def collect(self):
        """Return a list that contains all of the rows.

        Each object in the list is a Row, the fields can be accessed as
        attributes.

        >>> df.collect()
        [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
        """
        with SCCallSiteSync(self._sc) as css:
            port = self._sc._jvm.PythonRDD.collectAndServe(self._jdf.javaToPython().rdd())
        rs = list(_load_from_socket(port, BatchedSerializer(PickleSerializer())))
        cls = _create_cls(self.schema)
        return [cls(r) for r in rs]
Example No. 7
    def collect(self):
        """Return a list that contains all of the rows.

        Each object in the list is a Row, the fields can be accessed as
        attributes.

        >>> df.collect()
        [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
        """
        with SCCallSiteSync(self._sc) as css:
            bytesInJava = self._jdf.javaToPython().collect().iterator()
        tempFile = NamedTemporaryFile(delete=False, dir=self._sc._temp_dir)
        tempFile.close()
        self._sc._writeToFile(bytesInJava, tempFile.name)
        # Read the data into Python and deserialize it:
        with open(tempFile.name, 'rb') as tempFile:
            rs = list(
                BatchedSerializer(PickleSerializer()).load_stream(tempFile))
        os.unlink(tempFile.name)
        cls = _create_cls(self.schema)
        return [cls(r) for r in rs]
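
The temp-file round trip above can be illustrated on its own. This sketch only assumes that the BatchedSerializer/PickleSerializer pair used in the example is importable from pyspark.serializers (newer Spark versions may name the pickle serializer differently):

    import os
    from tempfile import NamedTemporaryFile
    from pyspark.serializers import BatchedSerializer, PickleSerializer

    ser = BatchedSerializer(PickleSerializer())
    tmp = NamedTemporaryFile(delete=False)
    try:
        # Write a small batched pickle stream, then read it back the same way
        # the example reads the bytes written by the JVM side.
        ser.dump_stream(iter([("Alice", 2), ("Bob", 5)]), tmp)
        tmp.close()
        with open(tmp.name, "rb") as f:
            records = list(ser.load_stream(f))
    finally:
        os.unlink(tmp.name)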
Example No. 8
    def partitionBy(self, numPartitions, partitionFunc=portable_hash):
        """
        Return a copy of the DStream partitioned using the specified partitioner.
        """
        if numPartitions is None:
            numPartitions = self.ctx._defaultReducePartitions()

        # Transferring O(n) objects to Java is too expensive.  Instead, we'll
        # form the hash buckets in Python, transferring O(numPartitions) objects
        # to Java.  Each object is a (splitNumber, [objects]) pair.

        outputSerializer = self.ctx._unbatched_serializer

        limit = (_parse_memory(
            self.ctx._conf.get("spark.python.worker.memory", "512m")) / 2)

        def add_shuffle_key(split, iterator):

            buckets = defaultdict(list)
            c, batch = 0, min(10 * numPartitions, 1000)

            for k, v in iterator:
                buckets[partitionFunc(k) % numPartitions].append((k, v))
                c += 1

                # check used memory and avg size of chunk of objects
                if (c % 1000 == 0 and get_used_memory() > limit or c > batch):
                    n, size = len(buckets), 0
                    for split in list(buckets.keys()):  # snapshot keys; entries are deleted below
                        yield pack_long(split)
                        d = outputSerializer.dumps(buckets[split])
                        del buckets[split]
                        yield d
                        size += len(d)

                    avg = int(size / n) >> 20
                    # let 1M < avg < 10M
                    if avg < 1:
                        batch *= 1.5
                    elif avg > 10:
                        batch = max(batch / 1.5, 1)
                    c = 0

            for split, items in buckets.items():
                yield pack_long(split)
                yield outputSerializer.dumps(items)

        keyed = self._mapPartitionsWithIndex(add_shuffle_key)

        keyed._bypass_serializer = True
        with SCCallSiteSync(self.ctx) as css:
            partitioner = self.ctx._jvm.PythonPartitioner(
                numPartitions, id(partitionFunc))
            jdstream = self.ctx._jvm.PythonPairwiseDStream(
                keyed._jdstream.dstream(), partitioner).asJavaDStream()
        dstream = DStream(jdstream, self._ssc,
                          BatchedSerializer(outputSerializer))
        # This is required so that id(partitionFunc) remains unique, even if
        # partitionFunc is a lambda:
        dstream._partitionFunc = partitionFunc
        return dstream
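
The core of add_shuffle_key can be shown without Spark: keys are hashed into numPartitions buckets in Python so that only O(numPartitions) serialized objects cross into the JVM. A stand-alone sketch, using the built-in hash in place of portable_hash:

    from collections import defaultdict

    def bucket_pairs(pairs, numPartitions, partitionFunc=hash):
        # Group (key, value) pairs by their target partition id.
        buckets = defaultdict(list)
        for k, v in pairs:
            buckets[partitionFunc(k) % numPartitions].append((k, v))
        return dict(buckets)

    # bucket_pairs([("a", 1), ("b", 2), ("a", 3)], 2) puts both ("a", ...) pairs
    # into the same bucket, mirroring how records end up in the same partition.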