예제 #1
0
파일: session.py 프로젝트: CodingCat/spark
    def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
        """
        Create a DataFrame from a given pandas.DataFrame by slicing it into partitions, converting
        to Arrow data, then sending to the JVM to parallelize. If a schema is passed in, the
        data types will be used to coerce the data in Pandas to Arrow conversion.

        :param pdf: the pandas.DataFrame to convert
        :param schema: a StructType whose field types are used to coerce the columns, or a
            list/tuple of column names that are applied to the schema derived from the first
            Arrow batch
        :param timezone: passed through unchanged to ``_create_batch`` for every slice
        :return: a :class:`DataFrame` built from the Arrow batches, with ``_schema`` preset
        :raises ValueError: if ``schema`` is a single non-struct DataType. An empty ``pdf``
            also ends in a ValueError, because the slice step is then 0 and ``xrange`` rejects
            a zero step.
        """
        from pyspark.serializers import ArrowStreamSerializer, _create_batch
        from pyspark.sql.types import from_arrow_schema, to_arrow_type, TimestampType
        from pyspark.sql.utils import require_minimum_pandas_version, \
            require_minimum_pyarrow_version

        # Version checks run before anything from pandas.api.types is imported
        require_minimum_pandas_version()
        require_minimum_pyarrow_version()

        from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype

        # Determine arrow types to coerce data when creating batches
        if isinstance(schema, StructType):
            arrow_types = [to_arrow_type(f.dataType) for f in schema.fields]
        elif isinstance(schema, DataType):
            raise ValueError("Single data type %s is not supported with Arrow" % str(schema))
        else:
            # Any timestamps must be coerced to be compatible with Spark; every other column
            # gets None, i.e. no explicit Arrow type is requested for it
            arrow_types = [to_arrow_type(TimestampType())
                           if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None
                           for t in pdf.dtypes]

        # Slice the DataFrame to be batched: at most defaultParallelism slices of `step` rows
        step = -(-len(pdf) // self.sparkContext.defaultParallelism)  # round int up
        # Use .iloc so rows are always selected by position; plain pdf[a:b] slices by label on
        # some index types (e.g. a float index), which would drop or misplace rows
        pdf_slices = (pdf.iloc[start:start + step] for start in xrange(0, len(pdf), step))

        # Create Arrow record batches, pairing each column with its requested Arrow type
        safecheck = self._wrapped._conf.arrowSafeTypeConversion()
        batches = [_create_batch([(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)],
                                 timezone, safecheck)
                   for pdf_slice in pdf_slices]

        # Create the Spark schema from the first Arrow batch (a non-empty pdf always yields at
        # least 1 batch after slicing), then overwrite the column names with the given ones
        if isinstance(schema, (list, tuple)):
            struct = from_arrow_schema(batches[0].schema)
            for i, name in enumerate(schema):
                struct.fields[i].name = name
                struct.names[i] = name
            schema = struct

        jsqlContext = self._wrapped._jsqlContext

        def reader_func(temp_filename):
            return self._jvm.PythonSQLUtils.readArrowStreamFromFile(jsqlContext, temp_filename)

        def create_RDD_server():
            return self._jvm.ArrowRDDServer(jsqlContext)

        # Create Spark DataFrame from Arrow stream file, using one batch per partition
        jrdd = self._sc._serialize_to_jvm(batches, ArrowStreamSerializer(), reader_func,
                                          create_RDD_server)
        jdf = self._jvm.PythonSQLUtils.toDataFrame(jrdd, schema.json(), jsqlContext)
        df = DataFrame(jdf, self._wrapped)
        # Preset the Python-side schema that was just used to build the JVM DataFrame
        df._schema = schema
        return df
예제 #2
0
    def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
        """
        Create a DataFrame from a given pandas.DataFrame by slicing it into partitions, converting
        to Arrow data, then sending to the JVM to parallelize. If a schema is passed in, the
        data types will be used to coerce the data in Pandas to Arrow conversion.

        :param pdf: the pandas.DataFrame to convert
        :param schema: a StructType whose field types are used to coerce the columns, or a
            list/tuple of column names that are applied to the schema derived from the first
            Arrow batch
        :param timezone: passed through unchanged to ``_create_batch`` for every slice
        :return: a :class:`DataFrame` built from the Arrow batches, with ``_schema`` preset
        :raises ValueError: if ``schema`` is a single non-struct DataType. An empty ``pdf``
            also ends in a ValueError, because the slice step is then 0 and ``xrange`` rejects
            a zero step.
        """
        from pyspark.serializers import ArrowStreamSerializer, _create_batch
        from pyspark.sql.types import from_arrow_schema, to_arrow_type, TimestampType
        from pyspark.sql.utils import require_minimum_pandas_version, \
            require_minimum_pyarrow_version

        # Version checks run before anything from pandas.api.types is imported
        require_minimum_pandas_version()
        require_minimum_pyarrow_version()

        from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype

        # Determine arrow types to coerce data when creating batches
        if isinstance(schema, StructType):
            arrow_types = [to_arrow_type(f.dataType) for f in schema.fields]
        elif isinstance(schema, DataType):
            raise ValueError("Single data type %s is not supported with Arrow" % str(schema))
        else:
            # Any timestamps must be coerced to be compatible with Spark; every other column
            # gets None, i.e. no explicit Arrow type is requested for it
            arrow_types = [to_arrow_type(TimestampType())
                           if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None
                           for t in pdf.dtypes]

        # Slice the DataFrame to be batched: at most defaultParallelism slices of `step` rows
        step = -(-len(pdf) // self.sparkContext.defaultParallelism)  # round int up
        # Use .iloc so rows are always selected by position; plain pdf[a:b] slices by label on
        # some index types (e.g. a float index), which would drop or misplace rows
        pdf_slices = (pdf.iloc[start:start + step] for start in xrange(0, len(pdf), step))

        # Create Arrow record batches, pairing each column with its requested Arrow type
        batches = [_create_batch([(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)],
                                 timezone)
                   for pdf_slice in pdf_slices]

        # Create the Spark schema from the first Arrow batch (a non-empty pdf always yields at
        # least 1 batch after slicing), then overwrite the column names with the given ones
        if isinstance(schema, (list, tuple)):
            struct = from_arrow_schema(batches[0].schema)
            for i, name in enumerate(schema):
                struct.fields[i].name = name
                struct.names[i] = name
            schema = struct

        jsqlContext = self._wrapped._jsqlContext

        def reader_func(temp_filename):
            return self._jvm.PythonSQLUtils.readArrowStreamFromFile(jsqlContext, temp_filename)

        def create_RDD_server():
            return self._jvm.ArrowRDDServer(jsqlContext)

        # Create Spark DataFrame from Arrow stream file, using one batch per partition
        jrdd = self._sc._serialize_to_jvm(batches, ArrowStreamSerializer(), reader_func,
                                          create_RDD_server)
        jdf = self._jvm.PythonSQLUtils.toDataFrame(jrdd, schema.json(), jsqlContext)
        df = DataFrame(jdf, self._wrapped)
        # Preset the Python-side schema that was just used to build the JVM DataFrame
        df._schema = schema
        return df
예제 #3
0
    def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
        """
        Create a DataFrame from a given pandas.DataFrame by slicing it into partitions, converting
        to Arrow data, then sending to the JVM to parallelize. If a schema is passed in, the
        data types will be used to coerce the data in Pandas to Arrow conversion.

        :param pdf: the pandas.DataFrame to convert
        :param schema: a StructType whose field types are used to coerce the columns, or a
            list/tuple of column names that are applied to the schema derived from the first
            Arrow batch
        :param timezone: passed through unchanged to ``_create_batch`` for every slice
        :return: a :class:`DataFrame` built from the Arrow batches, with ``_schema`` preset
        :raises ImportError: if ``pandas.api.types`` cannot be imported; the message comes from
            ``_old_pandas_exception_message``
        :raises ValueError: if ``schema`` is a single non-struct DataType. An empty ``pdf``
            also ends in a ValueError, because the slice step is then 0 and ``xrange`` rejects
            a zero step.
        """
        from pyspark.serializers import ArrowSerializer, _create_batch
        from pyspark.sql.types import from_arrow_schema, to_arrow_type, \
            _old_pandas_exception_message, TimestampType
        try:
            from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
        except ImportError as e:
            # Re-raise with the message produced by the project helper
            raise ImportError(_old_pandas_exception_message(e))

        # Determine arrow types to coerce data when creating batches
        if isinstance(schema, StructType):
            arrow_types = [to_arrow_type(f.dataType) for f in schema.fields]
        elif isinstance(schema, DataType):
            raise ValueError(
                "Single data type %s is not supported with Arrow" %
                str(schema))
        else:
            # Any timestamps must be coerced to be compatible with Spark; every other column
            # gets None, i.e. no explicit Arrow type is requested for it
            arrow_types = [
                to_arrow_type(TimestampType())
                if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None
                for t in pdf.dtypes
            ]

        # Slice the DataFrame to be batched: at most defaultParallelism slices of `step` rows
        step = -(-len(pdf) // self.sparkContext.defaultParallelism)  # round int up
        # Use .iloc so rows are always selected by position; plain pdf[a:b] slices by label on
        # some index types (e.g. a float index), which would drop or misplace rows
        pdf_slices = (pdf.iloc[start:start + step]
                      for start in xrange(0, len(pdf), step))

        # Create Arrow record batches, pairing each column with its requested Arrow type
        batches = [
            _create_batch(
                [(c, t)
                 for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)],
                timezone) for pdf_slice in pdf_slices
        ]

        # Create the Spark schema from the first Arrow batch (a non-empty pdf always yields at
        # least 1 batch after slicing), then overwrite the column names with the given ones
        if isinstance(schema, (list, tuple)):
            struct = from_arrow_schema(batches[0].schema)
            for i, name in enumerate(schema):
                struct.fields[i].name = name
                struct.names[i] = name
            schema = struct

        # Create the Spark DataFrame directly from the Arrow data and schema
        jrdd = self._sc._serialize_to_jvm(batches, len(batches),
                                          ArrowSerializer())
        jdf = self._jvm.PythonSQLUtils.arrowPayloadToDataFrame(
            jrdd, schema.json(), self._wrapped._jsqlContext)
        df = DataFrame(jdf, self._wrapped)
        # Preset the Python-side schema that was just used to build the JVM DataFrame
        df._schema = schema
        return df
예제 #4
0
파일: session.py 프로젝트: aa8y/spark
    def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
        """
        Create a DataFrame from a given pandas.DataFrame by slicing it into partitions, converting
        to Arrow data, then sending to the JVM to parallelize. If a schema is passed in, the
        data types will be used to coerce the data in Pandas to Arrow conversion.

        :param pdf: the pandas.DataFrame to convert
        :param schema: a StructType whose field types are used to coerce the columns, or a
            list/tuple of column names that are applied to the schema derived from the first
            Arrow batch
        :param timezone: passed through unchanged to ``_create_batch`` for every slice
        :return: a :class:`DataFrame` built from the Arrow batches, with ``_schema`` preset
        :raises ImportError: if ``pandas.api.types`` cannot be imported; the message comes from
            ``_old_pandas_exception_message``
        :raises ValueError: if ``schema`` is a single non-struct DataType. An empty ``pdf``
            also ends in a ValueError, because the slice step is then 0 and ``xrange`` rejects
            a zero step.
        """
        from pyspark.serializers import ArrowSerializer, _create_batch
        from pyspark.sql.types import from_arrow_schema, to_arrow_type, \
            _old_pandas_exception_message, TimestampType
        from pyspark.sql.utils import _require_minimum_pyarrow_version
        try:
            from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
        except ImportError as e:
            # Re-raise with the message produced by the project helper
            raise ImportError(_old_pandas_exception_message(e))

        # The pyarrow version check runs only after the pandas import has succeeded
        _require_minimum_pyarrow_version()

        # Determine arrow types to coerce data when creating batches
        if isinstance(schema, StructType):
            arrow_types = [to_arrow_type(f.dataType) for f in schema.fields]
        elif isinstance(schema, DataType):
            raise ValueError("Single data type %s is not supported with Arrow" % str(schema))
        else:
            # Any timestamps must be coerced to be compatible with Spark; every other column
            # gets None, i.e. no explicit Arrow type is requested for it
            arrow_types = [to_arrow_type(TimestampType())
                           if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None
                           for t in pdf.dtypes]

        # Slice the DataFrame to be batched: at most defaultParallelism slices of `step` rows
        step = -(-len(pdf) // self.sparkContext.defaultParallelism)  # round int up
        # Use .iloc so rows are always selected by position; plain pdf[a:b] slices by label on
        # some index types (e.g. a float index), which would drop or misplace rows
        pdf_slices = (pdf.iloc[start:start + step] for start in xrange(0, len(pdf), step))

        # Create Arrow record batches, pairing each column with its requested Arrow type
        batches = [_create_batch([(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)],
                                 timezone)
                   for pdf_slice in pdf_slices]

        # Create the Spark schema from the first Arrow batch (a non-empty pdf always yields at
        # least 1 batch after slicing), then overwrite the column names with the given ones
        if isinstance(schema, (list, tuple)):
            struct = from_arrow_schema(batches[0].schema)
            for i, name in enumerate(schema):
                struct.fields[i].name = name
                struct.names[i] = name
            schema = struct

        # Create the Spark DataFrame directly from the Arrow data and schema
        jrdd = self._sc._serialize_to_jvm(batches, len(batches), ArrowSerializer())
        jdf = self._jvm.PythonSQLUtils.arrowPayloadToDataFrame(
            jrdd, schema.json(), self._wrapped._jsqlContext)
        df = DataFrame(jdf, self._wrapped)
        # Preset the Python-side schema that was just used to build the JVM DataFrame
        df._schema = schema
        return df