Example #1
def create_array(s, t):
    # `pa` is pyarrow; `timezone`, `safecheck`, and
    # `_check_series_convert_timestamps_internal` come from the enclosing
    # PySpark serializer scope.
    mask = s.isnull()
    # Ensure timestamp series are in expected form for Spark internal representation
    # TODO: maybe don't need None check anymore as of Arrow 0.9.1
    if t is not None and pa.types.is_timestamp(t):
        s = _check_series_convert_timestamps_internal(s.fillna(0), timezone)
        # TODO: need cast after Arrow conversion, ns values cause error with pandas 0.19.2
        return pa.Array.from_pandas(s, mask=mask).cast(t, safe=False)
    elif t is not None and pa.types.is_string(t) and sys.version < '3':
        # TODO: need decode before converting to Arrow in Python 2
        # TODO: don't need as of Arrow 0.9.1
        return pa.Array.from_pandas(s.apply(
            lambda v: v.decode("utf-8") if isinstance(v, str) else v), mask=mask, type=t)
    elif t is not None and pa.types.is_decimal(t) and \
            LooseVersion("0.9.0") <= LooseVersion(pa.__version__) < LooseVersion("0.10.0"):
        # TODO: see ARROW-2432. Remove when the minimum PyArrow version becomes 0.10.0.
        return pa.Array.from_pandas(s.apply(
            lambda v: decimal.Decimal('NaN') if v is None else v), mask=mask, type=t)
    elif LooseVersion(pa.__version__) < LooseVersion("0.11.0"):
        # TODO: see ARROW-1949. Remove when the minimum PyArrow version becomes 0.11.0.
        return pa.Array.from_pandas(s, mask=mask, type=t)

    try:
        array = pa.Array.from_pandas(s, mask=mask, type=t, safe=safecheck)
    except pa.ArrowException as e:
        error_msg = "Exception thrown when converting pandas.Series (%s) to Arrow " + \
                    "Array (%s). It can be caused by overflows or other unsafe " + \
                    "conversions warned by Arrow. Arrow safe type check can be " + \
                    "disabled by using SQL config " + \
                    "`spark.sql.execution.pandas.arrowSafeTypeConversion`."
        raise RuntimeError(error_msg % (s.dtype, t), e)
    return array
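The final `safe=safecheck` branch is what raises the `arrowSafeTypeConversion` error above. A minimal standalone sketch of the failure it guards against, assuming only pandas and pyarrow are installed; the sample values are illustrative:

import pandas as pd
import pyarrow as pa

s = pd.Series([1, 2, 300])
try:
    # safe=True rejects the lossy int64 -> int8 conversion (300 overflows)
    pa.Array.from_pandas(s, mask=s.isnull(), type=pa.int8(), safe=True)
except pa.ArrowException as e:
    print("unsafe conversion rejected:", e)

# safe=False performs the cast anyway, truncating the overflowing value
print(pa.Array.from_pandas(s, mask=s.isnull(), type=pa.int8(), safe=False))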
Example #2
def create_array(s, t):
    mask = s.isnull()
    # Ensure timestamp series are in expected form for Spark internal representation
    # TODO: maybe don't need None check anymore as of Arrow 0.9.1
    if t is not None and pa.types.is_timestamp(t):
        s = _check_series_convert_timestamps_internal(s.fillna(0), timezone)
        # TODO: need cast after Arrow conversion, ns values cause error with pandas 0.19.2
        return pa.Array.from_pandas(s, mask=mask).cast(t, safe=False)
    elif t is not None and pa.types.is_string(t) and sys.version < '3':
        # TODO: need decode before converting to Arrow in Python 2
        # TODO: don't need as of Arrow 0.9.1
        return pa.Array.from_pandas(
            s.apply(lambda v: v.decode("utf-8") if isinstance(v, str) else v),
            mask=mask, type=t)
    elif t is not None and pa.types.is_decimal(t) and \
            LooseVersion("0.9.0") <= LooseVersion(pa.__version__) < LooseVersion("0.10.0"):
        # TODO: see ARROW-2432. Remove when the minimum PyArrow version becomes 0.10.0.
        return pa.Array.from_pandas(
            s.apply(lambda v: decimal.Decimal('NaN') if v is None else v),
            mask=mask, type=t)
    elif LooseVersion(pa.__version__) < LooseVersion("0.11.0"):
        # TODO: see ARROW-1949. Remove when the minimum PyArrow version becomes 0.11.0.
        return pa.Array.from_pandas(s, mask=mask, type=t)
    return pa.Array.from_pandas(s, mask=mask, type=t, safe=False)
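The timestamp branch converts with the null mask first and only then casts to the target type with `safe=False`, since the nanosecond values cannot always be represented safely. A self-contained sketch of that mask-and-cast pattern, without the PySpark timezone helper; the fill value and target unit are illustrative:

import pandas as pd
import pyarrow as pa

s = pd.Series(pd.to_datetime(["2018-01-01 12:00:00.123456789", None]))
mask = s.isnull()
# Fill nulls before conversion; the mask restores them in the Arrow array.
arr = pa.Array.from_pandas(s.fillna(pd.Timestamp(0)), mask=mask)
# Cast ns -> us unsafely, dropping sub-microsecond precision.
print(arr.cast(pa.timestamp("us"), safe=False))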
Example #3
def create_array(s, t):
    mask = s.isnull()
    # Ensure timestamp series are in expected form for Spark internal representation
    if t is not None and pa.types.is_timestamp(t):
        s = _check_series_convert_timestamps_internal(s.fillna(0), timezone)
        # TODO: need cast after Arrow conversion, ns values cause error with pandas 0.19.2
        return pa.Array.from_pandas(s, mask=mask).cast(t, safe=False)
    return pa.Array.from_pandas(s, mask=mask, type=t)
Example #4
def create_array(s, t):
    mask = s.isnull()
    # Ensure timestamp series are in expected form for Spark internal representation
    if t is not None and pa.types.is_timestamp(t):
        s = _check_series_convert_timestamps_internal(s.fillna(0), timezone)
        # TODO: need cast after Arrow conversion, ns values cause error with pandas 0.19.2
        return pa.Array.from_pandas(s, mask=mask).cast(t, safe=False)
    elif t is not None and pa.types.is_string(t) and sys.version < '3':
        # TODO: need decode before converting to Arrow in Python 2
        return pa.Array.from_pandas(s.apply(
            lambda v: v.decode("utf-8") if isinstance(v, str) else v), mask=mask, type=t)
    return pa.Array.from_pandas(s, mask=mask, type=t)
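The `sys.version < '3'` branch exists because Python 2's `str` is a byte string that must be decoded to UTF-8 before Arrow can build a string array. A Python 3 analogue of the same pre-decoding step, using `bytes`; the sample values are illustrative:

import pandas as pd
import pyarrow as pa

s = pd.Series([b"spark", None, "arrow"])
# Decode raw bytes to str; leave anything already decoded untouched.
decoded = s.apply(lambda v: v.decode("utf-8") if isinstance(v, bytes) else v)
print(pa.Array.from_pandas(decoded, mask=s.isnull(), type=pa.string()))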
Example #5
def cast_series(s, t):
    if type(t) == pa.TimestampType:
        # NOTE: convert to 'us' with astype here, unit ignored in `from_pandas` see ARROW-1680
        return _check_series_convert_timestamps_internal(s.fillna(0))\
            .values.astype('datetime64[us]', copy=False)
    elif t == pa.date32():
        # TODO: this converts the series to Python objects, possibly avoid with Arrow >= 0.8
        return s.dt.date
    elif t is None or s.dtype == t.to_pandas_dtype():
        return s
    else:
        return s.fillna(0).astype(t.to_pandas_dtype(), copy=False)
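This older `cast_series` variant coerces the pandas Series to the matching NumPy dtype itself rather than letting Arrow cast (note the ARROW-1680 remark above). The fallback branch in isolation, with an illustrative target type:

import pandas as pd
import pyarrow as pa

s = pd.Series([1.5, None, 3.0])
t = pa.float32()
# Nulls are filled first because astype cannot represent NaN in every dtype.
out = s.fillna(0).astype(t.to_pandas_dtype(), copy=False)
print(out.dtype)  # float32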
Example #6
def create_array(s, t):
    # Defined inside a serializer class in the Spark source, hence the
    # references to self._timezone and self._safecheck.
    mask = s.isnull()
    # Ensure timestamp series are in expected form for Spark internal representation
    if t is not None and pa.types.is_timestamp(t):
        s = _check_series_convert_timestamps_internal(s, self._timezone)
    try:
        array = pa.Array.from_pandas(s, mask=mask, type=t, safe=self._safecheck)
    except pa.ArrowException as e:
        error_msg = "Exception thrown when converting pandas.Series (%s) to Arrow " + \
                    "Array (%s). It can be caused by overflows or other unsafe " + \
                    "conversions warned by Arrow. Arrow safe type check can be " + \
                    "disabled by using SQL config " + \
                    "`spark.sql.execution.pandas.arrowSafeTypeConversion`."
        raise RuntimeError(error_msg % (s.dtype, t), e)
    return array
Example #7
def create_array(s, t):
    mask = s.isnull()
    # Ensure timestamp series are in expected form for Spark internal representation
    if t is not None and pa.types.is_timestamp(t):
        s = _check_series_convert_timestamps_internal(s.fillna(0), timezone)
        # TODO: need cast after Arrow conversion, ns values cause error with pandas 0.19.2
        return pa.Array.from_pandas(s, mask=mask).cast(t, safe=False)
    elif t is not None and pa.types.is_string(t) and sys.version < '3':
        # TODO: need decode before converting to Arrow in Python 2
        return pa.Array.from_pandas(s.apply(
            lambda v: v.decode("utf-8") if isinstance(v, str) else v), mask=mask, type=t)
    elif t is not None and pa.types.is_decimal(t) and \
            LooseVersion("0.9.0") <= LooseVersion(pa.__version__) < LooseVersion("0.10.0"):
        # TODO: see ARROW-2432. Remove when the minimum PyArrow version becomes 0.10.0.
        return pa.Array.from_pandas(s.apply(
            lambda v: decimal.Decimal('NaN') if v is None else v), mask=mask, type=t)
    return pa.Array.from_pandas(s, mask=mask, type=t)
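The decimal branch works around ARROW-2432, where PyArrow 0.9.x mishandled `None` in decimal columns; substituting `Decimal('NaN')` lets the mask mark those slots as null. With a current PyArrow the mask alone should suffice, as in this sketch; precision and scale are illustrative:

import decimal
import pandas as pd
import pyarrow as pa

s = pd.Series([decimal.Decimal("1.50"), None, decimal.Decimal("2.25")])
print(pa.Array.from_pandas(s, mask=s.isnull(), type=pa.decimal128(5, 2)))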
Example #8
def create_array(s, t):
    mask = s.isnull()
    # Ensure timestamp series are in expected form for Spark internal representation
    if t is not None and pa.types.is_timestamp(t):
        s = _check_series_convert_timestamps_internal(s.fillna(0), self._timezone)
        # TODO: need cast after Arrow conversion, ns values cause error with pandas 0.19.2
        return pa.Array.from_pandas(s, mask=mask).cast(t, safe=False)

    try:
        array = pa.Array.from_pandas(s, mask=mask, type=t, safe=self._safecheck)
    except pa.ArrowException as e:
        error_msg = "Exception thrown when converting pandas.Series (%s) to Arrow " + \
                    "Array (%s). It can be caused by overflows or other unsafe " + \
                    "conversions warned by Arrow. Arrow safe type check can be " + \
                    "disabled by using SQL config " + \
                    "`spark.sql.execution.pandas.arrowSafeTypeConversion`."
        raise RuntimeError(error_msg % (s.dtype, t), e)
    return array
Example #9
def create_array(s, t):
    mask = s.isnull()
    # Ensure timestamp series are in expected form for Spark internal representation
    # TODO: maybe don't need None check anymore as of Arrow 0.9.1
    if t is not None and pa.types.is_timestamp(t):
        s = _check_series_convert_timestamps_internal(s.fillna(0), self._timezone)
        # TODO: need cast after Arrow conversion, ns values cause error with pandas 0.19.2
        return pa.Array.from_pandas(s, mask=mask).cast(t, safe=False)
    elif t is not None and pa.types.is_string(t) and sys.version < '3':
        # TODO: need decode before converting to Arrow in Python 2
        # TODO: don't need as of Arrow 0.9.1
        return pa.Array.from_pandas(
            s.apply(lambda v: v.decode("utf-8") if isinstance(v, str) else v),
            mask=mask, type=t)
    elif t is not None and pa.types.is_decimal(t) and \
            LooseVersion("0.9.0") <= LooseVersion(pa.__version__) < LooseVersion("0.10.0"):
        # TODO: see ARROW-2432. Remove when the minimum PyArrow version becomes 0.10.0.
        return pa.Array.from_pandas(
            s.apply(lambda v: decimal.Decimal('NaN') if v is None else v),
            mask=mask, type=t)
    elif LooseVersion(pa.__version__) < LooseVersion("0.11.0"):
        # TODO: see ARROW-1949. Remove when the minimum PyArrow version becomes 0.11.0.
        return pa.Array.from_pandas(s, mask=mask, type=t)

    try:
        array = pa.Array.from_pandas(s, mask=mask, type=t, safe=self._safecheck)
    except pa.ArrowException as e:
        error_msg = "Exception thrown when converting pandas.Series (%s) to Arrow " + \
                    "Array (%s). It can be caused by overflows or other unsafe " + \
                    "conversions warned by Arrow. Arrow safe type check can be " + \
                    "disabled by using SQL config " + \
                    "`spark.sql.execution.pandas.arrowSafeTypeConversion`."
        raise RuntimeError(error_msg % (s.dtype, t), e)
    return array
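In the Spark source these `create_array` variants are nested inside an Arrow stream serializer, which assembles the per-column arrays into a record batch. A self-contained sketch of that assembly step; the column data, types, and placeholder names here are illustrative:

import pandas as pd
import pyarrow as pa

cols = [pd.Series([1, 2, 3]), pd.Series(["a", None, "c"])]
types = [pa.int64(), pa.string()]
# Build one Arrow array per column, preserving nulls via the mask.
arrs = [pa.Array.from_pandas(s, mask=s.isnull(), type=t)
        for s, t in zip(cols, types)]
batch = pa.RecordBatch.from_arrays(arrs, ["_%d" % i for i in range(len(arrs))])
print(batch.num_rows, batch.schema)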