Example #1
0
    def normalize_binop_value(self, other):
        """
        Coerce ``other`` into a device scalar suitable for a binary op
        against this column.

        Stdlib and pandas datetime/timedelta scalars are first converted to
        their numpy equivalents; NaT values become null scalars.  Raises
        ``TypeError`` for anything that is not a datetime/timedelta scalar.
        """
        # Step 1: map stdlib / pandas scalar types onto numpy scalars.
        if isinstance(other, dt.datetime):
            other = np.datetime64(other)
        elif isinstance(other, dt.timedelta):
            other = np.timedelta64(other)
        elif isinstance(other, pd.Timestamp):
            other = other.to_datetime64()
        elif isinstance(other, pd.Timedelta):
            other = other.to_timedelta64()

        # Step 2: wrap the numpy scalar as a device scalar.
        if isinstance(other, np.datetime64):
            if np.isnat(other):
                # NaT maps to a null scalar of this column's dtype.
                return as_scalar(val=None, dtype=self.dtype)
            # Align the scalar's resolution with the column's dtype.
            return as_scalar(other.astype(self.dtype))

        if isinstance(other, np.timedelta64):
            unit = cudf.utils.dtypes.get_time_unit(other)
            if unit not in ("s", "ms", "ns", "us"):
                # Unsupported resolutions (e.g. D, h, m) are normalized
                # to seconds before wrapping.
                other = other.astype("timedelta64[s]")
            if np.isnat(other):
                return as_scalar(val=None, dtype=other.dtype)
            return as_scalar(other)

        raise TypeError("cannot normalize {}".format(type(other)))
Example #2
0
    def _binary_op_floordiv(self, rhs):
        lhs, rhs = self, rhs
        if pd.api.types.is_timedelta64_dtype(rhs.dtype):
            common_dtype = determine_out_dtype(self.dtype, rhs.dtype)
            lhs = lhs.astype(common_dtype).astype("float64")

            if isinstance(rhs, Scalar):
                if rhs.is_valid():
                    if isinstance(rhs, Scalar):
                        rhs = np.timedelta64(rhs.value)

                    rhs = rhs.astype(common_dtype).astype("float64")
                else:
                    rhs = as_scalar(None, "float64")
            else:
                rhs = rhs.astype(common_dtype).astype("float64")

            out_dtype = np.dtype("int64")
        elif rhs.dtype.kind in ("f", "i", "u"):
            out_dtype = self.dtype
        else:
            raise TypeError(
                f"Floor Division of {self.dtype} with {rhs.dtype} "
                f"cannot be performed."
            )

        return lhs, rhs, out_dtype
Example #3
0
def _process_col(col, unit, dayfirst, infer_datetime_format, format):
    """
    Convert a raw column to a datetime64 column, rescaling by ``unit``
    where necessary.

    Datetime columns pass through untouched.  Float columns are rescaled
    to nanoseconds; integer columns with coarse units (D/h/m) are rescaled
    to seconds; object (string) columns are either parsed numerically and
    re-processed, or parsed against an inferred/explicit strftime format.
    """
    # Already a datetime column — nothing to do.
    if col.dtype.kind == "M":
        return col

    if col.dtype.kind == "f":
        if unit not in (None, "ns"):
            # Rescale fractional epoch values into nanoseconds.
            scale = as_scalar(
                column.datetime._numpy_to_pandas_conversion[unit]
            )
            col = col.binary_operator(binop="mul", rhs=scale)
        col = col.as_datetime_column(dtype="datetime64[ns]")
    if col.dtype.kind == "i":
        if unit in ("D", "h", "m"):
            # Coarse units are first rescaled to seconds.
            scale = as_scalar(
                column.datetime._numpy_to_pandas_conversion[unit]
                / column.datetime._numpy_to_pandas_conversion["s"]
            )
            col = col.binary_operator(binop="mul", rhs=scale)

        col = col.as_datetime_column(dtype=_unit_dtype_map[unit])
    elif col.dtype.kind == "O":
        if unit not in (None, "ns"):
            # Numeric strings with an explicit unit: parse them as numbers
            # and re-enter with a numeric dtype.
            try:
                col = col.astype(dtype="int64")
            except ValueError:
                col = col.astype(dtype="float64")
            return _process_col(
                col=col,
                unit=unit,
                dayfirst=dayfirst,
                infer_datetime_format=infer_datetime_format,
                format=format,
            )
        if format is None:
            # No explicit format: infer one from the first element.
            if infer_datetime_format:
                format = column.datetime.infer_format(
                    element=col[0], dayfirst=dayfirst,
                )
            else:
                format = column.datetime.infer_format(element=col[0])
        col = col.as_datetime_column(
            dtype=_unit_dtype_map[unit], format=format,
        )
    return col
Example #4
0
    def quantile(self, q, interpolation, exact):
        """
        Compute quantile(s) of this datetime column via its integer view.

        For a scalar ``q`` a host ``pd.Timestamp`` is returned; otherwise
        the numeric result is rescaled to nanoseconds and cast back to a
        datetime64[ns] column.
        """
        numeric = self.as_numerical.quantile(
            q=q, interpolation=interpolation, exact=exact
        )
        if isinstance(q, Number):
            # Scalar quantile: interpret the integer result in this
            # column's native time unit.
            return pd.Timestamp(numeric, unit=self.time_unit)

        # Vector quantile: scale from the column's time unit to ns.
        scale = as_scalar(_numpy_to_pandas_conversion[self.time_unit])
        return numeric.binary_operator("mul", scale).astype("datetime64[ns]")
Example #5
0
    def seconds(self):
        """
        Number of seconds (>= 0 and less than 1 day).

        Returns
        -------
        NumericalColumn
        """
        # Strip the whole days with a mod by one day, then count whole
        # seconds in the remainder with a floordiv by one second.
        one_day = as_scalar(
            np.timedelta64(_numpy_to_pandas_conversion["D"], "ns")
        )
        one_second = as_scalar(
            np.timedelta64(_numpy_to_pandas_conversion["s"], "ns")
        )
        remainder = self.binary_operator("mod", one_day)
        return remainder.binary_operator("floordiv", one_second)
Example #6
0
    def nanoseconds(self):
        """
        Return the number of nanoseconds (n), where 0 <= n < 1 microsecond.

        Returns
        -------
        NumericalColumn
        """
        # Strip whole microseconds with a mod by one microsecond, then
        # count whole nanoseconds in the remainder with a floordiv.
        one_microsecond = as_scalar(
            np.timedelta64(_numpy_to_pandas_conversion["us"], "ns")
        )
        one_nanosecond = as_scalar(
            np.timedelta64(_numpy_to_pandas_conversion["ns"], "ns")
        )
        remainder = self.binary_operator("mod", one_microsecond)
        return remainder.binary_operator("floordiv", one_nanosecond)
Example #7
0
    def days(self):
        """
        Number of days for each element.

        Returns
        -------
        NumericalColumn
        """
        # A floordiv by one day (expressed in nanoseconds) yields the
        # whole-day count for each element.
        one_day = as_scalar(
            np.timedelta64(_numpy_to_pandas_conversion["D"], "ns")
        )
        return self.binary_operator("floordiv", one_day)
Example #8
0
def to_datetime(
    arg,
    errors="raise",
    dayfirst=False,
    yearfirst=False,
    utc=None,
    format=None,
    exact=True,
    unit="ns",
    infer_datetime_format=False,
    origin="unix",
    cache=True,
):
    """
    Convert argument to datetime.

    Parameters
    ----------
    arg : int, float, str, datetime, list, tuple, 1-d array,
        Series DataFrame/dict-like
        The object to convert to a datetime.
    errors : {'ignore', 'raise', 'coerce', 'warn'}, default 'raise'
        - If 'raise', then invalid parsing will raise an exception.
        - If 'coerce', then invalid parsing will be set as NaT.
        - If 'warn' : prints last exceptions as warnings and
            return the input.
        - If 'ignore', then invalid parsing will return the input.
    dayfirst : bool, default False
        Specify a date parse order if `arg` is str or its list-likes.
        If True, parses dates with the day first, eg 10/11/12 is parsed as
        2012-11-10.
        Warning: dayfirst=True is not strict, but will prefer to parse
        with day first (this is a known bug, based on dateutil behavior).
    format : str, default None
        The strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse
        all the way up to nanoseconds.
        See strftime documentation for more information on choices:
        https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior.
    unit : str, default 'ns'
        The unit of the arg (D,s,ms,us,ns) denote the unit, which is an
        integer or float number. This will be based off the
        origin(unix epoch start).
        Example, with unit='ms' and origin='unix' (the default), this
        would calculate the number of milliseconds to the unix epoch start.
    infer_datetime_format : bool, default False
        If True and no `format` is given, attempt to infer the format of the
        datetime strings, and if it can be inferred, switch to a faster
        method of parsing them. In some cases this can increase the parsing
        speed by ~5-10x.

    Returns
    -------
    datetime
        If parsing succeeded.
        Return type depends on input:
        - list-like: DatetimeIndex
        - Series: Series of datetime64 dtype
        - scalar: Timestamp

    Examples
    --------
    Assembling a datetime from multiple columns of a DataFrame. The keys can be
    common abbreviations like ['year', 'month', 'day', 'minute', 'second',
    'ms', 'us', 'ns']) or plurals of the same
    >>> df = cudf.DataFrame({'year': [2015, 2016],
    ...                    'month': [2, 3],
    ...                    'day': [4, 5]})
    >>> cudf.to_datetime(df)
    0   2015-02-04
    1   2016-03-05
    dtype: datetime64[ns]
    >>> cudf.to_datetime(1490195805, unit='s')
    numpy.datetime64('2017-03-22T15:16:45.000000000')
    >>> cudf.to_datetime(1490195805433502912, unit='ns')
    numpy.datetime64('1780-11-20T01:02:30.494253056')
    """
    # pandas compat: None passes straight through.
    if arg is None:
        return None

    # Options accepted for API compatibility but not yet supported.
    if exact is False:
        raise NotImplementedError("exact support is not yet implemented")

    if origin != "unix":
        raise NotImplementedError("origin support is not yet implemented")

    if yearfirst:
        raise NotImplementedError("yearfirst support is not yet implemented")

    # Everything below runs under one try so the `errors` policy can be
    # applied uniformly in the except clause at the bottom.
    try:
        if isinstance(arg, cudf.DataFrame):
            # DataFrame input: assemble a datetime from per-unit columns.
            # we require at least Ymd
            required = ["year", "month", "day"]
            req = list(set(required) - set(arg._data.names))
            if len(req):
                req = ",".join(req)
                raise ValueError(
                    "to assemble mappings requires at least that "
                    f"[year, month, day] be specified: [{req}] "
                    "is missing"
                )

            # replace passed column name with values in _unit_map
            unit = {k: get_units(k) for k in arg._data.names}
            unit_rev = {v: k for k, v in unit.items()}

            # keys we don't recognize
            excess = set(unit_rev.keys()) - set(_unit_map.values())
            if len(excess):
                excess = ",".join(excess)
                raise ValueError(
                    f"extra keys have been passed to the \
                        datetime assemblage: [{excess}]"
                )

            # Build a "YYYY-MM-DD" string column from the required columns
            # and parse it at second resolution; sub-day components are
            # added numerically afterwards.
            new_series = (
                arg[unit_rev["year"]].astype("str")
                + "-"
                + arg[unit_rev["month"]].astype("str").str.zfill(2)
                + "-"
                + arg[unit_rev["day"]].astype("str").str.zfill(2)
            )
            format = "%Y-%m-%d"
            col = new_series._column.as_datetime_column(
                "datetime64[s]", format=format
            )

            # Upgrade the base column to nanosecond resolution if any
            # sub-day component is a float column or a string column that
            # is not purely integral, so fractional parts are not lost.
            for u in ["h", "m", "s", "ms", "us", "ns"]:
                value = unit_rev.get(u)
                if value is not None and value in arg:
                    arg_col = arg._data[value]
                    if arg_col.dtype.kind in ("f"):
                        col = new_series._column.as_datetime_column(
                            "datetime64[ns]", format=format
                        )
                        break
                    elif arg_col.dtype.kind in ("O"):
                        if not cpp_is_integer(arg_col).all():
                            col = new_series._column.as_datetime_column(
                                "datetime64[ns]", format=format
                            )
                            break

            # Accumulate all sub-day components into a single offset
            # column expressed in `col`'s resolution.
            times_column = None
            for u in ["h", "m", "s", "ms", "us", "ns"]:
                value = unit_rev.get(u)
                if value is not None and value in arg:
                    current_col = arg._data[value]
                    # If the arg[value] is of int or
                    # float dtype we don't want to type-cast
                    if current_col.dtype.kind in ("O"):
                        try:
                            current_col = current_col.astype(dtype="int64")
                        except ValueError:
                            current_col = current_col.astype(dtype="float64")

                    # Conversion factor from unit `u` into `col`'s
                    # resolution (seconds or nanoseconds).
                    factor = as_scalar(
                        column.datetime._numpy_to_pandas_conversion[u]
                        / (
                            column.datetime._numpy_to_pandas_conversion["s"]
                            if np.datetime_data(col.dtype)[0] == "s"
                            else 1
                        )
                    )

                    if times_column is None:
                        times_column = current_col.binary_operator(
                            binop="mul", rhs=factor
                        )
                    else:
                        times_column = times_column.binary_operator(
                            binop="add",
                            rhs=current_col.binary_operator(
                                binop="mul", rhs=factor
                            ),
                        )
            if times_column is not None:
                # Add the accumulated offset using integer arithmetic and
                # cast back to the datetime dtype.
                col = (
                    col.astype(dtype="int64")
                    .binary_operator(binop="add", rhs=times_column)
                    .astype(dtype=col.dtype)
                )
            return cudf.Series(col, index=arg.index)
        elif isinstance(arg, cudf.Index):
            # Index input: convert the underlying column and rebuild.
            col = arg._values
            col = _process_col(
                col=col,
                unit=unit,
                dayfirst=dayfirst,
                infer_datetime_format=infer_datetime_format,
                format=format,
            )
            return as_index(col, name=arg.name)
        elif isinstance(arg, cudf.Series):
            # Series input: convert the column, preserving index and name.
            col = arg._column
            col = _process_col(
                col=col,
                unit=unit,
                dayfirst=dayfirst,
                infer_datetime_format=infer_datetime_format,
                format=format,
            )
            return cudf.Series(col, index=arg.index, name=arg.name)
        else:
            # Scalar or array-like input: materialize as a column first.
            col = column.as_column(arg)
            col = _process_col(
                col=col,
                unit=unit,
                dayfirst=dayfirst,
                infer_datetime_format=infer_datetime_format,
                format=format,
            )

            if is_scalar(arg):
                return col[0]
            else:
                return as_index(col)
    except Exception as e:
        # Apply the `errors` policy: raise, warn (with traceback), ignore,
        # or coerce to NaT.
        if errors == "raise":
            raise e
        elif errors == "warn":
            import traceback

            tb = traceback.format_exc()
            warnings.warn(tb)
        elif errors == "ignore":
            pass
        elif errors == "coerce":
            return np.datetime64("nat", "ns" if unit is None else unit)
        # 'warn' and 'ignore' both fall through to returning the input.
        return arg
Example #9
0
    def components(self, index=None):
        """
        Return a Dataframe of the components of the Timedeltas.

        Returns
        -------
        DataFrame

        Examples
        --------
        >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit='s'))
        >>> s = cudf.Series([12231312123, 1231231231, 1123236768712, 2135656,
        ...     3244334234], dtype='timedelta64[ms]')
        >>> s
        0      141 days 13:35:12.123
        1       14 days 06:00:31.231
        2    13000 days 10:12:48.712
        3        0 days 00:35:35.656
        4       37 days 13:12:14.234
        dtype: timedelta64[ms]
        >>> s.dt.components
            days  hours  minutes  seconds  milliseconds  microseconds  nanoseconds
        0    141     13       35       12           123             0            0
        1     14      6        0       31           231             0            0
        2  13000     10       12       48           712             0            0
        3      0      0       35       35           656             0            0
        4     37     13       12       14           234             0            0
        """  # noqa: E501
        # Refactor: the original spelled out seven near-identical
        # mod/floordiv chains.  Each component is computed the same way:
        # strip everything at or above the next-larger unit with a ``mod``
        # (days have no larger unit, so no mod), then count whole units
        # with a ``floordiv``.  The table pairs each output column with
        # (unit, next-larger unit).
        plan = [
            ("days", "D", None),
            ("hours", "h", "D"),
            ("minutes", "m", "h"),
            ("seconds", "s", "m"),
            ("milliseconds", "ms", "s"),
            ("microseconds", "us", "ms"),
            ("nanoseconds", "ns", "us"),
        ]

        def _one(unit):
            # Scalar timedelta of one `unit`, expressed in nanoseconds.
            return as_scalar(
                np.timedelta64(_numpy_to_pandas_conversion[unit], "ns")
            )

        data = {}
        for name, unit, larger in plan:
            col = self
            if larger is not None:
                col = col.binary_operator("mod", _one(larger))
            data[name] = col.binary_operator("floordiv", _one(unit))

        return cudf.DataFrame(data=data, index=index)
Example #10
0
def _process_col(col, unit, dayfirst, infer_datetime_format, format):
    """
    Convert a raw column to a datetime64 column, honoring ``unit`` and
    ``format``.

    Datetime columns pass through; timedelta columns raise ``TypeError``.
    Float and integer columns are rescaled by ``unit`` and cast, parsing
    via strings when a ``format`` is given; object (string) columns are
    either parsed numerically and re-processed, or parsed against an
    inferred/explicit strftime format.
    """
    if col.dtype.kind == "M":
        # Already a datetime column — nothing to do.
        return col
    elif col.dtype.kind == "m":
        # Timedeltas are never convertible to datetimes.
        raise TypeError(
            f"dtype {col.dtype} cannot be converted to {_unit_dtype_map[unit]}"
        )

    if col.dtype.kind == "f":
        if unit not in (None, "ns"):
            # Rescale fractional epoch values into nanoseconds.
            scale = as_scalar(
                column.datetime._numpy_to_pandas_conversion[unit])
            col = col.binary_operator(binop="mul", rhs=scale)

        if format is None:
            col = col.as_datetime_column(dtype="datetime64[ns]")
        else:
            # Converting to int because,
            # pandas actually creates a datetime column
            # out of float values and then creates an
            # int column out of it to parse against `format`.
            # Instead we directly cast to int and perform
            # parsing against `format`.
            target = "datetime64[us]" if "%f" in format else "datetime64[s]"
            col = (
                col.astype("int")
                .astype("str")
                .as_datetime_column(dtype=target, format=format)
            )

    if col.dtype.kind == "i":
        if unit in ("D", "h", "m"):
            # Coarse units are first rescaled to seconds.
            scale = as_scalar(
                column.datetime._numpy_to_pandas_conversion[unit] /
                column.datetime._numpy_to_pandas_conversion["s"])
            col = col.binary_operator(binop="mul", rhs=scale)

        if format is None:
            col = col.as_datetime_column(dtype=_unit_dtype_map[unit])
        else:
            # With an explicit format, parse via the string representation.
            col = col.astype("str").as_datetime_column(
                dtype=_unit_dtype_map[unit], format=format)

    elif col.dtype.kind == "O":
        if unit not in (None, "ns"):
            # Numeric strings with an explicit unit: parse them as numbers
            # and re-enter with a numeric dtype.
            try:
                col = col.astype(dtype="int64")
            except ValueError:
                col = col.astype(dtype="float64")
            return _process_col(
                col=col,
                unit=unit,
                dayfirst=dayfirst,
                infer_datetime_format=infer_datetime_format,
                format=format,
            )
        if format is None:
            # No explicit format: infer one from the first element.
            if infer_datetime_format:
                format = column.datetime.infer_format(
                    element=col[0],
                    dayfirst=dayfirst,
                )
            else:
                format = column.datetime.infer_format(element=col[0])
        col = col.as_datetime_column(
            dtype=_unit_dtype_map[unit],
            format=format,
        )
    return col