def test_reflected_ops_cudf_scalar(funcs, dtype, obj_class):
    """Reflected operators on cudf objects must match the CPU reference."""
    cpu_func, gpu_func = funcs

    # Deterministic random input.
    np.random.seed(12)
    host_data = utils.gen_rand(dtype, 100, low=10)

    # Device-side object, optionally wrapped as an Index.
    dev_obj = Series(host_data)
    if obj_class == "Index":
        dev_obj = as_index(dev_obj)

    dev_result = gpu_func(dev_obj)

    # Unwrap back to a Series for uniform comparison.
    if obj_class == "Index":
        dev_obj = Series(dev_obj)

    # Host-side reference result.
    host_result = cpu_func(host_data)

    np.testing.assert_allclose(host_result, dev_result.to_array())
def _get_column_major(self, df, row_tuple):
    """Select columns of ``df`` whose MultiIndex column labels match
    ``row_tuple``, rebuilding the result's column index to mimic pandas.

    Parameters
    ----------
    df : DataFrame
        The frame whose columns are indexed by this MultiIndex.
    row_tuple : number, slice, or tuple of labels
        The column-label selection.

    Returns
    -------
    DataFrame, or a single column (Series-like) when the selection
    resolves to exactly one column with a fully specified tuple.
    """
    from cudf import Series
    from cudf import DataFrame

    # Map the label tuple onto positional column indices.
    valid_indices = self._get_valid_indices_by_tuple(
        df.columns, row_tuple, len(df._cols)
    )
    result = df._take_columns(valid_indices)

    # Normalize scalar/slice selections to a list for length checks below.
    if isinstance(row_tuple, (numbers.Number, slice)):
        row_tuple = [row_tuple]

    if len(result) == 0 and len(result.columns) == 0:
        # Empty selection: keep the column index's structure but clear
        # its codes/source data so it reports zero entries.
        result_columns = df.columns.copy(deep=False)
        clear_codes = DataFrame()
        for name in df.columns.names:
            clear_codes[name] = Series([])
        result_columns._codes = clear_codes
        result_columns._source_data = clear_codes
        result.columns = result_columns
    elif len(row_tuple) < len(self.levels) and (
        not slice(None) in row_tuple and not isinstance(row_tuple[0], slice)
    ):
        # Partial tuple without slices: drop the matched leading levels.
        columns = self._popn(len(row_tuple))
        result.columns = columns.take(valid_indices)
    else:
        result.columns = self.take(valid_indices)

    if len(result.columns.levels) == 1:
        # Only one level left: decay the MultiIndex columns to a flat Index.
        columns = []
        for code in result.columns.codes[result.columns.codes.columns[0]]:
            columns.append(result.columns.levels[0][code])
        name = result.columns.names[0]
        result.columns = as_index(columns, name=name)

    if len(row_tuple) == len(self.levels) and len(result.columns) == 1:
        # Fully specified tuple hitting one column: return the column itself,
        # matching pandas' downcast-to-Series behavior.
        result = list(result._cols.values())[0]
    return result
def to_pandas(self, **kwargs):
    """Convert this cudf MultiIndex to a ``pandas.MultiIndex``."""
    # Fast path: when a materialized frame of index values exists,
    # pandas can build the MultiIndex directly from it.
    if hasattr(self, "_source_data"):
        result = self._source_data.to_pandas()
        result.columns = self.names
        return pd.MultiIndex.from_frame(result)

    # Slow path: reassemble from levels + codes.
    pandas_codes = []
    for code in self.codes.columns:
        pandas_codes.append(self.codes[code].to_array())

    # We do two things here to mimic Pandas behavior:
    # 1. as_index() on each level, so DatetimeColumn becomes DatetimeIndex
    # 2. convert levels to numpy array so empty levels become Float64Index
    levels = np.array(
        [as_index(level).to_pandas() for level in self.levels]
    )

    # Backwards compatibility:
    # Construct a dummy MultiIndex and check for the codes attr.
    # This indicates that it is pandas >= 0.24
    # If no codes attr is present it is pandas <= 0.23
    if hasattr(pd.MultiIndex([[]], [[]]), "codes"):
        pandas_mi = pd.MultiIndex(levels=levels, codes=pandas_codes)
    else:
        pandas_mi = pd.MultiIndex(levels=levels, labels=pandas_codes)
    if self.names is not None:
        pandas_mi.names = self.names
    return pandas_mi
def categories(self):
    """The categories of this categorical.

    Returns
    -------
    Index
        The category values wrapped as a cudf Index.
    """
    # Local import avoids a circular dependency at module load time.
    from cudf.core.index import as_index as _wrap

    return _wrap(self._column.categories)
def get_level_values(self, level):
    """
    Return the values at the requested level

    Parameters
    ----------
    level : int or label

    Returns
    -------
    An Index containing the values at the requested level.
    """
    colnames = list(self._source_data.columns)
    if level not in colnames:
        if isinstance(level, int):
            # Positional lookup; negative values count from the end,
            # mirroring Python sequence indexing.
            if level < 0:
                level = level + len(colnames)
            if level < 0 or level >= len(colnames):
                raise IndexError(f"Invalid level number: '{level}'")
            level_idx = level
            level = colnames[level_idx]
        elif level in self.names:
            # Name lookup: map the level name to its column position.
            level_idx = list(self.names).index(level)
            level = colnames[level_idx]
        else:
            raise KeyError(f"Level not found: '{level}'")
    else:
        level_idx = colnames.index(level)
    level_values = as_index(
        self._source_data._data[level], name=self.names[level_idx]
    )
    return level_values
def test_categorical_basic():
    """Categorical Series with a string index must mirror pandas."""
    cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"])
    cudf_cat = as_index(cat)

    labels = ["p", "q", "r", "s", "t"]
    pdsr = pd.Series(cat, index=labels)
    sr = Series(cat, index=labels)
    assert_eq(pdsr.cat.codes, sr.cat.codes)

    # Categorical attributes round-trip.
    assert_eq(pdsr.cat.categories, sr.cat.categories)
    assert pdsr.cat.ordered == sr.cat.ordered
    np.testing.assert_array_equal(
        pdsr.cat.codes.values, sr.cat.codes.to_array()
    )

    # The printed representation carries the same tokens.
    expect_str = """
p a
q a
r b
s c
t a
"""
    for got_tok, want_tok in zip(str(sr).split(), expect_str.split()):
        assert got_tok == want_tok

    assert_eq(cat.codes, cudf_cat.codes.to_array())
def test_categorical_basic():
    """Categorical Series with the default index must mirror pandas."""
    cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"])
    cudf_cat = as_index(cat)

    pdsr = pd.Series(cat)
    sr = Series(cat)
    np.testing.assert_array_equal(cat.codes, sr.to_array())

    # Categorical attributes round-trip.
    assert tuple(pdsr.cat.categories) == tuple(sr.cat.categories)
    assert pdsr.cat.ordered == sr.cat.ordered
    np.testing.assert_array_equal(
        pdsr.cat.codes.values, sr.cat.codes.to_array()
    )
    np.testing.assert_array_equal(pdsr.cat.codes.dtype, sr.cat.codes.dtype)

    # The printed representation carries the same tokens.
    expect_str = """
0 a
1 a
2 b
3 c
4 a
"""
    for got_tok, want_tok in zip(str(sr).split(), expect_str.split()):
        assert got_tok == want_tok

    assert_eq(cat.codes, cudf_cat.codes.to_array())
def _getitem_tuple_arg(self, arg):
    """Label-based (row, column) selection for ``.loc[rows, cols]`` when the
    frame may have MultiIndex columns.

    Parameters
    ----------
    arg : tuple
        ``arg[0]`` selects rows by label, ``arg[1]`` selects columns.

    Returns
    -------
    DataFrame, or a downcast Series/scalar when a single row/column remains.
    """
    from cudf.core.dataframe import DataFrame
    from cudf.core.column import column
    from cudf.core.index import as_index
    from cudf.utils.cudautils import arange
    from cudf import MultiIndex

    # Step 1: Gather columns
    if isinstance(self._df.columns, MultiIndex):
        # MultiIndex columns get their own label-resolution path.
        columns_df = self._df.columns._get_column_major(self._df, arg[1])
    else:
        columns = self._get_column_selection(arg[1])
        columns_df = DataFrame()
        for col in columns:
            columns_df.add_column(name=col, data=self._df[col])

    # Step 2: Gather rows
    if isinstance(columns_df.index, MultiIndex):
        # Row selection on a MultiIndex is delegated entirely.
        return columns_df.index._get_row_major(columns_df, arg[0])
    else:
        if isinstance(self._df.columns, MultiIndex):
            if isinstance(arg[0], slice):
                # Convert the slice to explicit positions before taking.
                start, stop, step = arg[0].indices(len(columns_df))
                indices = arange(start, stop, step)
                df = columns_df.take(indices)
            else:
                df = columns_df.take(arg[0])
        else:
            df = DataFrame()
            for col in columns_df.columns:
                df[col] = columns_df[col].loc[arg[0]]

    # Step 3: Gather index
    if df.shape[0] == 1:  # we have a single row
        if isinstance(arg[0], slice):
            start = arg[0].start
            if start is None:
                start = self._df.index[0]
            df.index = as_index(start)
        else:
            row_selection = column.as_column(arg[0])
            if pd.api.types.is_bool_dtype(row_selection.dtype):
                # Boolean mask: keep only the labels where the mask is True.
                df.index = self._df.index.take(row_selection)
            else:
                df.index = as_index(row_selection)

    # Step 4: Downcast
    if self._can_downcast_to_series(df, arg):
        return self._downcast_to_series(df, arg)
    return df
def _index_and_downcast(self, result, index, index_key):
    """Attach the residual (unselected) index levels to ``result`` and
    downcast to a Series/scalar shape where pandas would.

    Parameters
    ----------
    result : DataFrame
        The row-selected data.
    index : MultiIndex
        The index the selection was performed against.
    index_key : number, slice, list, or tuple
        The original key used for the selection; its length determines how
        many leading index levels were consumed.
    """
    if isinstance(index_key, (numbers.Number, slice)):
        index_key = [index_key]
    if (
        len(index_key) > 0 and not isinstance(index_key, tuple)
    ) or isinstance(index_key[0], slice):
        index_key = index_key[0]

    # Remember whether access was via a slice; slices never downcast.
    slice_access = False
    if isinstance(index_key, slice):
        slice_access = True
    out_index = cudf.DataFrame()
    # Select the last n-k columns where n is the number of _source_data
    # columns and k is the length of the indexing tuple
    size = 0
    if not isinstance(index_key, (numbers.Number, slice)):
        size = len(index_key)
    for k in range(size, len(index._source_data.columns)):
        if index.names is None:
            name = k
        else:
            name = index.names[k]
        out_index.insert(
            len(out_index.columns),
            name,
            index._source_data[index._source_data.columns[k]],
        )

    if len(result) == 1 and size == 0 and slice_access is False:
        # If the final result is one row and it was not mapped into
        # directly, return a Series with a tuple as name.
        result = result.T
        result = result[result._data.names[0]]
    elif len(result) == 0 and slice_access is False:
        # Pandas returns an empty Series with a tuple as name
        # the one expected result column
        series_name = []
        for idx, code in enumerate(index._source_data.columns):
            series_name.append(index._source_data[code][0])
        result = cudf.Series([])
        result.name = tuple(series_name)
    elif len(out_index.columns) == 1:
        # If there's only one column remaining in the output index, convert
        # it into an Index and name the final index values according
        # to the _source_data column names
        last_column = index._source_data.columns[-1]
        out_index = index._source_data[last_column]
        out_index = as_index(out_index)
        out_index.name = index.names[len(index.names) - 1]
        index = out_index
    elif len(out_index.columns) > 1:
        # Otherwise pop the leftmost levels, names, and codes from the
        # source index until it has the correct number of columns (n-k)
        # NOTE(review): reset_index(drop=True) without inplace returns a new
        # frame; the return value is discarded here, so this line looks like
        # a no-op — confirm whether `result = result.reset_index(drop=True)`
        # was intended.
        result.reset_index(drop=True)
        index = index._popn(size)

    if isinstance(index_key, tuple):
        result = result.set_index(index)
    return result
def test_series_bitwise_binop(binop, obj_class, lhs_dtype, rhs_dtype):
    """Bitwise binops on device objects must match numpy on the host."""
    host_lhs = (np.random.random(100) * 100).astype(lhs_dtype)
    host_rhs = (np.random.random(100) * 100).astype(rhs_dtype)

    dev_lhs = Series(host_lhs)
    dev_rhs = Series(host_rhs)
    if obj_class == "Index":
        dev_lhs = as_index(dev_lhs)
        dev_rhs = as_index(dev_rhs)

    out = binop(dev_lhs, dev_rhs)
    if obj_class == "Index":
        out = Series(out)

    np.testing.assert_almost_equal(out.to_array(), binop(host_lhs, host_rhs))
def _getitem_tuple_arg(self, arg):
    """Positional (row, column) selection for ``.iloc[rows, cols]``.

    Parameters
    ----------
    arg : tuple
        ``arg[0]`` selects rows by position, ``arg[1]`` selects columns.

    Returns
    -------
    DataFrame, or a downcast Series/scalar when a single row/column remains.
    """
    from cudf import MultiIndex
    from cudf.core.column import column
    from cudf.core.index import as_index

    # Iloc Step 1:
    # Gather the columns specified by the second tuple arg
    columns_df = self._get_column_selection(arg[1])
    columns_df._index = self._df._index

    # Iloc Step 2:
    # Gather the rows specified by the first tuple arg
    if isinstance(columns_df.index, MultiIndex):
        if isinstance(arg[0], slice):
            df = columns_df[arg[0]]
        else:
            df = columns_df.index._get_row_major(columns_df, arg[0])
        if (len(df) == 1 and len(columns_df) >= 1) and not (
            isinstance(arg[0], slice) or isinstance(arg[1], slice)
        ):
            # Pandas returns a numpy scalar in this case
            return df.iloc[0]
        if self._can_downcast_to_series(df, arg):
            return self._downcast_to_series(df, arg)
        return df
    else:
        if isinstance(arg[0], slice):
            df = columns_df._slice(arg[0])
        elif is_scalar(arg[0]):
            # Single position: normalize negatives, then take a 1-row slice.
            index = arg[0]
            if index < 0:
                index += len(columns_df)
            df = columns_df._slice(slice(index, index + 1, 1))
        else:
            arg = (column.as_column(arg[0]), arg[1])
            if pd.api.types.is_bool_dtype(arg[0]):
                df = columns_df._apply_boolean_mask(arg[0])
            else:
                df = columns_df._gather(arg[0])

    # Iloc Step 3:
    # Reindex
    if df.shape[0] == 1:  # we have a single row without an index
        df.index = as_index(self._df.index[arg[0]])

    # Iloc Step 4:
    # Downcast
    if self._can_downcast_to_series(df, arg):
        return self._downcast_to_series(df, arg)

    if df.shape[0] == 0 and df.shape[1] == 0 and isinstance(arg[0], slice):
        # Empty slice result: still produce a RangeIndex covering the
        # requested positions, matching pandas.
        from cudf.core.index import RangeIndex

        slice_len = len(self._df)
        start, stop, step = arg[0].indices(slice_len)
        df._index = RangeIndex(start, stop)
    return df
def test_series_cmpop_mixed_dtype(cmpop, lhs_dtype, rhs_dtype, obj_class):
    """Comparison ops across mixed dtypes must match numpy."""
    count = 5
    host_lhs = (np.random.random(count) * count).astype(lhs_dtype)
    host_rhs = (np.random.random(count) * count).astype(rhs_dtype)

    dev_lhs = Series(host_lhs)
    dev_rhs = Series(host_rhs)
    if obj_class == "Index":
        dev_lhs = as_index(dev_lhs)
        dev_rhs = as_index(dev_rhs)

    # Wrap back in Series so the comparison path is uniform for both classes.
    got = cmpop(Series(dev_lhs), Series(dev_rhs))
    if obj_class == "Index":
        got = Series(got)

    np.testing.assert_array_equal(got.to_array(), cmpop(host_lhs, host_rhs))
def sort_values(self, return_indexer=False, ascending=True, key=None):
    """Return this index sorted by its values.

    Parameters
    ----------
    return_indexer : bool, default False
        Also return the positions that sort the index.
    ascending : bool, default True
        Sort direction.
    key : callable, optional
        Not supported.

    Raises
    ------
    NotImplementedError
        If ``key`` is provided.
    """
    if key is not None:
        raise NotImplementedError("key parameter is not yet implemented.")

    order = self._source_data.argsort(ascending=ascending)
    sorted_index = as_index(self.take(order), name=self.names)

    if not return_indexer:
        return sorted_index
    return sorted_index, cupy.asarray(order)
def test_index_rename():
    """rename() must match pandas, including through re-wrapped indexes."""
    pd_idx = pd.Index([1, 2, 3], name="asdf")
    cu_idx = as_index(pd_idx)

    expect = pd_idx.rename("new_name")
    got = cu_idx.rename("new_name")
    assert_eq(expect, got)

    # Recursive creation: wrapping an index should keep its name...
    pd_wrapped = pd.Index(expect)
    cu_wrapped = as_index(got)
    assert_eq(pd_wrapped, cu_wrapped)

    # ...and an explicit name= should override it.
    pd_renamed = pd.Index(pd_wrapped, name="abc")
    cu_renamed = as_index(cu_wrapped, name="abc")
    assert_eq(pd_renamed, cu_renamed)
def test_series_binop(binop, obj_class):
    """Element-wise binops on device objects must match pandas."""
    count = 1000
    host_a = utils.gen_rand("float64", count) * 10000
    # Keeping a low value because CUDA 'pow' has 2 full range error
    host_b = utils.gen_rand("float64", count) * 10

    dev_a = Series(host_a)
    dev_b = Series(host_b)
    if obj_class == "Index":
        dev_a = as_index(dev_a)
        dev_b = as_index(dev_b)

    got = binop(dev_a, dev_b)
    expect = binop(pd.Series(host_a), pd.Series(host_b))

    if obj_class == "Index":
        got = Series(got)

    utils.assert_eq(got, expect)
def test_series_compare(cmpop, obj_class, dtype):
    """Comparisons of device objects (self-self and cross) must match numpy."""
    host_a = np.random.randint(0, 100, 100).astype(dtype)
    host_b = np.random.randint(0, 100, 100).astype(dtype)

    dev_a = Series(host_a)
    dev_b = Series(host_b)
    if obj_class == "Index":
        dev_a = as_index(dev_a)
        dev_b = as_index(dev_b)

    outs = [cmpop(dev_a, dev_a), cmpop(dev_b, dev_b), cmpop(dev_a, dev_b)]
    if obj_class == "Index":
        outs = [Series(out) for out in outs]

    wants = [cmpop(host_a, host_a), cmpop(host_b, host_b), cmpop(host_a, host_b)]
    for got, want in zip(outs, wants):
        np.testing.assert_equal(got.to_array(), want)
def test_series_binop_scalar(nelem, binop, obj_class):
    """Binops between a device object and a host scalar must match numpy."""
    host = np.random.random(nelem)
    scalar = random.choice(host).item()

    dev = Series(host)
    if obj_class == "Index":
        dev = as_index(dev)

    got = binop(dev, scalar)
    if obj_class == "Index":
        got = Series(got)

    np.testing.assert_almost_equal(got.to_array(), binop(host, scalar))
def test_index_rename_inplace():
    """inplace=False deep-copies the buffer; inplace=True reuses it."""
    src = pd.Index([1, 2, 3], name="asdf")
    gidx = as_index(src)

    # A non-inplace rename must not share the underlying device buffer.
    renamed = gidx.rename("new_name", inplace=False)
    assert renamed._values.data_ptr != gidx._values.data_ptr

    # An inplace rename returns None and keeps the same buffer.
    ptr_before = gidx._values.data_ptr
    gidx.rename("new_name", inplace=True)
    assert ptr_before == gidx._values.data_ptr
def test_index_rename_preserves_arg():
    """Renaming must never mutate the index it was derived from."""
    original = GenericIndex([1, 2, 3], name="orig_name")

    # rename(inplace=False) yields an entirely new object.
    renamed = original.rename("new_name", inplace=False)
    assert renamed.name == "new_name"
    assert original.name == "orig_name"

    # as_index(..., name=...) re-labels while referencing the same data.
    relabeled = as_index(original, name="last_name")
    assert relabeled.name == "last_name"
    assert original.name == "orig_name"
def test_pandas_as_index():
    """as_index() on each pandas index flavor yields the matching cudf type
    and equal contents."""
    cases = [
        (pd.Int64Index([1, 2, 3, 4, 5]), GenericIndex),
        (pd.UInt64Index([1, 2, 3, 4, 5]), GenericIndex),
        (pd.Float64Index([1.0, 2.0, 3.0, 4.0, 5.0]), GenericIndex),
        (
            pd.DatetimeIndex([1000000, 2000000, 3000000, 4000000, 5000000]),
            DatetimeIndex,
        ),
        (pd.CategoricalIndex(["a", "b", "c", "b", "a"]), CategoricalIndex),
    ]

    for pd_index, expected_type in cases:
        cu_index = as_index(pd_index)
        assert isinstance(cu_index, expected_type)
        assert_eq(pd_index, cu_index)

    # Categorical codes must round-trip too.
    pd_cat = cases[-1][0]
    cu_cat = as_index(pd_cat)
    assert_eq(
        pd_cat.codes,
        cu_cat.codes.astype(pd_cat.codes.dtype).to_array(),
    )
def _getitem_tuple_arg(self, arg):
    """Label-based (row, column) selection for ``.loc[rows, cols]``.

    Parameters
    ----------
    arg : tuple
        ``arg[0]`` selects rows by label, ``arg[1]`` selects columns.

    Returns
    -------
    DataFrame, or a downcast Series/scalar when a single row/column remains.
    """
    from cudf.core.dataframe import Series, DataFrame
    from cudf.core.column import column
    from cudf.core.index import as_index
    from cudf import MultiIndex

    # Step 1: Gather columns
    columns_df = self._get_column_selection(arg[1])
    columns_df._index = self._df._index

    # Step 2: Gather rows
    if isinstance(columns_df.index, MultiIndex):
        # MultiIndex rows are resolved by the index itself.
        return columns_df.index._get_row_major(columns_df, arg[0])
    else:
        df = DataFrame()
        for col in columns_df.columns:
            # need Series() in case a scalar is returned
            df[col] = Series(columns_df[col].loc[arg[0]])
        df.columns = columns_df.columns

    # Step 3: Gather index
    if df.shape[0] == 1:  # we have a single row
        if isinstance(arg[0], slice):
            start = arg[0].start
            if start is None:
                start = self._df.index[0]
            df.index = as_index(start)
        else:
            row_selection = column.as_column(arg[0])
            if pd.api.types.is_bool_dtype(row_selection.dtype):
                # Boolean mask: keep only the labels where the mask is True.
                df.index = self._df.index.take(row_selection)
            else:
                df.index = as_index(row_selection)

    # Step 4: Downcast
    if self._can_downcast_to_series(df, arg):
        return self._downcast_to_series(df, arg)
    return df
def test_index_rename_inplace():
    """Deep-copy vs aliasing semantics of rename, observed via buffer swaps."""
    src = pd.Index([1, 2, 3], name="asdf")
    gidx = as_index(src)

    # inplace=False: the rename is a deep copy, so swapping the source's
    # buffer must not be visible through the renamed index.
    deep_copy = gidx.rename("new_name", inplace=False)
    gidx._values.data.mem = GenericIndex([2, 3, 4])._values.data.mem
    assert (deep_copy.values == [1, 2, 3]).all()

    # inplace=True: the rename mutates the same object, so a later buffer
    # swap is visible through any alias.
    alias = gidx
    gidx.rename("new_name", inplace=True)
    gidx._values.data.mem = GenericIndex([3, 4, 5])._values.data.mem
    assert (alias.values == [3, 4, 5]).all()
def test_series_compare_scalar(nelem, cmpop, obj_class, dtype):
    """Comparisons between a device object and a host scalar must match numpy.

    Fix: the ``nelem`` parameter was accepted but ignored — the test array
    was hard-coded to length 100, so the parametrization over ``nelem``
    never varied the input size. Use ``nelem`` for the array length.
    """
    arr1 = np.random.randint(0, 100, nelem).astype(dtype)
    sr1 = Series(arr1)
    rhs = random.choice(arr1).item()

    if obj_class == "Index":
        sr1 = as_index(sr1)

    # Compare in both operand orders.
    result1 = cmpop(sr1, rhs)
    result2 = cmpop(rhs, sr1)

    if obj_class == "Index":
        result1 = Series(result1)
        result2 = Series(result2)

    np.testing.assert_equal(result1.to_array(), cmpop(arr1, rhs))
    np.testing.assert_equal(result2.to_array(), cmpop(rhs, arr1))
def test_string_index():
    """Every way of attaching a string index must round-trip against pandas."""
    from cudf.core.index import StringIndex

    pdf = pd.DataFrame(np.random.rand(5, 5))
    gdf = cudf.from_pandas(pdf)
    labels = ["a", "b", "c", "d", "e"]

    # Plain Python list.
    pdf.index = labels
    gdf.index = labels
    assert_eq(pdf, gdf)

    # Numpy array of strings.
    arr = np.array(labels)
    pdf.index = arr
    gdf.index = arr
    assert_eq(pdf, gdf)

    # Explicit StringIndex.
    sidx = StringIndex(labels, name="name")
    pdf.index = sidx.to_pandas()
    gdf.index = sidx
    assert_eq(pdf, gdf)

    # as_index over a string column.
    sidx = as_index(as_column(labels), name="name")
    pdf.index = sidx.to_pandas()
    gdf.index = sidx
    assert_eq(pdf, gdf)
def _getitem_tuple_arg(self, arg):
    """Label-based selection for ``.loc`` supporting tuple, MultiIndex,
    slice, scalar, boolean-mask and list-of-labels keys.

    Parameters
    ----------
    arg : tuple or single key
        When a tuple, ``arg[0]`` selects rows and ``arg[1]`` columns;
        otherwise ``arg`` selects rows across all columns.

    Returns
    -------
    DataFrame, or a downcast Series/scalar when a single row/column remains.

    Raises
    ------
    KeyError
        If none of the requested labels are present.
    """
    from uuid import uuid4

    from cudf import MultiIndex
    from cudf.core.column import column
    from cudf.core.dataframe import DataFrame
    from cudf.core.index import as_index

    # Step 1: Gather columns
    if isinstance(arg, tuple):
        columns_df = self._get_column_selection(arg[1])
        columns_df._index = self._df._index
    else:
        columns_df = self._df

    # Step 2: Gather rows
    if isinstance(columns_df.index, MultiIndex):
        if isinstance(arg, (MultiIndex, pd.MultiIndex)):
            if isinstance(arg, pd.MultiIndex):
                arg = MultiIndex.from_pandas(arg)
            indices = indices_from_labels(columns_df, arg)
            return columns_df.take(indices)
        else:
            if isinstance(arg, tuple):
                return columns_df.index._get_row_major(columns_df, arg[0])
            else:
                return columns_df.index._get_row_major(columns_df, arg)
    else:
        if isinstance(arg[0], slice):
            out = get_label_range_or_mask(
                columns_df.index, arg[0].start, arg[0].stop, arg[0].step
            )
            if isinstance(out, slice):
                df = columns_df._slice(out)
            else:
                df = columns_df._apply_boolean_mask(out)
        else:
            tmp_arg = arg
            if is_scalar(arg[0]):
                # If a scalar, there is possibility of having duplicates.
                # Join would get all the duplicates. So, coverting it to
                # an array kind.
                tmp_arg = ([tmp_arg[0]], tmp_arg[1])
            if len(tmp_arg[0]) == 0:
                return columns_df._empty_like(keep_index=True)
            tmp_arg = (column.as_column(tmp_arg[0]), tmp_arg[1])
            if pd.api.types.is_bool_dtype(tmp_arg[0]):
                df = columns_df._apply_boolean_mask(tmp_arg[0])
            else:
                # Label list: resolve duplicates via an inner join against a
                # temporary positional column, then restore the requested
                # label order by sorting on that column.
                tmp_col_name = str(uuid4())
                other_df = DataFrame(
                    {tmp_col_name: column.arange(len(tmp_arg[0]))},
                    index=as_index(tmp_arg[0]),
                )
                df = other_df.join(columns_df, how="inner")
                # as join is not assigning any names to index,
                # update it over here
                df.index.name = columns_df.index.name
                df = df.sort_values(tmp_col_name)
                df.drop(columns=[tmp_col_name], inplace=True)
                # There were no indices found
                if len(df) == 0:
                    raise KeyError(arg)

    # Step 3: Gather index
    if df.shape[0] == 1:  # we have a single row
        if isinstance(arg[0], slice):
            start = arg[0].start
            if start is None:
                start = self._df.index[0]
            df.index = as_index(start)
        else:
            row_selection = column.as_column(arg[0])
            if pd.api.types.is_bool_dtype(row_selection.dtype):
                df.index = self._df.index.take(row_selection)
            else:
                df.index = as_index(row_selection)

    # Step 4: Downcast
    if self._can_downcast_to_series(df, arg):
        return self._downcast_to_series(df, arg)
    return df
def categories(self):
    """The categories of the underlying categorical.

    Returns
    -------
    Index
        The category values wrapped as a cudf Index.
    """
    # Local import avoids a circular dependency at module load time.
    from cudf.core.index import as_index as _wrap

    return _wrap(self._parent.categories)
def categories(self):
    """The categories of this categorical column.

    Returns
    -------
    Index
        The category values wrapped as a cudf Index.
    """
    # Local import avoids a circular dependency at module load time.
    from cudf.core.index import as_index as _as_idx

    return _as_idx(self._column.categories)
def __init__(self, levels=None, codes=None, labels=None, names=None, **kwargs):
    """Construct a MultiIndex from levels + codes (or, via the
    ``source_data`` kwarg, directly from a materialized frame).

    Parameters
    ----------
    levels : sequence of sequences, optional
        The unique values of each level.
    codes : DataFrame or sequence of integer sequences, optional
        Positions into ``levels`` for each row; -1 marks a null.
    labels : optional
        Deprecated alias for ``codes``.
    names : sequence, optional
        Names for each level.
    """
    from cudf.core.series import Series

    self.name = None
    self.names = names
    self._source_data = None
    column_names = []
    if labels:
        warnings.warn(
            "the 'labels' keyword is deprecated, use 'codes' " "instead",
            FutureWarning,
        )
    if labels and not codes:
        codes = labels
    # early termination enables lazy evaluation of codes
    if "source_data" in kwargs:
        self._source_data = kwargs["source_data"].reset_index(drop=True)
        self._codes = codes
        self._levels = levels
        return
    # name setup
    if isinstance(
        names,
        (
            Sequence,
            pd.core.indexes.frozen.FrozenNDArray,
            pd.core.indexes.frozen.FrozenList,
        ),
    ):
        # Fall back to positional column names when more than one name
        # is missing.
        if sum(x is None for x in names) > 1:
            column_names = list(range(len(codes)))
        else:
            column_names = names
    elif names is None:
        column_names = list(range(len(codes)))
    else:
        column_names = names
    if len(levels) == 0:
        raise ValueError("Must pass non-zero number of levels/codes")
    from cudf import DataFrame

    if not isinstance(codes, DataFrame) and not isinstance(
        codes[0], (Sequence, pd.core.indexes.frozen.FrozenNDArray)
    ):
        raise TypeError("Codes is not a Sequence of sequences")
    if isinstance(codes, DataFrame):
        self._codes = codes
    elif len(levels) == len(codes):
        # One code column per level, stored as int64.
        self._codes = DataFrame()
        for i, codes in enumerate(codes):
            name = column_names[i] or i
            codes = column.as_column(codes)
            self._codes[name] = codes.astype(np.int64)
    else:
        raise ValueError(
            "MultiIndex has unequal number of levels and "
            "codes and is inconsistent!"
        )
    self._levels = [Series(level) for level in levels]
    self._validate_levels_and_codes(self._levels, self._codes)
    # Materialize the actual label values (codes looked up in levels)
    # into _source_data, one column per level.
    self._source_data = DataFrame()
    for i, name in enumerate(self._codes.columns):
        codes = as_index(self._codes[name]._column)
        if -1 in self._codes[name].values:
            # Must account for null(s) in _source_data column
            level = DataFrame(
                {name: [None] + list(self._levels[i])},
                index=range(-1, len(self._levels[i])),
            )
        else:
            level = DataFrame({name: self._levels[i]})
        level = DataFrame(index=codes).join(level)
        self._source_data[name] = level[name].reset_index(drop=True)
    self.names = [None] * len(self._levels) if names is None else names
def to_datetime(
    arg,
    errors="raise",
    dayfirst=False,
    yearfirst=False,
    utc=None,
    format=None,
    exact=True,
    unit="ns",
    infer_datetime_format=False,
    origin="unix",
    cache=True,
):
    """
    Convert argument to datetime.

    Parameters
    ----------
    arg : int, float, str, datetime, list, tuple, 1-d array, Series
        DataFrame/dict-like
        The object to convert to a datetime.
    errors : {'ignore', 'raise', 'coerce', 'warn'}, default 'raise'
        - If 'raise', then invalid parsing will raise an exception.
        - If 'coerce', then invalid parsing will be set as NaT.
        - If 'warn' : prints last exceptions as warnings and
          return the input.
        - If 'ignore', then invalid parsing will return the input.
    dayfirst : bool, default False
        Specify a date parse order if `arg` is str or its list-likes.
        If True, parses dates with the day first, eg 10/11/12
        is parsed as 2012-11-10.
        Warning: dayfirst=True is not strict, but will prefer to parse
        with day first (this is a known bug, based on dateutil behavior).
    format : str, default None
        The strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse
        all the way up to nanoseconds.
        See strftime documentation for more information on choices:
        https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior.
    unit : str, default 'ns'
        The unit of the arg (D,s,ms,us,ns) denote the unit, which is an
        integer or float number. This will be based off the
        origin(unix epoch start).
        Example, with unit='ms' and origin='unix' (the default), this
        would calculate the number of milliseconds to the unix epoch start.
    infer_datetime_format : bool, default False
        If True and no `format` is given, attempt to infer the format of the
        datetime strings, and if it can be inferred, switch to a faster
        method of parsing them. In some cases this can increase the parsing
        speed by ~5-10x.

    Returns
    -------
    datetime
        If parsing succeeded.
        Return type depends on input:
        - list-like: DatetimeIndex
        - Series: Series of datetime64 dtype
        - scalar: Timestamp

    Examples
    --------
    Assembling a datetime from multiple columns of a DataFrame. The keys can
    be common abbreviations like ['year', 'month', 'day', 'minute', 'second',
    'ms', 'us', 'ns']) or plurals of the same

    >>> import cudf
    >>> df = cudf.DataFrame({'year': [2015, 2016],
    ...                      'month': [2, 3],
    ...                      'day': [4, 5]})
    >>> cudf.to_datetime(df)
    0   2015-02-04
    1   2016-03-05
    dtype: datetime64[ns]
    >>> cudf.to_datetime(1490195805, unit='s')
    numpy.datetime64('2017-03-22T15:16:45.000000000')
    >>> cudf.to_datetime(1490195805433502912, unit='ns')
    numpy.datetime64('1780-11-20T01:02:30.494253056')
    """
    if arg is None:
        return None

    if exact is False:
        raise NotImplementedError("exact support is not yet implemented")

    if origin != "unix":
        raise NotImplementedError("origin support is not yet implemented")

    if yearfirst:
        raise NotImplementedError("yearfirst support is not yet implemented")

    try:
        if isinstance(arg, cudf.DataFrame):
            # we require at least Ymd
            required = ["year", "month", "day"]
            req = list(set(required) - set(arg._data.names))
            if len(req):
                req = ",".join(req)
                raise ValueError(
                    f"to assemble mappings requires at least that "
                    f"[year, month, day] be specified: [{req}] "
                    f"is missing"
                )

            # replace passed column name with values in _unit_map
            unit = {k: get_units(k) for k in arg._data.names}
            unit_rev = {v: k for k, v in unit.items()}

            # keys we don't recognize
            excess = set(unit_rev.keys()) - set(_unit_map.values())
            if len(excess):
                excess = ",".join(excess)
                raise ValueError(
                    f"extra keys have been passed to the "
                    f"datetime assemblage: [{excess}]"
                )

            # Build the date part as a "YYYY-MM-DD" string column.
            new_series = (
                arg[unit_rev["year"]].astype("str")
                + "-"
                + arg[unit_rev["month"]].astype("str").str.zfill(2)
                + "-"
                + arg[unit_rev["day"]].astype("str").str.zfill(2)
            )
            format = "%Y-%m-%d"
            col = new_series._column.as_datetime_column(
                "datetime64[s]", format=format
            )

            # Upgrade to nanosecond resolution if any time component is a
            # float or a non-integer string.
            for u in ["h", "m", "s", "ms", "us", "ns"]:
                value = unit_rev.get(u)
                if value is not None and value in arg:
                    arg_col = arg._data[value]
                    if arg_col.dtype.kind in ("f"):
                        col = new_series._column.as_datetime_column(
                            "datetime64[ns]", format=format
                        )
                        break
                    elif arg_col.dtype.kind in ("O"):
                        if not cpp_is_integer(arg_col).all():
                            col = new_series._column.as_datetime_column(
                                "datetime64[ns]", format=format
                            )
                            break

            # Accumulate the time components, scaled to the column's unit.
            times_column = None
            for u in ["h", "m", "s", "ms", "us", "ns"]:
                value = unit_rev.get(u)
                if value is not None and value in arg:
                    current_col = arg._data[value]
                    # If the arg[value] is of int or
                    # float dtype we don't want to type-cast
                    if current_col.dtype.kind in ("O"):
                        try:
                            current_col = current_col.astype(dtype="int64")
                        except ValueError:
                            current_col = current_col.astype(dtype="float64")

                    factor = as_device_scalar(
                        column.datetime._numpy_to_pandas_conversion[u]
                        / (
                            column.datetime._numpy_to_pandas_conversion["s"]
                            if np.datetime_data(col.dtype)[0] == "s"
                            else 1
                        )
                    )

                    if times_column is None:
                        times_column = current_col * factor
                    else:
                        times_column = times_column + (current_col * factor)
            if times_column is not None:
                col = (col.astype(dtype="int64") + times_column).astype(
                    dtype=col.dtype
                )
            return cudf.Series(col, index=arg.index)
        elif isinstance(arg, cudf.Index):
            col = arg._values
            col = _process_col(
                col=col,
                unit=unit,
                dayfirst=dayfirst,
                infer_datetime_format=infer_datetime_format,
                format=format,
            )
            return as_index(col, name=arg.name)
        elif isinstance(arg, cudf.Series):
            col = arg._column
            col = _process_col(
                col=col,
                unit=unit,
                dayfirst=dayfirst,
                infer_datetime_format=infer_datetime_format,
                format=format,
            )
            return cudf.Series(col, index=arg.index, name=arg.name)
        else:
            col = column.as_column(arg)
            col = _process_col(
                col=col,
                unit=unit,
                dayfirst=dayfirst,
                infer_datetime_format=infer_datetime_format,
                format=format,
            )
            if is_scalar(arg):
                return col[0]
            else:
                return as_index(col)
    except Exception as e:
        # Error policy is driven by the ``errors`` argument; any failure
        # above is caught here and dispatched.
        if errors == "raise":
            raise e
        elif errors == "warn":
            import traceback

            tb = traceback.format_exc()
            warnings.warn(tb)
        elif errors == "ignore":
            pass
        elif errors == "coerce":
            return np.datetime64("nat", "ns" if unit is None else unit)
        return arg
def __new__(
    cls,
    levels=None,
    codes=None,
    sortorder=None,
    labels=None,
    names=None,
    dtype=None,
    copy=False,
    name=None,
    **kwargs,
) -> "MultiIndex":
    """Construct a MultiIndex from levels + codes (or, via the
    ``source_data`` kwarg, directly from a materialized frame).

    Parameters
    ----------
    levels : sequence of sequences, optional
        The unique values of each level.
    codes : cudf.DataFrame or sequence of integer sequences, optional
        Positions into ``levels`` for each row; -1 marks a null.
    sortorder : optional
        Not supported.
    labels : optional
        Deprecated alias for ``codes``.
    names : sequence, optional
        Names for each level.
    copy : bool, default False
        Copy ``codes``/``levels`` instead of referencing them.
    name : optional
        Not supported; use ``names``.
    """
    if sortorder is not None:
        raise NotImplementedError("sortorder is not yet supported")
    if name is not None:
        raise NotImplementedError(
            "Use `names`, `name` is not yet supported"
        )

    out = Frame.__new__(cls)
    super(Index, out).__init__()

    if copy:
        if isinstance(codes, cudf.DataFrame):
            codes = codes.copy()
        if len(levels) > 0 and isinstance(levels[0], cudf.Series):
            levels = [level.copy() for level in levels]

    out._name = None

    column_names = []
    if labels:
        warnings.warn(
            "the 'labels' keyword is deprecated, use 'codes' " "instead",
            FutureWarning,
        )
    if labels and not codes:
        codes = labels

    # early termination enables lazy evaluation of codes
    if "source_data" in kwargs:
        source_data = kwargs["source_data"].copy(deep=False)
        source_data.reset_index(drop=True, inplace=True)

        if isinstance(source_data, pd.DataFrame):
            nan_as_null = kwargs.get("nan_as_null", None)
            source_data = cudf.DataFrame.from_pandas(
                source_data, nan_as_null=nan_as_null
            )
        names = names if names is not None else source_data._data.names
        # if names are unique
        # try using those as the source_data column names:
        if len(dict.fromkeys(names)) == len(names):
            source_data.columns = names
        out._data = source_data._data
        out.names = names
        out._codes = codes
        out._levels = levels
        return out

    # name setup
    if isinstance(names, (Sequence, pd.core.indexes.frozen.FrozenList,),):
        # Fall back to positional column names when more than one name
        # is missing.
        if sum(x is None for x in names) > 1:
            column_names = list(range(len(codes)))
        else:
            column_names = names
    elif names is None:
        column_names = list(range(len(codes)))
    else:
        column_names = names

    if len(levels) == 0:
        raise ValueError("Must pass non-zero number of levels/codes")

    if not isinstance(codes, cudf.DataFrame) and not isinstance(
        codes[0], (Sequence, np.ndarray)
    ):
        raise TypeError("Codes is not a Sequence of sequences")

    if isinstance(codes, cudf.DataFrame):
        out._codes = codes
    elif len(levels) == len(codes):
        # One code column per level, stored as int64.
        out._codes = cudf.DataFrame()
        for i, codes in enumerate(codes):
            name = column_names[i] or i
            codes = column.as_column(codes)
            out._codes[name] = codes.astype(np.int64)
    else:
        raise ValueError(
            "MultiIndex has unequal number of levels and "
            "codes and is inconsistent!"
        )

    out._levels = [cudf.Series(level) for level in levels]
    out._validate_levels_and_codes(out._levels, out._codes)

    # Materialize the actual label values (codes gathered from levels),
    # one column per level.
    source_data = cudf.DataFrame()
    for i, name in enumerate(out._codes.columns):
        codes = as_index(out._codes[name]._column)
        if -1 in out._codes[name].values:
            # Must account for null(s) in _source_data column
            level = cudf.DataFrame(
                {name: [None] + list(out._levels[i])},
                index=range(-1, len(out._levels[i])),
            )
        else:
            level = cudf.DataFrame({name: out._levels[i]})

        source_data[name] = libcudf.copying.gather(
            level, codes._data.columns[0]
        )._data[name]

    out._data = source_data._data
    out.names = names

    return out