def _getitem_tuple_arg(self, arg):
    """Select rows and columns of ``self._df`` for a ``(rows, columns)`` key.

    Parameters
    ----------
    arg : tuple
        ``arg[0]`` is the row selector, applied to every selected column
        through that column's ``.loc``; ``arg[1]`` is the column selector,
        resolved by ``self._get_column_selection``.

    Returns
    -------
    DataFrame
        A new frame with one column per selected column name.
    """
    from cudf.dataframe.dataframe import DataFrame
    from cudf.dataframe.index import as_index

    columns = self._get_column_selection(arg[1])
    df = DataFrame()
    for col in columns:
        df.add_column(name=col, data=self._df[col].loc[arg[0]])

    if df.shape[0] == 1:
        # Single-row result: set its index explicitly from the row key.
        if isinstance(arg[0], slice):
            start = arg[0].start
            if start is None:
                # An open-ended slice has no start label to reuse; fall
                # back to the first label of the source frame instead of
                # handing ``None`` to ``as_index``.
                start = self._df.index[0]
            df.index = as_index(start)
        else:
            df.index = as_index(arg[0])
    return df
def apply_multiindex_or_single_index(self, result):
    """Turn the group-key columns of ``result`` into its index.

    Parameters
    ----------
    result : DataFrame
        Frame containing the key columns named in ``self._by`` alongside
        the remaining (value) columns.

    Returns
    -------
    DataFrame or Series
        The value columns indexed by the key(s). With several keys (or an
        empty ``result``), a Series is returned instead when exactly one
        value column remains and ``self`` has a ``_gotattr`` attribute.
    """

    def _value_columns(frame):
        # Copy every column that is not a group key into a fresh frame.
        values = DataFrame()
        for label in frame.columns:
            if label not in self._by:
                values[label] = frame[label]
        return values

    # ---- empty result -----------------------------------------------
    if len(result) == 0:
        empty_frame = _value_columns(result)
        if len(self._by) == 1 or len(empty_frame.columns) == 0:
            if len(self._by) == 1:
                index_dtype = 'float64'
                index_name = self._by[0]
            else:
                index_dtype = 'object'
                index_name = None
            from cudf.dataframe.index import GenericIndex
            empty_index = GenericIndex(Series([], dtype=index_dtype))
            empty_index.name = index_name
            empty_frame.index = empty_index
        else:
            empty_multi = MultiIndex(source_data=result[self._by])
            empty_multi.names = self._by
            empty_frame.index = empty_multi

        if len(empty_frame.columns) == 1 and hasattr(self, "_gotattr"):
            empty_series = Series([], name=empty_frame.columns[0])
            empty_series.index = empty_frame.index
            return empty_series
        return empty_frame

    # ---- exactly one key: plain index -------------------------------
    if len(self._by) == 1:
        from cudf.dataframe import index
        key_index = index.as_index(result[self._by[0]])
        key_index.name = self._by[0]
        result = result.drop(key_index.name)
        if key_index.name == self._LEVEL_0_INDEX_NAME:
            # The placeholder name stands in for the original index name.
            key_index.name = self._original_index_name
        return result.set_index(key_index)

    # ---- several keys: MultiIndex -----------------------------------
    key_multi = MultiIndex(source_data=result[self._by])
    value_frame = _value_columns(result)
    if len(value_frame.columns) == 1 and hasattr(self, "_gotattr"):
        only_label = value_frame.columns[0]
        value_series = Series(value_frame[only_label])
        value_series.name = value_frame.columns[0]
        value_series.index = key_multi
        return value_series
    return value_frame.set_index(key_multi)
def _getitem_tuple_arg(self, arg):
    """Resolve a label-based ``(rows, columns)`` key against ``self._df``.

    Parameters
    ----------
    arg : tuple
        ``arg[0]`` selects rows and ``arg[1]`` selects columns.

    Returns
    -------
    object
        Whatever ``_get_row_major`` returns when the gathered columns
        carry a MultiIndex row index; otherwise the selected DataFrame,
        or the result of ``self._downcast_to_series`` when
        ``self._can_downcast_to_series`` approves.
    """
    from cudf.dataframe.dataframe import DataFrame
    from cudf.dataframe.index import as_index
    from cudf.utils.cudautils import arange
    from cudf import MultiIndex

    row_key = arg[0]
    multi_columns = isinstance(self._df.columns, MultiIndex)

    # Step 1: gather the requested columns
    if multi_columns:
        picked = self._df.columns._get_column_major(self._df, arg[1])
    else:
        picked = DataFrame()
        for name in self._get_column_selection(arg[1]):
            picked.add_column(name=name, data=self._df[name])

    # Step 2: gather the requested rows
    if isinstance(picked.index, MultiIndex):
        return picked.index._get_row_major(picked, row_key)

    if multi_columns:
        if isinstance(row_key, slice):
            # Expand the slice into explicit positions before taking.
            positions = arange(*row_key.indices(len(picked)))
            df = picked.take(positions)
        else:
            df = picked.take(row_key)
    else:
        df = DataFrame()
        for name in picked.columns:
            df[name] = picked[name].loc[row_key]

    # Step 3: a single-row result gets its index set from the row key
    if df.shape[0] == 1:
        if isinstance(row_key, slice):
            label = row_key.start
            if label is None:
                label = self._df.index[0]
        else:
            label = row_key
        df.index = as_index(label)

    # Step 4: downcast when allowed
    if not self._can_downcast_to_series(df, arg):
        return df
    return self._downcast_to_series(df, arg)
def _getitem_tuple_arg(self, arg):
    """Resolve a positional ``(rows, columns)`` key against ``self._df``.

    Parameters
    ----------
    arg : tuple
        ``arg[0]`` selects rows (applied through each column's ``.iloc``)
        and ``arg[1]`` selects columns.

    Returns
    -------
    object
        A DataFrame, a Series, or a single element, depending on the key
        types and on ``self._can_downcast_to_series``.
    """
    from cudf import MultiIndex
    from cudf.dataframe.dataframe import DataFrame
    from cudf.dataframe.dataframe import Series
    from cudf.dataframe.index import as_index

    # Iloc Step 1:
    # Collect the columns named by the second element of the key
    selected = self._get_column_selection(arg[1])
    if isinstance(self._df.columns, MultiIndex):
        col_frame = self._df.columns._get_column_major(self._df, arg[1])
        if (
            len(col_frame) == 0
            and len(col_frame.columns) == 0
            and not isinstance(arg[0], slice)
        ):
            empty = Series([], name=arg[0])
            empty._index = col_frame.columns.copy(deep=False)
            return empty
    elif isinstance(arg[0], slice):
        col_frame = DataFrame()
        for label in selected:
            col_frame.add_column(name=label, data=self._df[label])
        col_frame._index = self._df._index
    else:
        col_frame = self._df._columns_view(selected)

    # Iloc Step 2:
    # Collect the rows named by the first element of the key
    if isinstance(col_frame.index, MultiIndex):
        df = col_frame.index._get_row_major(col_frame, arg[0])
        if (
            len(df) == 1
            and len(col_frame) >= 1
            and not isinstance(arg[0], slice)
            and not isinstance(arg[1], slice)
        ):
            # Pandas returns a numpy scalar in this case
            return df[0]
        if self._can_downcast_to_series(df, arg):
            return self._downcast_to_series(df, arg)
        return df

    df = DataFrame()
    for key, column in col_frame._cols.items():
        df[key] = column.iloc[arg[0]]
    df.columns = col_frame.columns

    # Iloc Step 3:
    # Reindex a single-row result, which comes back without an index
    if df.shape[0] == 1:
        if isinstance(arg[0], slice):
            position = arg[0].start
            if position is None:
                position = 0
        else:
            position = arg[0]
        df.index = as_index(self._df.index[position])

    # Iloc Step 4:
    # Downcast
    if self._can_downcast_to_series(df, arg):
        if not isinstance(df.columns, MultiIndex):
            return self._downcast_to_series(df, arg)

        if len(df) > 0 and not (
            isinstance(arg[0], slice) or isinstance(arg[1], slice)
        ):
            # Neither key is a slice: hand back the first element of
            # the first column.
            return list(df._cols.values())[0][0]
        if df.shape[1] > 1:
            downcast = self._downcast_to_series(df, arg)
            downcast.index = df.columns
            return downcast

        first_column = list(df._cols.values())[0]
        if not isinstance(arg[0], slice):
            first_column.index = df.columns
            first_column.name = arg[0]
        return first_column

    if df.shape[0] == 0 and df.shape[1] == 0:
        from cudf.dataframe.index import RangeIndex

        # Completely empty result: derive its index from the row slice.
        bound = arg[0].stop or len(self._df)
        start, stop, step = arg[0].indices(bound)
        df._index = RangeIndex(start, stop)
    return df
def apply_multiindex_or_single_index(self, result):
    """Turn the group-key columns of ``result`` into its index.

    Names of the form ``'cudfvalcol+<name>'`` are reduced to ``<name>``;
    any other string name is reduced to the text before its first ``'+'``.

    Parameters
    ----------
    result : DataFrame
        Frame containing the key columns named in ``self._by`` alongside
        the remaining (value) columns.

    Returns
    -------
    DataFrame
        ``result`` re-indexed by its key(s). With several keys, the key
        columns are left out of the returned frame unless no other
        column remains.

    Notes
    -----
    With several keys and a non-empty ``result``, ``self._by`` is
    replaced by the list of prefix-stripped key names.
    """

    def _unprefix_columns(frame):
        # Rename every 'cudfvalcol+<name>' column to '<name>'.
        for col in frame.columns:
            if isinstance(col, str):
                parts = col.split('+')
                if parts[0] == 'cudfvalcol':
                    frame[parts[1]] = frame[col]
                    frame = frame.drop(col)
        return frame

    if len(result) == 0:
        final_result = DataFrame()
        for col in result.columns:
            if col not in self._by:
                final_result[col] = result[col]
        if len(self._by) == 1 or len(final_result.columns) == 0:
            if len(self._by) == 1:
                # Pass the key column's dtype explicitly rather than the
                # column object itself.
                dtype = self._df[self._by[0]].dtype
                name = self._by[0]
            else:
                dtype = 'object'
                name = None
            from cudf.dataframe.index import GenericIndex
            index = GenericIndex(Series([], dtype=dtype))
            index.name = name
            final_result.index = index
        else:
            mi = MultiIndex(source_data=result[self._by])
            mi.names = self._by
            final_result.index = mi
        return final_result

    if len(self._by) == 1:
        from cudf.dataframe import index
        key = self._by[0]
        idx = index.as_index(result[key])
        if isinstance(key, str):
            parts = key.split('+')
            idx.name = parts[1] if parts[0] == 'cudfvalcol' else parts[0]
        result = result.drop(key)
        result = _unprefix_columns(result)
        if idx.name == _LEVEL_0_INDEX_NAME:
            # The placeholder name stands in for the original index name.
            idx.name = self._original_index_name
        return result.set_index(idx)

    result = _unprefix_columns(result)
    new_by = []
    for by in self._by:
        # Test the key itself; the previous code tested the leftover
        # ``col`` variable from the column loop.
        if isinstance(by, str):
            parts = by.split('+')
            new_by.append(parts[1] if parts[0] == 'cudfvalcol' else parts[0])
        else:
            new_by.append(by)
    self._by = new_by

    multi_index = MultiIndex(source_data=result[self._by])
    final_result = DataFrame()
    for col in result.columns:
        if col not in self._by:
            final_result[col] = result[col]
    if len(final_result.columns) > 0:
        return final_result.set_index(multi_index)
    # Only key columns exist: keep them as data under the MultiIndex.
    return result.set_index(multi_index)
def apply_multiindex_or_single_index(self, result):
    """Turn the group-key columns of ``result`` into its index.

    Parameters
    ----------
    result : DataFrame
        Frame containing the key columns named in ``self._by`` alongside
        the remaining (value) columns.

    Returns
    -------
    DataFrame or Series
        The value columns indexed by the key(s). With several keys (or an
        empty ``result``), a Series is returned instead when exactly one
        value column remains and ``self`` has a ``_gotattr`` attribute.
    """

    def _value_columns(frame):
        # Copy every column that is not a group key into a fresh frame.
        values = DataFrame()
        for label in frame.columns:
            if label not in self._by:
                values[label] = frame[label]
        return values

    # ---- empty result -----------------------------------------------
    if len(result) == 0:
        empty_frame = _value_columns(result)
        if len(self._by) == 1 or len(empty_frame.columns) == 0:
            if len(self._by) == 1:
                index_dtype = 'float64'
                index_name = self._by[0]
            else:
                index_dtype = 'object'
                index_name = None
            from cudf.dataframe.index import GenericIndex
            empty_index = GenericIndex(Series([], dtype=index_dtype))
            empty_index.name = index_name
            empty_frame.index = empty_index
        else:
            # One empty level and one empty code list per key.
            empty_levels = []
            empty_codes = []
            key_names = []
            for key in self._by:
                empty_levels.append([])
                empty_codes.append([])
                key_names.append(key)
            empty_multi = MultiIndex(empty_levels, empty_codes)
            empty_multi.names = key_names
            empty_frame.index = empty_multi

        if len(empty_frame.columns) == 1 and hasattr(self, "_gotattr"):
            empty_series = Series([], name=empty_frame.columns[0])
            empty_series.index = empty_frame.index
            return empty_series
        return empty_frame

    # ---- exactly one key: plain index -------------------------------
    if len(self._by) == 1:
        from cudf.dataframe import index
        key_index = index.as_index(result[self._by[0]])
        key_index.name = self._by[0]
        result = result.drop(key_index.name)
        if key_index.name == self._LEVEL_0_INDEX_NAME:
            # The placeholder name stands in for the original index name.
            key_index.name = self._original_index_name
        return result.set_index(key_index)

    # ---- several keys: build levels and codes -----------------------
    key_levels = []
    key_codes = DataFrame()
    key_names = []
    # Note: This is an O(N^2) solution using gpu masking
    # to compute new codes for the MultiIndex. There may be
    # a faster solution that could be executed on gpu at the same
    # time the groupby is calculated.
    for key in self._by:
        uniques = result[key].unique()
        # Replace each key value by the position of its unique value.
        positions = result[key].replace(uniques, range(len(uniques)))
        key_levels.append(uniques)
        key_codes[key] = Series(positions, dtype="int32")
        key_names.append(key)
    key_multi = MultiIndex(levels=key_levels, codes=key_codes, names=key_names)

    value_frame = _value_columns(result)
    if len(value_frame.columns) == 1 and hasattr(self, "_gotattr"):
        only_label = value_frame.columns[0]
        value_series = Series(value_frame[only_label])
        value_series.name = value_frame.columns[0]
        value_series.index = key_multi
        return value_series
    return value_frame.set_index(key_multi)