def assert_frame_equal(df1: DataFrame, df2: DataFrame) -> None:
    """
    Raise ``AssertionError`` at the first difference between two DataFrames.

    The checks run in this order: overall shape, then, column position by
    column position, the column name, the data type kind and finally the
    values.

    Parameters
    ----------
    df1 : DataFrame
        The "left" DataFrame.
    df2 : DataFrame
        The "right" DataFrame.

    Raises
    ------
    AssertionError
        If the shapes, a column name, a column data type or the values of a
        column differ.
    """
    if df1.shape != df2.shape:
        raise AssertionError('DataFrame shapes are not equal, '
                             f'{df1.shape} != {df2.shape}')

    for position, column in enumerate(df1.columns):
        if df2.columns[position] != column:
            raise AssertionError(
                f'column number {position} in left DataFrame not equal to right '
                f'{column} != {df2.columns[position]}')

        # Each frame stores its columns in one 2-D array per dtype kind;
        # (kind, loc) addresses a single column inside that storage.
        left_kind, left_loc = df1._get_col_dtype_loc(column)
        left_values: ndarray = df1._data[left_kind][:, left_loc]

        right_kind, right_loc = df2._get_col_dtype_loc(column)
        right_values: ndarray = df2._data[right_kind][:, right_loc]

        if left_kind != right_kind:
            left_dtype = utils.convert_kind_to_dtype(left_kind)
            right_dtype = utils.convert_kind_to_dtype(right_kind)
            raise AssertionError(f'The data types of column {column} are not '
                                 f'equal. {left_dtype} != {right_dtype}')

        if left_kind == 'S':
            # Kind 'S' columns are compared together with their reverse maps
            # (presumably code -> string lookups; confirm in `va`).
            values_match = va.is_equal_str_cat_array(
                left_values, right_values,
                df1._str_reverse_map[left_loc],
                df2._str_reverse_map[right_loc])
        else:
            values_match = _check_1d_arrays(left_values, right_values,
                                            left_kind)

        if not values_match:
            raise AssertionError(f'The values of column {column} are not equal')
def size(self):
    """
    Build a DataFrame of the grouping columns plus one integer column
    holding the result of ``_gb.size`` (presumably the number of rows in
    each group - confirm against the ``_gb`` extension).

    Returns
    -------
    DataFrame
        The grouping columns followed by the new column, whose name is
        supplied by ``self._get_agg_name('size')``.
    """
    agg_name = self._get_agg_name('size')
    column_names = np.array(self._group_columns + [agg_name], dtype='O')

    group_sizes = _gb.size(self._group_labels, len(self._group_position))

    arrays_by_kind = self._get_group_col_data()
    # The result is appended as a single-column 2-D integer array.
    arrays_by_kind['i'].append(group_sizes[:, np.newaxis])
    new_data = utils.concat_stat_arrays(arrays_by_kind)

    column_info = self._get_new_column_info()
    # The new column is the last integer column and the last column overall.
    last_int_loc = new_data['i'].shape[1] - 1
    last_order = len(column_names) - 1
    column_info[agg_name] = utils.Column('i', last_int_loc, last_order)

    return DataFrame._construct_from_new(new_data, column_info, column_names)
def apply(self, func, *args, **kwargs):
    """
    Run ``func`` through ``_gb.apply`` for the groups and prepend the
    grouping columns to whatever it produced.

    Parameters
    ----------
    func : callable
        Forwarded to ``_gb.apply`` together with ``*args`` / ``**kwargs``.

    Returns
    -------
    DataFrame
        Grouping columns first (each group's row repeated according to the
        repeat counts returned by ``_gb.apply``), followed by the columns
        produced by ``func``. A grouping column whose name clashes with a
        produced column gets the suffix ``'_group'``.

    Raises
    ------
    TypeError
        If ``func`` is not callable.
    """
    if not isinstance(func, Callable):
        raise TypeError(
            'The `func` variable must be a function or any callable object'
        )

    applied = _gb.apply(self._group_labels, len(self._group_position),
                        self._df, func, *args, **kwargs)
    new_data, applied_info, applied_columns, group_repeats = applied

    group_arrays = self._get_group_col_data()
    group_info = self._get_new_column_info()
    group_columns = self._group_columns.copy()
    num_group_columns = len(group_columns)

    # Shift every produced column to the right of the grouping columns, both
    # inside its dtype array (loc) and in the overall column order.
    final_info = {}
    for col in applied_columns:
        kind, loc, order = applied_info[col].values
        arrays = group_arrays.get(kind, 0)
        loc_shift = arrays[0].shape[1] if arrays != 0 else 0
        final_info[col] = utils.Column(kind, loc + loc_shift,
                                       order + num_group_columns)

    # At this point `final_info` holds only produced columns, so membership
    # means "name clash with a grouping column".
    renamed_group_columns = [
        col + '_group' if col in final_info else col
        for col in group_columns
    ]

    next_loc = defaultdict(int)
    for order, (col, out_name) in enumerate(zip(group_columns,
                                                renamed_group_columns)):
        kind = group_info[col].dtype
        final_info[out_name] = utils.Column(kind, next_loc[kind], order)
        next_loc[kind] += 1

    all_columns = np.concatenate((renamed_group_columns, applied_columns))

    for kind, arrays in group_arrays.items():
        repeated = np.repeat(np.concatenate(arrays, 1), group_repeats, axis=0)
        if kind in new_data:
            # Grouping data goes to the left of the produced data.
            repeated = np.concatenate((repeated, new_data[kind]), 1)
        new_data[kind] = repeated

    return DataFrame._construct_from_new(new_data, final_info, all_columns)
def _cov_corr(self, name: str) -> DataFrame:
    """
    Compute a per-group column-by-column statistic.

    ``name`` is only used to look up the kernel ``_gb.<name>_<dtype>``
    (callers presumably pass ``'cov'`` or ``'corr'`` - confirm at the call
    sites).

    Parameters
    ----------
    name : str
        Prefix of the ``_gb`` function to call.

    Returns
    -------
    DataFrame
        The grouping columns, a ``'Column Name'`` column and one float column
        per non-grouping float/int/bool column of the original frame.
    """
    group_columns = self._group_columns

    # Collect every non-grouping column of kind float, int or bool.
    calc_columns: List[str] = []
    calc_dtype_loc: List[Tuple[str, int]] = []
    np_dtype = 'int64'
    for col in self._df._columns:
        if col in group_columns:
            continue
        kind, loc, _ = self._df._column_info[col].values
        if kind not in 'fib':
            continue
        if kind == 'f':
            # A single float column switches the whole block to float64.
            np_dtype = 'float64'
        calc_columns.append(col)
        calc_dtype_loc.append((kind, loc))

    data = self._df._values_number_drop(calc_columns, calc_dtype_loc,
                                        np_dtype)
    func_name = name + '_' + utils.convert_kind_to_dtype(data.dtype.kind)
    result = getattr(_gb, func_name)(self._group_labels, len(self), data, [])

    # Every group row is repeated once per calculated column.
    num_calc = len(calc_columns)
    data_dict_final: Dict[str, List[ndarray]] = defaultdict(list)
    for kind, arrs in self._get_group_col_data().items():
        data_dict_final[kind] = [np.repeat(arrs[0], num_calc, axis=0)]

    new_column_info = self._get_new_column_info()
    num_group_cols = len(group_columns)
    new_columns = group_columns.copy()

    # The 'Column Name' column sits right after the grouping columns.
    name_loc = utils.get_num_cols(data_dict_final.get('O', []))
    name_column = np.tile(calc_columns, len(self))[:, np.newaxis]
    data_dict_final['O'].append(name_column)
    new_columns.append('Column Name')
    new_column_info['Column Name'] = utils.Column('O', name_loc,
                                                  num_group_cols)

    float_start = utils.get_num_cols(data_dict_final.get('f', []))
    for offset, col in enumerate(calc_columns):
        new_column_info[col] = utils.Column('f', offset + float_start,
                                            offset + num_group_cols + 1)
        new_columns.append(col)
    data_dict_final['f'].append(result)

    new_data = utils.concat_stat_arrays(data_dict_final)
    return DataFrame._construct_from_new(new_data, new_column_info,
                                         new_columns)
def cumcount(self) -> DataFrame:
    """
    Build a DataFrame of the data from ``_get_group_col_data_all`` plus one
    integer column holding the result of ``_gb.cumcount`` (presumably a
    running count of rows within each group - confirm against ``_gb``).

    Returns
    -------
    DataFrame
        The grouping columns followed by the new column, whose name is
        supplied by ``self._get_agg_name('cumcount')``.
    """
    # todo: add ascending=False
    agg_name = self._get_agg_name('cumcount')
    column_names = np.array(self._group_columns + [agg_name], dtype='O')

    running_count = _gb.cumcount(self._group_labels,
                                 len(self._group_position))

    arrays_by_kind = self._get_group_col_data_all()
    # The result is appended as a single-column 2-D integer array.
    arrays_by_kind['i'].append(running_count[:, np.newaxis])
    new_data = utils.concat_stat_arrays(arrays_by_kind)

    column_info = self._get_new_column_info()
    # The new column is the last integer column and the last column overall.
    last_int_loc = new_data['i'].shape[1] - 1
    last_order = len(column_names) - 1
    column_info[agg_name] = utils.Column('i', last_int_loc, last_order)

    return DataFrame._construct_from_new(new_data, column_info, column_names)
def read_csv(fp, sep=',', header=0, skiprows=None, usecols=None):
    """
    Read a delimited text file into a DataFrame.

    Every argument is validated before the file is touched, so an invalid
    call fails immediately and is never masked by an I/O error.

    Parameters
    ----------
    fp
        Forwarded unchanged to ``_get_file_legnth`` and ``_rf.read_csv``
        (presumably a file path - confirm against those helpers).
    sep : str
        A single-character delimiter.
    header : int
        Must be >= -1. When ``skiprows`` is a sequence, a non-negative
        ``header`` is interpreted as a position among the rows that are NOT
        skipped and is translated to an absolute row number. ``-1`` is
        special-cased: the whole ``skiprows`` sequence is forwarded
        untouched (presumably "no header row").
    skiprows : None, int or sequence of int
        A non-negative integer, or a sequence of non-negative row numbers.
        Duplicates are ignored and an empty sequence means "skip nothing".
    usecols : None or list
        A non-empty list of integers or column names.

    Returns
    -------
    DataFrame

    Raises
    ------
    TypeError
        If ``sep`` is not a string, ``header`` is not an integer or
        ``usecols`` is neither ``None`` nor a list.
    ValueError
        If ``sep`` is not exactly one character, ``header < -1``,
        ``usecols`` is empty or ``skiprows`` contains a negative value.
    """
    if not isinstance(sep, str):
        raise TypeError('`sep` must be a string')
    if len(sep) != 1:
        raise ValueError('`sep` must only be one character in length')

    if not isinstance(header, int):
        raise TypeError('`header` must be an integer')
    if header < -1:
        raise ValueError('`header` must be greater than or equal to -1')

    if isinstance(usecols, list):
        if len(usecols) == 0:
            raise ValueError('`usecols` must be a non-empty list of integers or column names')
    elif usecols is not None:
        raise TypeError('`usecols` must be a list of integers or column names')

    skiprows_set = set()
    skiprows_int = 0
    if skiprows is None:
        pass
    elif isinstance(skiprows, int):
        if skiprows < 0:
            raise ValueError('`skiprows` must be one or more non-negative integers')
        skiprows_int = skiprows
    else:
        skiprows_arr = np.asarray(skiprows)
        if (skiprows_arr < 0).any():
            raise ValueError('All values in the `skiprows` sequence must be >= 0')

        # De-duplicate: the header arithmetic below counts skipped rows with
        # len(), so a repeated row number would shift the header too far.
        skiprows_arr = np.unique(skiprows_arr)

        # An empty sequence skips nothing; without this guard `.max()` below
        # raises on a zero-size array.
        if skiprows_arr.size != 0:
            if header == -1:
                skiprows_set = set(skiprows_arr)
            else:
                max_row = skiprows_arr.max()
                if header > max_row - len(skiprows_arr):
                    # The header lies beyond the last skipped row, so every
                    # skipped row precedes it.
                    header += len(skiprows_arr)
                else:
                    # Pick the header-th row among those not skipped.
                    max_rows = np.arange(max_row)
                    kept_rows = max_rows[~np.isin(max_rows, skiprows_arr)]
                    header = kept_rows[header]
                # Only rows after the header are forwarded as skipped rows.
                skiprows_set = set(skiprows_arr[skiprows_arr > header])

    # First file access happens only after all arguments are known good.
    nrows = _get_file_legnth(fp)

    tuple_return = _rf.read_csv(fp, nrows, ord(sep), header, skiprows_int,
                                skiprows_set, usecols)
    a_bool, a_int, a_float, a_str, columns, dtypes, dtype_loc = tuple_return

    new_column_info = {}
    # Integer dtype codes returned by the parser -> single-character kinds.
    dtype_map = {1: 'b', 2: 'i', 3: 'f', 4: 'O'}
    final_dtype_locs = defaultdict(list)
    for i, (col, dtype, loc) in enumerate(zip(columns, dtypes, dtype_loc)):
        kind = dtype_map[dtype]
        new_column_info[col] = utils.Column(kind, loc, i)
        final_dtype_locs[kind].append(loc)

    new_data = {}
    loc_order_changed = set()
    for arr, kind in zip((a_bool, a_int, a_float, a_str),
                         ('b', 'i', 'f', 'O')):
        num_cols = arr.shape[1]
        if num_cols == 0:
            continue
        locs = final_dtype_locs[kind]
        if len(locs) == num_cols:
            new_data[kind] = arr
        else:
            # Fewer columns are referenced than the array holds: keep only
            # the referenced ones and renumber their locs afterwards.
            loc_order_changed.add(kind)
            new_data[kind] = arr[:, locs]

    if loc_order_changed:
        cur_dtype_loc = defaultdict(int)
        for col in columns:
            kind = new_column_info[col].values[0]
            if kind in loc_order_changed:
                new_column_info[col].loc = cur_dtype_loc[kind]
                cur_dtype_loc[kind] += 1

    new_columns = np.array(columns, dtype='O')
    return DataFrame._construct_from_new(new_data, new_column_info,
                                         new_columns)
def _roll_generic(self, name, columns, **kwargs): if columns is None: columns = self._df.columns elif isinstance(columns, str): columns = [columns] elif not isinstance(columns, list): raise TypeError( '`columns` must either be a string, a list of column names, or None' ) col_order = dict(zip(columns, range(len(columns)))) dtype_locs = defaultdict(list) dtype_cols = defaultdict(list) col_info = self._df._column_info for i, col in enumerate(columns): try: dtype, loc, order = col_info[col].values except KeyError: raise KeyError(f'{col} is not a column name') dtype_locs[dtype].append(loc) dtype_cols[dtype].append(col) kept_dtype_loc = defaultdict(list) new_col_info = {} dtype_ct = defaultdict(int) for i, col in enumerate(self._kept_columns): dtype, loc, _ = col_info[col].values new_loc = len(kept_dtype_loc[dtype]) kept_dtype_loc[dtype].append(loc) new_col_info[col] = utils.Column(dtype, new_loc, i) dtype_ct[dtype] += 1 data_dict = defaultdict(list) for dtype, locs in dtype_locs.items(): func_name = name + '_' + utils.convert_kind_to_dtype_generic(dtype) data = self._df._data[dtype] result = getattr(_roll, func_name)(data, np.array(locs), self._left, self._right, self._min_window, **kwargs) result_dtype = result.dtype.kind data_dict[result_dtype].append(result) for col in dtype_cols[dtype]: order = col_order[col] new_col = col if col in self._kept_columns: new_col = col + '_rolling' columns[columns.index(col)] = new_col new_col_info[new_col] = utils.Column( result_dtype, dtype_ct[result_dtype], order + len(self._kept_columns)) dtype_ct[result_dtype] += 1 new_data = {} for dtype, locs in kept_dtype_loc.items(): data = self._df._data[dtype][:, locs] if data.ndim == 1: data = data[:, np.newaxis] new_data[dtype] = data for dtype, data in data_dict.items(): if dtype not in new_data: new_data[dtype] = np.column_stack((*data, )) else: new_data[dtype] = np.column_stack((new_data[dtype], *data)) new_columns = np.concatenate((self._kept_columns, columns)) return 
DataFrame._construct_from_new(new_data, new_col_info, new_columns)
def _roll_agg(self, agg_cols: Dict = None, new_names: Dict = None,
              new_order: Dict = None, num_agg_cols: int = None,
              func_kwargs: Dict = None):
    """
    Run several rolling aggregations and assemble them into one DataFrame.

    Parameters
    ----------
    agg_cols : dict
        Maps an aggregation (a name string, or a custom function) to the
        list of columns it is applied to.
    new_names : dict
        Maps the same keys to the output column names, parallel to the lists
        in ``agg_cols``.
    new_order : dict
        Maps the same keys to the output positions (relative to the end of
        the kept columns), parallel to the lists in ``agg_cols``.
    num_agg_cols : int
        Total number of aggregated output columns.
    func_kwargs : dict
        Maps the same keys to a list of keyword-argument dicts (or ``None``),
        parallel to the lists in ``agg_cols``.

    Returns
    -------
    DataFrame
        ``self._kept_columns`` followed by the aggregated columns.
    """
    frame_info = self._df._column_info
    num_kept = len(self._kept_columns)

    # Kept columns come first and are renumbered densely within each dtype.
    kept_locs = defaultdict(list)
    new_column_info = {}
    kept_count = defaultdict(int)
    for position, col in enumerate(self._kept_columns):
        kind, loc, _ = frame_info[col].values
        new_column_info[col] = utils.Column(kind, len(kept_locs[kind]),
                                            position)
        kept_locs[kind].append(loc)
        kept_count[kind] += 1

    data_dict = defaultdict(list)
    new_columns = self._kept_columns.copy() + [''] * num_agg_cols

    for name, cols in agg_cols.items():
        name_is_str = isinstance(name, str)
        if name_is_str:
            name_kwargs = get_func_kwargs(name)
            ignore_str = name_kwargs.get('ignore_str', True)
            ignore_date = name_kwargs.get('ignore_date', True)
            keep_date_type = name_kwargs.get('keep_date_type', True)
        else:
            # `name` is a custom function rather than a function name.
            ignore_str = False
            ignore_date = False
            keep_date_type = True

        cur_new_names = new_names[name]
        cur_new_order = new_order[name]
        kwargs_list = func_kwargs[name]

        # Bucket the information of every aggregated column by dtype kind.
        locs_by_kind = defaultdict(list)
        names_by_kind = defaultdict(list)
        out_names_by_kind = defaultdict(list)
        out_order_by_kind = defaultdict(list)
        kwargs_by_kind = defaultdict(list)
        for col in self._df._columns:
            kind, loc, _ = self._df._column_info[col].values
            try:
                idx = cols.index(col)
            except ValueError:
                # Not aggregated by this function.
                continue
            locs_by_kind[kind].append(loc)
            names_by_kind[kind].append(col)
            out_names_by_kind[kind].append(cur_new_names[idx])
            out_order_by_kind[kind].append(cur_new_order[idx])
            kwargs_by_kind[kind].append(kwargs_list[idx])

        for kind, data in self._df._data.items():
            if kind not in locs_by_kind:
                continue
            is_date = kind in 'mM'
            if (ignore_str and kind == 'O') or (ignore_date and is_date):
                continue
            if is_date:
                # Date kinds are handed to the kernel as raw int64.
                data = data.view('int64')

            # The first non-None kwargs dict for this kind is the one used.
            kwargs = next(
                (kw for kw in kwargs_by_kind[kind] if kw is not None), {})

            generic = utils.convert_kind_to_dtype_generic(kind)
            if name_is_str:
                func_name = name + '_' + generic
            else:
                func_name = 'custom_' + generic
                # 'name' is actually a function here
                kwargs['func'] = name
                kwargs['col_dict'] = dict(
                    zip(locs_by_kind[kind], names_by_kind[kind]))

            func = getattr(_roll, func_name)
            arr = func(data, np.array(locs_by_kind[kind]), self._left,
                       self._right, self._min_window, **kwargs)

            if is_date and keep_date_type:
                new_kind = kind
                arr = arr.astype(utils.convert_kind_to_dtype(kind))
            else:
                new_kind = arr.dtype.kind

            # First free loc: behind the kept columns of this kind and any
            # results already collected for it.
            cur_loc = (utils.get_num_cols(data_dict.get(new_kind, []))
                       + kept_count[new_kind])
            data_dict[new_kind].append(arr)

            # Output names/positions are matched to result columns in
            # ascending order of their original locs.
            by_loc = np.argsort(locs_by_kind[kind]).tolist()
            cur_names = np.array(out_names_by_kind[kind])[by_loc]
            cur_order = num_kept + np.array(out_order_by_kind[kind])[by_loc]
            for offset, (cur_name, position) in enumerate(
                    zip(cur_names, cur_order)):
                new_column_info[cur_name] = utils.Column(
                    new_kind, cur_loc + offset, position)
                new_columns[position] = cur_name

    new_data = {}
    for kind, locs in kept_locs.items():
        kept = self._df._data[kind][:, locs]
        if kept.ndim == 1:
            kept = kept[:, np.newaxis]
        new_data[kind] = kept

    # Aggregated columns are stacked to the right of the kept columns.
    for kind, arrays in data_dict.items():
        if kind in new_data:
            new_data[kind] = np.column_stack((new_data[kind], *arrays))
        else:
            new_data[kind] = np.column_stack((*arrays, ))

    return DataFrame._construct_from_new(
        new_data, new_column_info, np.asarray(new_columns, dtype='O'))
def _single_agg(self, agg_cols: Dict = None, new_names: Dict = None,
                new_order: Dict = None, num_agg_cols: int = None,
                func_kwargs: Dict = None) -> DataFrame:
    """
    Run several group-by aggregations and assemble them into one DataFrame.

    Parameters
    ----------
    agg_cols : dict
        Maps an aggregation (a name string, or a custom function) to the
        list of columns it is applied to.
    new_names : dict
        Maps the same keys to the output column names, parallel to the lists
        in ``agg_cols``.
    new_order : dict
        Maps the same keys to the output positions (relative to the end of
        the grouping columns), parallel to the lists in ``agg_cols``.
    num_agg_cols : int
        Total number of aggregated output columns.
    func_kwargs : dict
        Maps the same keys to a list of keyword-argument dicts (or ``None``),
        parallel to the lists in ``agg_cols``.

    Returns
    -------
    DataFrame
        The grouping columns followed by the aggregated columns.
    """
    labels = self._group_labels
    size = len(self._group_position)
    num_group_cols = len(self._group_columns)

    data_dict = self._get_group_col_data()
    new_column_info = self._get_new_column_info()
    new_columns = self._group_columns.copy() + [''] * num_agg_cols

    for name, cols in agg_cols.items():
        name_is_str = isinstance(name, str)
        if name_is_str:
            name_kwargs = get_func_kwargs(name)
            ignore_str = name_kwargs.get('ignore_str', True)
            add_positions = name_kwargs.get('add_positions', False)
            ignore_date = name_kwargs.get('ignore_date', True)
            keep_date_type = name_kwargs.get('keep_date_type', True)
        else:
            # `name` is a custom function rather than a function name.
            ignore_str = False
            add_positions = False
            ignore_date = False
            keep_date_type = True

        cur_new_names = new_names[name]
        cur_new_order = new_order[name]
        kwargs_list = func_kwargs[name]

        # Bucket every column by dtype kind: aggregated columns with their
        # output details, all others only by loc.
        locs_by_kind = defaultdict(list)
        names_by_kind = defaultdict(list)
        out_names_by_kind = defaultdict(list)
        out_order_by_kind = defaultdict(list)
        skipped_locs_by_kind = defaultdict(list)
        kwargs_by_kind = defaultdict(list)
        for col in self._df._columns:
            kind, loc, _ = self._df._column_info[col].values
            try:
                idx = cols.index(col)
            except ValueError:
                skipped_locs_by_kind[kind].append(loc)
                continue
            locs_by_kind[kind].append(loc)
            names_by_kind[kind].append(col)
            out_names_by_kind[kind].append(cur_new_names[idx])
            out_order_by_kind[kind].append(cur_new_order[idx])
            kwargs_by_kind[kind].append(kwargs_list[idx])

        for kind, data in self._df._data.items():
            if kind not in locs_by_kind:
                continue
            is_date = kind in 'mM'
            if (ignore_str and kind == 'O') or (ignore_date and is_date):
                continue
            if is_date:
                # Date kinds are handed to the kernel as raw int64.
                data = data.view('int64')

            # The first non-None kwargs dict for this kind is the one used.
            kwargs = next(
                (kw for kw in kwargs_by_kind[kind] if kw is not None), {})

            generic = utils.convert_kind_to_dtype_generic(kind)
            if name_is_str:
                func_name = name + '_' + generic
            else:
                func_name = 'custom_' + generic
                # 'name' is actually a function here
                kwargs['func'] = name
                kwargs['col_dict'] = dict(
                    zip(locs_by_kind[kind], names_by_kind[kind]))

            func = getattr(_gb, func_name)
            call_args = [labels, size, data, skipped_locs_by_kind[kind]]
            if add_positions:
                call_args.append(self._group_position)
            arr = func(*call_args, **kwargs)

            if is_date and keep_date_type:
                new_kind = kind
                arr = arr.astype(utils.convert_kind_to_dtype(kind))
            else:
                new_kind = arr.dtype.kind

            # First free loc behind everything already collected for the kind.
            cur_loc = utils.get_num_cols(data_dict.get(new_kind, []))
            data_dict[new_kind].append(arr)

            # Output names/positions are matched to result columns in
            # ascending order of their original locs.
            by_loc = np.argsort(locs_by_kind[kind]).tolist()
            cur_names = np.array(out_names_by_kind[kind])[by_loc]
            cur_order = (num_group_cols
                         + np.array(out_order_by_kind[kind])[by_loc])
            for offset, (cur_name, position) in enumerate(
                    zip(cur_names, cur_order)):
                new_column_info[cur_name] = utils.Column(
                    new_kind, cur_loc + offset, position)
                new_columns[position] = cur_name

    new_data = utils.concat_stat_arrays(data_dict)
    new_columns = np.array(new_columns, dtype='O')
    return DataFrame._construct_from_new(new_data, new_column_info,
                                         new_columns)
def _group_agg(self, name: str, ignore_str: bool = True,
               add_positions: bool = False, keep_group_cols: bool = True,
               ignore_date: bool = True, keep_date_type: bool = True,
               **kwargs) -> DataFrame:
    """
    Apply the group-by kernel ``_gb.<name>_<dtype>`` to every non-grouping
    column of the underlying DataFrame.

    Parameters
    ----------
    name : str
        Prefix of the ``_gb`` function to call.
    ignore_str : bool
        Skip columns of kind ``'O'``.
    add_positions : bool
        Also pass ``self._group_position`` to the kernel.
    keep_group_cols : bool
        Include the grouping columns in the result.
    ignore_date : bool
        Skip columns of kind ``'m'`` / ``'M'``.
    keep_date_type : bool
        Cast results for date kinds back to their original kind instead of
        using the kind the kernel returned.
    **kwargs
        Forwarded to the kernel.

    Returns
    -------
    DataFrame
    """
    labels = self._group_labels
    size = len(self._group_position)
    frame_info = self._df._column_info

    # Non-grouping column names bucketed by their dtype kind.
    old_dtype_col: Dict[str, List[str]] = defaultdict(list)
    for col, col_obj in frame_info.items():
        if col not in self._group_columns:
            old_dtype_col[col_obj.dtype].append(col)

    if keep_group_cols:
        data_dict = self._get_group_col_data()
        new_column_info = self._get_new_column_info()
        new_columns = self._group_columns.copy()
    else:
        data_dict = defaultdict(list)
        new_column_info = {}
        new_columns = []

    for kind, data in self._df._data.items():
        is_date = kind in 'mM'
        if (ignore_str and kind == 'O') or (ignore_date and is_date):
            continue

        # locs of the grouping columns stored under this kind
        group_locs: list = self._group_dtype_loc.get(kind, [])
        if len(group_locs) == data.shape[1]:
            # Every column of this kind is a grouping column.
            continue

        func = getattr(
            _gb, name + '_' + utils.convert_kind_to_dtype_generic(kind))
        if is_date:
            # Date kinds are handed to the kernel as raw int64.
            data = data.view('int64')
        call_args = [labels, size, data, group_locs]
        if add_positions:
            call_args.append(self._group_position)
        arr = func(*call_args, **kwargs)

        if is_date and keep_date_type:
            new_kind = kind
            arr = arr.astype(utils.convert_kind_to_dtype(kind))
        else:
            new_kind = arr.dtype.kind

        cur_loc = utils.get_num_cols(data_dict.get(new_kind, []))
        data_dict[new_kind].append(arr)

        for col in old_dtype_col[kind]:
            old_loc = frame_info[col].values[1]
            # Grouping columns are absent from the result, so each one
            # sitting to the left of this column shifts it left by one.
            count_less = 0
            for group_loc in group_locs:
                count_less += old_loc > group_loc
            # The order is a placeholder (0); it is filled in below.
            new_column_info[col] = utils.Column(
                new_kind, cur_loc + old_loc - count_less, 0)

    # Assign the final order: grouping columns count from 0, every other
    # column continues after the names already in `new_columns`.
    next_order = len(new_columns)
    next_group_order = 0
    for col in self._df._columns:
        if col not in new_column_info:
            continue
        if col in self._group_columns and keep_group_cols:
            new_column_info[col].order = next_group_order
            next_group_order += 1
        else:
            new_columns.append(col)
            new_column_info[col].order = next_order
            next_order += 1

    new_data = utils.concat_stat_arrays(data_dict)
    return DataFrame._construct_from_new(new_data, new_column_info,
                                         new_columns)