def _cov_corr(self, name: str) -> DataFrame:
    """
    Run the pairwise group statistic called `name` on the numeric columns.

    Every float, integer and boolean column that is not a grouping column
    takes part in the calculation. The kernel is looked up on `_gb` as
    ``name + '_' + <dtype word>`` (presumably 'cov' or 'corr', going by
    this method's name - confirm against the callers).

    Parameters
    ----------
    name : str
        Prefix of the `_gb` kernel to call.

    Returns
    -------
    DataFrame
        The grouping columns (each group row repeated once per calculated
        column), followed by a 'Column Name' column and then one float
        column per calculated column.
    """
    stat_columns: List[str] = []
    stat_dtype_loc: List[Tuple[str, int]] = []
    # Stay with integers unless at least one float column participates.
    np_dtype = 'int64'
    for col in self._df._columns:
        if col in self._group_columns:
            continue
        kind, loc, _ = self._df._column_info[col].values
        if kind not in 'fib':
            continue
        if kind == 'f':
            np_dtype = 'float64'
        stat_columns.append(col)
        stat_dtype_loc.append((kind, loc))

    values = self._df._values_number_drop(stat_columns, stat_dtype_loc,
                                          np_dtype)
    kernel_name = name + '_' + utils.convert_kind_to_dtype(values.dtype.kind)
    kernel = getattr(_gb, kernel_name)
    result = kernel(self._group_labels, len(self), values, [])

    # Repeat each group row so that it lines up with one output row per
    # calculated column.
    group_data = self._get_group_col_data()
    final_data: Dict[str, List[ndarray]] = defaultdict(list)
    for kind, arrs in group_data.items():
        repeated = np.repeat(arrs[0], len(stat_columns), axis=0)
        final_data[kind] = [repeated]

    new_column_info = self._get_new_column_info()
    num_group_cols = len(self._group_columns)
    new_columns = self._group_columns.copy()

    # Object column holding the name of the column each output row describes.
    name_loc = utils.get_num_cols(final_data.get('O', []))
    name_column = np.tile(stat_columns, len(self))[:, np.newaxis]
    final_data['O'].append(name_column)
    new_columns.append('Column Name')
    new_column_info['Column Name'] = utils.Column('O', name_loc,
                                                  num_group_cols)

    # The statistic columns are registered as floats, placed after any
    # float data that is already present.
    float_start = utils.get_num_cols(final_data.get('f', []))
    for offset, col in enumerate(stat_columns):
        new_column_info[col] = utils.Column('f', offset + float_start,
                                            offset + num_group_cols + 1)
    new_columns.extend(stat_columns)

    final_data['f'].append(result)
    new_data = utils.concat_stat_arrays(final_data)

    return DataFrame._construct_from_new(new_data, new_column_info,
                                         new_columns)
def _roll_agg(self, agg_cols: Dict = None, new_names: Dict = None,
              new_order: Dict = None, num_agg_cols: int = None,
              func_kwargs: Dict = None):
    """
    Apply one or more rolling aggregations and build the resulting DataFrame.

    The result holds the kept columns (``self._kept_columns``) unchanged,
    followed by the aggregated columns.

    Parameters
    ----------
    agg_cols : Dict
        Maps each aggregation - a string name or a custom function - to the
        sequence of column names it is applied to.
    new_names : Dict
        Maps the same keys to the output column names, aligned position by
        position with ``agg_cols[key]``.
    new_order : Dict
        Maps the same keys to the position of each output column relative
        to the first aggregated column, aligned with ``agg_cols[key]``.
    num_agg_cols : int
        Total number of aggregated output columns.
    func_kwargs : Dict
        Maps the same keys to a sequence, aligned with ``agg_cols[key]``,
        of keyword-argument dicts or None.

    Returns
    -------
    DataFrame

    Notes
    -----
    All columns of one dtype are sent through a single kernel call, so only
    the first non-None kwargs found among those columns is used. That dict
    is copied before use: the custom-function branch adds 'func' and
    'col_dict' entries, and those must not leak back into the caller's dict.
    """
    col_info = self._df._column_info

    # Register the kept columns first; they keep their relative order.
    kept_dtype_loc = defaultdict(list)
    new_column_info = {}
    dtype_ct = defaultdict(int)
    for i, col in enumerate(self._kept_columns):
        dtype, loc, _ = col_info[col].values
        new_loc = len(kept_dtype_loc[dtype])
        kept_dtype_loc[dtype].append(loc)
        new_column_info[col] = utils.Column(dtype, new_loc, i)
        dtype_ct[dtype] += 1

    data_dict = defaultdict(list)
    new_columns = self._kept_columns.copy() + [''] * num_agg_cols

    for name, cols in agg_cols.items():
        if isinstance(name, str):
            # name can also be a custom function
            name_kwargs = get_func_kwargs(name)
            ignore_str = name_kwargs.get('ignore_str', True)
            ignore_date = name_kwargs.get('ignore_date', True)
            keep_date_type = name_kwargs.get('keep_date_type', True)
        else:
            ignore_str = False
            ignore_date = False
            keep_date_type = True

        cur_new_names = new_names[name]
        cur_new_order = new_order[name]
        kwargs_list = func_kwargs[name]

        # Position of the first occurrence of each aggregated column; this
        # gives the same answer as cols.index(col) without rescanning.
        first_idx = {}
        for idx, col in enumerate(cols):
            first_idx.setdefault(col, idx)

        agg_dtype_locs = defaultdict(list)
        agg_dtype_names = defaultdict(list)
        agg_dtype_new_names = defaultdict(list)
        agg_dtype_order = defaultdict(list)
        agg_dtype_kwargs = defaultdict(list)

        for col in self._df._columns:
            dtype, loc, _ = col_info[col].values
            idx = first_idx.get(col)
            if idx is None:
                continue
            agg_dtype_locs[dtype].append(loc)
            agg_dtype_names[dtype].append(col)
            agg_dtype_new_names[dtype].append(cur_new_names[idx])
            agg_dtype_order[dtype].append(cur_new_order[idx])
            agg_dtype_kwargs[dtype].append(kwargs_list[idx])

        for dtype, data in self._df._data.items():
            if dtype not in agg_dtype_locs:
                continue
            if ignore_str and dtype == 'O':
                continue
            if ignore_date and dtype in 'mM':
                continue

            if dtype in 'mM':
                # The kernels receive date data viewed as int64.
                data = data.view('int64')

            # Copy so the entries added below never mutate the caller's dict.
            kwargs = next((dict(kw) for kw in agg_dtype_kwargs[dtype]
                           if kw is not None), {})

            generic_dtype = utils.convert_kind_to_dtype_generic(dtype)
            if isinstance(name, str):
                func_name = name + '_' + generic_dtype
            else:
                func_name = 'custom_' + generic_dtype
                # 'name' is actually a function here
                kwargs['func'] = name
                kwargs['col_dict'] = dict(
                    zip(agg_dtype_locs[dtype], agg_dtype_names[dtype]))

            func = getattr(_roll, func_name)
            arr = func(data, np.array(agg_dtype_locs[dtype]), self._left,
                       self._right, self._min_window, **kwargs)

            if dtype in 'mM' and keep_date_type:
                new_kind = dtype
                arr = arr.astype(utils.convert_kind_to_dtype(dtype))
            else:
                new_kind = arr.dtype.kind

            # New columns land after the kept columns of the same kind and
            # after anything already aggregated into that kind.
            cur_loc = (utils.get_num_cols(data_dict.get(new_kind, []))
                       + dtype_ct[new_kind])
            data_dict[new_kind].append(arr)

            # Output names and orders are taken in ascending order of the
            # source column locations.
            order = np.argsort(agg_dtype_locs[dtype]).tolist()
            cur_names = np.array(agg_dtype_new_names[dtype])[order]
            cur_order = len(self._kept_columns) + np.array(
                agg_dtype_order[dtype])[order]

            for i, (cur_name, position) in enumerate(zip(cur_names,
                                                         cur_order)):
                new_column_info[cur_name] = utils.Column(
                    new_kind, cur_loc + i, position)
                new_columns[position] = cur_name

    new_data = {}
    for dtype, locs in kept_dtype_loc.items():
        data = self._df._data[dtype][:, locs]
        if data.ndim == 1:
            data = data[:, np.newaxis]
        new_data[dtype] = data

    for dtype, data in data_dict.items():
        if dtype not in new_data:
            new_data[dtype] = np.column_stack((*data, ))
        else:
            new_data[dtype] = np.column_stack((new_data[dtype], *data))

    return DataFrame._construct_from_new(
        new_data, new_column_info, np.asarray(new_columns, dtype='O'))
def _single_agg(self, agg_cols: Dict = None, new_names: Dict = None,
                new_order: Dict = None, num_agg_cols: int = None,
                func_kwargs: Dict = None) -> DataFrame:
    """
    Apply one or more group aggregations and build the resulting DataFrame.

    The result holds the grouping columns followed by the aggregated
    columns.

    Parameters
    ----------
    agg_cols : Dict
        Maps each aggregation - a string name or a custom function - to the
        sequence of column names it is applied to.
    new_names : Dict
        Maps the same keys to the output column names, aligned position by
        position with ``agg_cols[key]``.
    new_order : Dict
        Maps the same keys to the position of each output column relative
        to the first aggregated column, aligned with ``agg_cols[key]``.
    num_agg_cols : int
        Total number of aggregated output columns.
    func_kwargs : Dict
        Maps the same keys to a sequence, aligned with ``agg_cols[key]``,
        of keyword-argument dicts or None.

    Returns
    -------
    DataFrame

    Notes
    -----
    All columns of one dtype are sent through a single kernel call, so only
    the first non-None kwargs found among those columns is used. That dict
    is copied before use: the custom-function branch adds 'func' and
    'col_dict' entries, and those must not leak back into the caller's dict.
    """
    labels = self._group_labels
    size = len(self._group_position)

    data_dict = self._get_group_col_data()
    new_column_info = self._get_new_column_info()
    new_columns = self._group_columns.copy() + [''] * num_agg_cols

    for name, cols in agg_cols.items():
        if isinstance(name, str):
            # name can also be a custom function
            name_kwargs = get_func_kwargs(name)
            ignore_str = name_kwargs.get('ignore_str', True)
            add_positions = name_kwargs.get('add_positions', False)
            ignore_date = name_kwargs.get('ignore_date', True)
            keep_date_type = name_kwargs.get('keep_date_type', True)
        else:
            ignore_str = False
            add_positions = False
            ignore_date = False
            keep_date_type = True

        cur_new_names = new_names[name]
        cur_new_order = new_order[name]
        kwargs_list = func_kwargs[name]

        # Position of the first occurrence of each aggregated column; this
        # gives the same answer as cols.index(col) without rescanning.
        first_idx = {}
        for idx, col in enumerate(cols):
            first_idx.setdefault(col, idx)

        agg_dtype_locs = defaultdict(list)
        agg_dtype_names = defaultdict(list)
        agg_dtype_new_names = defaultdict(list)
        agg_dtype_order = defaultdict(list)
        non_agg_dtype_locs = defaultdict(list)
        agg_dtype_kwargs = defaultdict(list)

        for col in self._df._columns:
            dtype, loc, _ = self._df._column_info[col].values
            idx = first_idx.get(col)
            if idx is None:
                # Not aggregated by this function; its location is handed
                # to the kernel separately.
                non_agg_dtype_locs[dtype].append(loc)
                continue
            agg_dtype_locs[dtype].append(loc)
            agg_dtype_names[dtype].append(col)
            agg_dtype_new_names[dtype].append(cur_new_names[idx])
            agg_dtype_order[dtype].append(cur_new_order[idx])
            agg_dtype_kwargs[dtype].append(kwargs_list[idx])

        for dtype, data in self._df._data.items():
            if dtype not in agg_dtype_locs:
                continue
            if ignore_str and dtype == 'O':
                continue
            if ignore_date and dtype in 'mM':
                continue

            if dtype in 'mM':
                # The kernels receive date data viewed as int64.
                data = data.view('int64')

            # Copy so the entries added below never mutate the caller's dict.
            kwargs = next((dict(kw) for kw in agg_dtype_kwargs[dtype]
                           if kw is not None), {})

            generic_dtype = utils.convert_kind_to_dtype_generic(dtype)
            if isinstance(name, str):
                func_name = name + '_' + generic_dtype
            else:
                func_name = 'custom_' + generic_dtype
                # 'name' is actually a function here
                kwargs['func'] = name
                kwargs['col_dict'] = dict(
                    zip(agg_dtype_locs[dtype], agg_dtype_names[dtype]))

            func = getattr(_gb, func_name)
            if add_positions:
                arr = func(labels, size, data, non_agg_dtype_locs[dtype],
                           self._group_position, **kwargs)
            else:
                arr = func(labels, size, data, non_agg_dtype_locs[dtype],
                           **kwargs)

            if dtype in 'mM' and keep_date_type:
                new_kind = dtype
                arr = arr.astype(utils.convert_kind_to_dtype(dtype))
            else:
                new_kind = arr.dtype.kind

            cur_loc = utils.get_num_cols(data_dict.get(new_kind, []))
            data_dict[new_kind].append(arr)

            # Output names and orders are taken in ascending order of the
            # source column locations.
            order = np.argsort(agg_dtype_locs[dtype]).tolist()
            cur_names = np.array(agg_dtype_new_names[dtype])[order]
            cur_order = len(self._group_columns) + np.array(
                agg_dtype_order[dtype])[order]

            for i, (cur_name, position) in enumerate(zip(cur_names,
                                                         cur_order)):
                new_column_info[cur_name] = utils.Column(
                    new_kind, cur_loc + i, position)
                new_columns[position] = cur_name

    new_data = utils.concat_stat_arrays(data_dict)
    new_columns = np.array(new_columns, dtype='O')
    return DataFrame._construct_from_new(new_data, new_column_info,
                                         new_columns)
def _group_agg(self, name: str, ignore_str: bool = True,
               add_positions: bool = False, keep_group_cols: bool = True,
               ignore_date: bool = True, keep_date_type: bool = True,
               **kwargs) -> DataFrame:
    """
    Apply the aggregation `name` to every non-grouping column.

    Parameters
    ----------
    name : str
        Prefix of the `_gb` kernel; the generic dtype word is appended to
        form the kernel name.
    ignore_str : bool
        Skip object ('O') data when True.
    add_positions : bool
        Pass ``self._group_position`` to the kernel as an extra positional
        argument when True.
    keep_group_cols : bool
        Start the result with the grouping columns when True.
    ignore_date : bool
        Skip datetime/timedelta ('M'/'m') data when True.
    keep_date_type : bool
        Cast aggregated date data back to its date dtype when True;
        otherwise the kind of the kernel's result array is used.
    **kwargs
        Forwarded unchanged to the kernel.

    Returns
    -------
    DataFrame
    """
    group_labels = self._group_labels
    num_groups = len(self._group_position)

    # Non-grouping column names bucketed by their original dtype kind.
    cols_by_kind: Dict[str, List[str]] = defaultdict(list)
    for col, info in self._df._column_info.items():
        if col in self._group_columns:
            continue
        cols_by_kind[info.dtype].append(col)

    if keep_group_cols:
        data_dict = self._get_group_col_data()
        new_column_info = self._get_new_column_info()
        new_columns = self._group_columns.copy()
    else:
        data_dict = defaultdict(list)
        new_column_info = {}
        new_columns = []

    # Extra positional arguments for the kernel call.
    position_args = (self._group_position,) if add_positions else ()

    for dtype, data in self._df._data.items():
        is_date = dtype in 'mM'
        if (ignore_str and dtype == 'O') or (ignore_date and is_date):
            continue

        # Locations of the grouping columns stored in this dtype's array.
        group_locs: list = self._group_dtype_loc.get(dtype, [])
        if len(group_locs) == data.shape[1]:
            # Every column of this dtype is a grouping column.
            continue

        kernel_name = name + '_' + utils.convert_kind_to_dtype_generic(dtype)
        kernel = getattr(_gb, kernel_name)
        if is_date:
            # The kernels receive date data viewed as int64.
            data = data.view('int64')
        arr = kernel(group_labels, num_groups, data, group_locs,
                     *position_args, **kwargs)

        if is_date and keep_date_type:
            new_kind = dtype
            arr = arr.astype(utils.convert_kind_to_dtype(dtype))
        else:
            new_kind = arr.dtype.kind

        start_loc = utils.get_num_cols(data_dict.get(new_kind, []))
        data_dict[new_kind].append(arr)

        for col in cols_by_kind[dtype]:
            _, old_loc, _ = self._df._column_info[col].values
            # Shift left by the number of grouping columns that sat before
            # this column in the original dtype array.
            shift = sum(old_loc > k for k in group_locs)
            new_column_info[col] = utils.Column(
                new_kind, start_loc + old_loc - shift, 0)

    # Assign the final display order by walking the original column order.
    agg_order = len(new_columns)
    group_order = 0
    for col in self._df._columns:
        if col not in new_column_info:
            continue
        if keep_group_cols and col in self._group_columns:
            new_column_info[col].order = group_order
            group_order += 1
        else:
            new_columns.append(col)
            new_column_info[col].order = agg_order
            agg_order += 1

    new_data = utils.concat_stat_arrays(data_dict)
    return DataFrame._construct_from_new(new_data, new_column_info,
                                         new_columns)