Example #1
def assert_frame_equal(df1: DataFrame, df2: DataFrame) -> None:
    if df1.shape != df2.shape:
        raise AssertionError('DataFrame shapes are not equal, '
                             f'{df1.shape} != {df2.shape}')

    for i, col in enumerate(df1.columns):
        if df2.columns[i] != col:
            raise AssertionError(
                f'column number {i} of the left DataFrame does not match '
                f'the right: {col} != {df2.columns[i]}')

        kind1, loc1 = df1._get_col_dtype_loc(col)  # type: str, int
        arr1: ndarray = df1._data[kind1][:, loc1]

        kind2, loc2 = df2._get_col_dtype_loc(col)  # type: str, int
        arr2: ndarray = df2._data[kind2][:, loc2]

        if kind1 != kind2:
            dtype1 = utils.convert_kind_to_dtype(kind1)
            dtype2 = utils.convert_kind_to_dtype(kind2)
            raise AssertionError(f'The data types of column {col} are not '
                                 f'equal. {dtype1} != {dtype2}')

        if kind1 == 'S':
            srm1 = df1._str_reverse_map[loc1]
            srm2 = df2._str_reverse_map[loc2]
            if not va.is_equal_str_cat_array(arr1, arr2, srm1, srm2):
                raise AssertionError(
                    f'The values of column {col} are not equal')
        elif not _check_1d_arrays(arr1, arr2, kind1):
            raise AssertionError(f'The values of column {col} are not equal')
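
# A standalone sketch of the same column-by-column comparison, assuming a
# simplified block layout: one 2-D array per type kind plus a map from column
# name to its (kind, loc) pair. Every name below is an illustrative stand-in,
# not the library's API; the NaN-tolerant float compare mirrors what a helper
# like _check_1d_arrays is presumed to do.
import numpy as np


def toy_assert_frame_equal(data1, colmap1, data2, colmap2):
    if list(colmap1) != list(colmap2):
        raise AssertionError('column names or column order differ')
    for col, (kind1, loc1) in colmap1.items():
        kind2, loc2 = colmap2[col]
        if kind1 != kind2:
            raise AssertionError(f'dtypes of column {col} differ: '
                                 f'{kind1} != {kind2}')
        arr1 = data1[kind1][:, loc1]
        arr2 = data2[kind2][:, loc2]
        if kind1 == 'f':
            equal = np.allclose(arr1, arr2, equal_nan=True)
        else:
            equal = np.array_equal(arr1, arr2)
        if not equal:
            raise AssertionError(f'values of column {col} are not equal')


# two identical single-column frames stored as typed blocks
blocks = {'i': np.array([[1], [2], [3]])}
colmap = {'a': ('i', 0)}
toy_assert_frame_equal(blocks, colmap, blocks, colmap)
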
Example #2
def assert_frame_equal(df1: DataFrame, df2: DataFrame) -> None:
    if df1.shape != df2.shape:
        raise AssertionError('DataFrame shapes are not equal, '
                             f'{df1.shape} != {df2.shape}')

    for i, col in enumerate(df1.columns):
        if df2.columns[i] != col:
            raise AssertionError(
                f'column number {i} of the left DataFrame does not match '
                f'the right: {col} != {df2.columns[i]}')

        kind1: str
        loc1: int
        kind2: str
        loc2: int
        arr1: ndarray
        arr2: ndarray

        kind1, loc1, _ = df1._column_info[col].values
        arr1 = df1._data[kind1][:, loc1]

        kind2, loc2, _ = df2._column_info[col].values
        arr2 = df2._data[kind2][:, loc2]

        if kind1 != kind2:
            dtype1 = utils.convert_kind_to_dtype(kind1)
            dtype2 = utils.convert_kind_to_dtype(kind2)
            raise AssertionError(f'The data types of column {col} are not '
                                 f'equal. {dtype1} != {dtype2}')

        if not _check_1d_arrays(arr1, arr2, kind1):
            raise AssertionError(f'The values of column {col} are not equal')
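
# Example #2 unpacks (dtype kind, block location, order) from
# _column_info[col].values. A plausible minimal record for that triple is
# sketched below; it is an assumption about the shape of utils.Column, not its
# actual definition (Example #7 also reassigns .order after construction, so
# the attribute is kept writable here).
class ToyColumn:
    def __init__(self, dtype: str, loc: int, order: int) -> None:
        self.dtype = dtype   # type-kind code, e.g. 'i', 'f', 'O', 'M', 'm'
        self.loc = loc       # column index inside that kind's 2-D block
        self.order = order   # position of the column in the visible frame

    @property
    def values(self):
        # matches the unpacking pattern: dtype, loc, order = info.values
        return self.dtype, self.loc, self.order


kind, loc, order = ToyColumn('f', 3, 0).values  # -> ('f', 3, 0)
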
Example #3
    def _cov_corr(self, name: str) -> DataFrame:
        calc_columns: List[str] = []
        calc_dtype_loc: List[Tuple[str, int]] = []
        np_dtype = 'int64'
        for col in self._df._columns:
            if col in self._group_columns:
                continue
            dtype, loc, _ = self._df._column_info[col].values
            if dtype in 'fib':
                if dtype == 'f':
                    np_dtype = 'float64'
                calc_columns.append(col)
                calc_dtype_loc.append((dtype, loc))

        data = self._df._values_number_drop(calc_columns, calc_dtype_loc,
                                            np_dtype)
        dtype_word = utils.convert_kind_to_dtype(data.dtype.kind)
        func = getattr(_gb, name + '_' + dtype_word)
        result = func(self._group_labels, len(self), data, [])

        data_dict = self._get_group_col_data()
        data_dict_final: Dict[str, List[ndarray]] = defaultdict(list)
        for dtype, arrs in data_dict.items():
            data_dict_final[dtype] = [
                np.repeat(arrs[0], len(calc_columns), axis=0)
            ]

        new_column_info = self._get_new_column_info()
        num_group_cols = len(self._group_columns)
        new_columns = self._group_columns.copy()

        cur_obj_loc = utils.get_num_cols(data_dict_final.get('O', []))
        column_name_array = np.tile(calc_columns, len(self))[:, np.newaxis]
        data_dict_final['O'].append(column_name_array)
        new_columns.append('Column Name')
        new_column_info['Column Name'] = utils.Column('O', cur_obj_loc,
                                                      num_group_cols)

        cur_loc = utils.get_num_cols(data_dict_final.get('f', []))

        for i, col in enumerate(calc_columns):
            new_column_info[col] = utils.Column('f', i + cur_loc,
                                                i + num_group_cols + 1)
            new_columns.append(col)

        data_dict_final['f'].append(result)
        new_data = utils.concat_stat_arrays(data_dict_final)

        return DataFrame._construct_from_new(new_data, new_column_info,
                                             new_columns)
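
# A standalone sketch of the long-format result _cov_corr assembles: each
# group's correlation matrix is stacked row-wise, the group key is repeated
# once per calculated column, and the calculated column names are tiled
# alongside. Plain numpy (np.corrcoef) stands in for the compiled _gb kernel;
# the data and column names are hypothetical.
import numpy as np

group_labels = np.array([0, 0, 1, 1, 1])
values = np.array([[1.0, 2.0],
                   [2.0, 4.0],
                   [3.0, 5.0],
                   [4.0, 4.0],
                   [5.0, 3.0]])               # numeric columns 'x' and 'y'
calc_columns = ['x', 'y']

per_group = [np.corrcoef(values[group_labels == g], rowvar=False)
             for g in np.unique(group_labels)]
result = np.vstack(per_group)                 # shape (n_groups * n_cols, n_cols)

group_keys = np.repeat(np.unique(group_labels), len(calc_columns))
column_names = np.tile(calc_columns, len(per_group))
# each output row pairs (group key, source column name) with its correlations
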
Example #4
    def _roll_agg(self,
                  agg_cols: Dict = None,
                  new_names: Dict = None,
                  new_order: Dict = None,
                  num_agg_cols: int = None,
                  func_kwargs: Dict = None) -> DataFrame:

        col_info = self._df._column_info
        kept_dtype_loc = defaultdict(list)
        new_column_info = {}
        dtype_ct = defaultdict(int)
        for i, col in enumerate(self._kept_columns):
            dtype, loc, _ = col_info[col].values
            new_loc = len(kept_dtype_loc[dtype])
            kept_dtype_loc[dtype].append(loc)
            new_column_info[col] = utils.Column(dtype, new_loc, i)
            dtype_ct[dtype] += 1

        data_dict = defaultdict(list)
        new_columns = self._kept_columns.copy() + [''] * num_agg_cols

        for name, cols in agg_cols.items():

            agg_dtype_locs = defaultdict(list)
            agg_dtype_names = defaultdict(list)
            agg_dtype_new_names = defaultdict(list)
            agg_dtype_order = defaultdict(list)
            non_agg_dtype_locs = defaultdict(list)
            agg_dtype_kwargs = defaultdict(list)

            if isinstance(name, str):
                # name can also be a custom function
                name_kwargs = get_func_kwargs(name)
                ignore_str = name_kwargs.get('ignore_str', True)
                ignore_date = name_kwargs.get('ignore_date', True)
                keep_date_type = name_kwargs.get('keep_date_type', True)
            else:
                ignore_str = False
                ignore_date = False
                keep_date_type = True

            cur_new_names = new_names[name]
            cur_new_order = new_order[name]
            kwargs_list = func_kwargs[name]

            for col in self._df._columns:

                dtype, loc, _ = self._df._column_info[col].values
                try:
                    idx = cols.index(col)
                except ValueError:
                    non_agg_dtype_locs[dtype].append(loc)
                else:
                    agg_dtype_locs[dtype].append(loc)
                    agg_dtype_names[dtype].append(col)
                    agg_dtype_new_names[dtype].append(cur_new_names[idx])
                    agg_dtype_order[dtype].append(cur_new_order[idx])
                    agg_dtype_kwargs[dtype].append(kwargs_list[idx])

            for dtype, data in self._df._data.items():
                if dtype not in agg_dtype_locs:
                    continue
                if ignore_str and dtype == 'O':
                    continue
                if ignore_date and dtype in 'mM':
                    continue

                if dtype in 'mM':
                    data = data.view('int64')

                kwargs = {}
                for kw in agg_dtype_kwargs[dtype]:
                    if kw is not None:
                        kwargs = kw
                        break

                if isinstance(name, str):
                    func_name = name + '_' + utils.convert_kind_to_dtype_generic(
                        dtype)
                else:
                    func_name = 'custom_' + utils.convert_kind_to_dtype_generic(
                        dtype)
                    # 'name' is actually a function here
                    kwargs['func'] = name
                    kwargs['col_dict'] = dict(
                        zip(agg_dtype_locs[dtype], agg_dtype_names[dtype]))

                func = getattr(_roll, func_name)

                arr = func(data, np.array(agg_dtype_locs[dtype]), self._left,
                           self._right, self._min_window, **kwargs)

                if dtype in 'mM' and keep_date_type:
                    new_kind = dtype
                    arr = arr.astype(utils.convert_kind_to_dtype(dtype))
                else:
                    new_kind = arr.dtype.kind

                cur_loc = utils.get_num_cols(data_dict.get(
                    new_kind, [])) + dtype_ct[new_kind]
                data_dict[new_kind].append(arr)

                old_locs = agg_dtype_locs[dtype]
                order = np.argsort(old_locs).tolist()

                cur_names = np.array(agg_dtype_new_names[dtype])[order]
                cur_order = len(self._kept_columns) + np.array(
                    agg_dtype_order[dtype])[order]

                for i, cur_name in enumerate(cur_names):
                    new_column_info[cur_name] = utils.Column(
                        new_kind, cur_loc + i, cur_order[i])
                    new_columns[cur_order[i]] = cur_name

        new_data = {}
        for dtype, locs in kept_dtype_loc.items():
            data = self._df._data[dtype][:, locs]
            if data.ndim == 1:
                data = data[:, np.newaxis]
            new_data[dtype] = data

        for dtype, data in data_dict.items():
            if dtype not in new_data:
                new_data[dtype] = np.column_stack((*data, ))
            else:
                new_data[dtype] = np.column_stack((new_data[dtype], *data))

        return DataFrame._construct_from_new(
            new_data, new_column_info, np.asarray(new_columns, dtype='O'))
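
# A pure-Python stand-in for the compiled _roll kernels dispatched above: for
# each row i it aggregates rows [i + left, i + right) and emits NaN when fewer
# than min_window in-bounds values are available. The meaning of self._left,
# self._right and self._min_window is inferred from how _roll_agg passes them,
# so treat the offset semantics as an assumption.
import numpy as np


def toy_roll_sum(values, left, right, min_window):
    n = len(values)
    out = np.full(n, np.nan)
    for i in range(n):
        lo = max(i + left, 0)
        hi = min(i + right, n)
        if hi - lo >= min_window:
            out[i] = values[lo:hi].sum()
    return out


# trailing window of size 3: left=-2, right=+1 relative to the current row
toy_roll_sum(np.arange(6, dtype='float64'), left=-2, right=1, min_window=3)
# -> [nan, nan, 3., 6., 9., 12.]
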
Example #5
    def _single_agg(self,
                    agg_cols: Dict = None,
                    new_names: Dict = None,
                    new_order: Dict = None,
                    num_agg_cols: int = None,
                    func_kwargs: Dict = None) -> DataFrame:

        labels = self._group_labels
        size = len(self._group_position)

        data_dict = self._get_group_col_data()
        new_column_info = self._get_new_column_info()
        new_columns = self._group_columns.copy() + [''] * num_agg_cols

        for name, cols in agg_cols.items():

            agg_dtype_locs = defaultdict(list)
            agg_dtype_names = defaultdict(list)
            agg_dtype_new_names = defaultdict(list)
            agg_dtype_order = defaultdict(list)
            non_agg_dtype_locs = defaultdict(list)
            agg_dtype_kwargs = defaultdict(list)

            if isinstance(name, str):
                # name can also be a custom function
                name_kwargs = get_func_kwargs(name)
                ignore_str = name_kwargs.get('ignore_str', True)
                add_positions = name_kwargs.get('add_positions', False)
                ignore_date = name_kwargs.get('ignore_date', True)
                keep_date_type = name_kwargs.get('keep_date_type', True)
            else:
                ignore_str = False
                add_positions = False
                ignore_date = False
                keep_date_type = True

            cur_new_names = new_names[name]
            cur_new_order = new_order[name]
            kwargs_list = func_kwargs[name]

            for col in self._df._columns:

                dtype, loc, _ = self._df._column_info[col].values
                try:
                    idx = cols.index(col)
                except ValueError:
                    non_agg_dtype_locs[dtype].append(loc)
                else:
                    agg_dtype_locs[dtype].append(loc)
                    agg_dtype_names[dtype].append(col)
                    agg_dtype_new_names[dtype].append(cur_new_names[idx])
                    agg_dtype_order[dtype].append(cur_new_order[idx])
                    agg_dtype_kwargs[dtype].append(kwargs_list[idx])

            for dtype, data in self._df._data.items():
                if dtype not in agg_dtype_locs:
                    continue
                if ignore_str and dtype == 'O':
                    continue
                if ignore_date and dtype in 'mM':
                    continue

                if dtype in 'mM':
                    data = data.view('int64')

                kwargs = {}
                for kw in agg_dtype_kwargs[dtype]:
                    if kw is not None:
                        kwargs = kw
                        break

                if isinstance(name, str):
                    func_name = name + '_' + utils.convert_kind_to_dtype_generic(
                        dtype)
                else:
                    func_name = 'custom_' + utils.convert_kind_to_dtype_generic(
                        dtype)
                    # 'name' is actually a function here
                    kwargs['func'] = name
                    kwargs['col_dict'] = dict(
                        zip(agg_dtype_locs[dtype], agg_dtype_names[dtype]))

                func = getattr(_gb, func_name)

                if add_positions:
                    arr = func(labels, size, data, non_agg_dtype_locs[dtype],
                               self._group_position, **kwargs)
                else:
                    arr = func(labels, size, data, non_agg_dtype_locs[dtype],
                               **kwargs)

                if dtype in 'mM' and keep_date_type:
                    new_kind = dtype
                    arr = arr.astype(utils.convert_kind_to_dtype(dtype))
                else:
                    new_kind = arr.dtype.kind

                cur_loc = utils.get_num_cols(data_dict.get(new_kind, []))
                data_dict[new_kind].append(arr)

                old_locs = agg_dtype_locs[dtype]
                order = np.argsort(old_locs).tolist()

                cur_names = np.array(agg_dtype_new_names[dtype])[order]
                cur_order = len(self._group_columns) + np.array(
                    agg_dtype_order[dtype])[order]

                for i, cur_name in enumerate(cur_names):
                    new_column_info[cur_name] = utils.Column(
                        new_kind, cur_loc + i, cur_order[i])
                    new_columns[cur_order[i]] = cur_name

        new_data = utils.concat_stat_arrays(data_dict)
        new_columns = np.array(new_columns, dtype='O')
        return DataFrame._construct_from_new(new_data, new_column_info,
                                             new_columns)
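
# A numpy stand-in for one grouped kernel call made by _single_agg: given
# integer group labels and a 2-D numeric block, it emits one row per group.
# np.add.at plays the role of a compiled _gb sum routine; the excluded-location
# argument the real kernels take is omitted here for brevity.
import numpy as np


def toy_group_sum(labels, size, data):
    out = np.zeros((size, data.shape[1]))
    np.add.at(out, labels, data)      # scatter-add each row into its group
    return out


labels = np.array([0, 1, 0, 1, 1])
block = np.array([[1.0, 10.0],
                  [2.0, 20.0],
                  [3.0, 30.0],
                  [4.0, 40.0],
                  [5.0, 50.0]])
toy_group_sum(labels, size=2, data=block)
# group 0 -> [4., 40.], group 1 -> [11., 110.]
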
Example #6
    def _create_groups(
            self, columns: Union[str, List[str]]) -> Tuple[ndarray, ndarray]:
        self._group_dtype_loc: Dict[str, List[int]] = defaultdict(list)
        self._column_info: ColInfoT = {}
        for i, col in enumerate(columns):
            col_obj = self._df._column_info[col]
            dtype, loc, _ = col_obj.values  # type: str, int, int
            cur_loc = len(self._group_dtype_loc[dtype])
            self._group_dtype_loc[dtype].append(loc)
            self._column_info[col] = utils.Column(dtype, cur_loc, i)

        if len(columns) == 1:
            # since there is just one column, dtype is from the for-loop
            final_arr = self._df._data[dtype][:, loc]
            if dtype in 'mM':
                final_arr = final_arr.view('int64')
            dtype = final_arr.dtype.kind
            func_name = 'get_group_assignment_' + utils.convert_kind_to_dtype(
                dtype) + '_1d'
            return getattr(_gb, func_name)(final_arr)
        elif (len(self._group_dtype_loc) == 1
              or 'O' not in self._group_dtype_loc):
            arrs = []
            for dtype, locs in self._group_dtype_loc.items():
                arr = self._df._data[dtype][:, locs]
                if dtype in 'mM':
                    arr = arr.view('int64')
                arrs.append(arr)
            if len(arrs) == 1:
                final_arr = arrs[0]
            else:
                final_arr = np.column_stack(arrs)

            dtype = final_arr.dtype.kind
            func_name = 'get_group_assignment_' + utils.convert_kind_to_dtype(
                dtype) + '_2d'
            final_arr = np.ascontiguousarray(final_arr)
            return getattr(_gb, func_name)(final_arr)
        else:
            arrs = []
            for dtype, locs in self._group_dtype_loc.items():
                if dtype == 'O':
                    arr_str = self._df._data['O'][:, locs]
                else:
                    arr = self._df._data[dtype][:, locs]
                    if dtype in 'mM':
                        arr = arr.view('int64')
                    arrs.append(arr)
            if len(arrs) == 1:
                arr_numbers = arrs[0]
            else:
                arr_numbers = np.column_stack(arrs)

            dtype = arr_numbers.dtype.kind
            if arr_str.shape[1] == 1:
                arr_str = arr_str[:, 0]
            if arr_numbers.shape[1] == 1:
                arr_numbers = arr_numbers[:, 0]

            str_ndim = str(arr_str.ndim) + 'd_'
            num_ndim = str(arr_numbers.ndim) + 'd'
            dtype_str = utils.convert_kind_to_dtype(dtype) + '_'
            func_name = 'get_group_assignment_str_' + str_ndim + dtype_str + num_ndim
            arr_numbers = np.ascontiguousarray(arr_numbers)
            return getattr(_gb, func_name)(arr_str, arr_numbers)
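
# A pure-Python stand-in for the compiled get_group_assignment_* routines that
# _create_groups dispatches to: it returns an integer label per row plus the
# row position where each group first appears, which is how _group_labels and
# _group_position appear to be used elsewhere (that pairing is an assumption).
import numpy as np


def toy_group_assignment(keys_2d):
    labels = np.empty(len(keys_2d), dtype='int64')
    first_position = []
    seen = {}
    for i, row in enumerate(map(tuple, keys_2d)):
        if row not in seen:
            seen[row] = len(seen)
            first_position.append(i)
        labels[i] = seen[row]
    return labels, np.array(first_position, dtype='int64')


keys = np.array([['a', 'x'], ['b', 'x'], ['a', 'x'], ['a', 'y']], dtype='O')
labels, position = toy_group_assignment(keys)
# labels -> [0, 1, 0, 2], position -> [0, 1, 3]
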
Example #7
    def _group_agg(self,
                   name: str,
                   ignore_str: bool = True,
                   add_positions: bool = False,
                   keep_group_cols: bool = True,
                   ignore_date: bool = True,
                   keep_date_type: bool = True,
                   **kwargs) -> DataFrame:
        labels = self._group_labels
        size = len(self._group_position)

        old_dtype_col: Dict[str, List[str]] = defaultdict(list)
        for col, col_obj in self._df._column_info.items():
            if col not in self._group_columns:
                old_dtype_col[col_obj.dtype].append(col)

        if keep_group_cols:
            data_dict = self._get_group_col_data()
            new_column_info = self._get_new_column_info()
            new_columns = self._group_columns.copy()
        else:
            data_dict = defaultdict(list)
            new_column_info = {}
            new_columns = []

        for dtype, data in self._df._data.items():
            if ignore_str and dtype == 'O':
                continue
            if ignore_date and dtype in 'mM':
                continue
            # locations of the grouping columns within this kind's block
            group_locs: list = self._group_dtype_loc.get(dtype, [])
            if len(group_locs) == data.shape[1]:
                # every column of this kind is a grouping column
                continue

            func_name = name + '_' + utils.convert_kind_to_dtype_generic(
                dtype)
            func = getattr(_gb, func_name)
            if dtype in 'mM':
                data = data.view('int64')

            if add_positions:
                arr = func(labels, size, data, group_locs,
                           self._group_position, **kwargs)
            else:
                arr = func(labels, size, data, group_locs, **kwargs)

            if dtype in 'mM' and keep_date_type:
                new_kind = dtype
                arr = arr.astype(utils.convert_kind_to_dtype(dtype))
            else:
                new_kind = arr.dtype.kind
            cur_loc = utils.get_num_cols(data_dict.get(new_kind, []))
            data_dict[new_kind].append(arr)

            for col in old_dtype_col[dtype]:
                count_less = 0
                _, old_loc, _ = self._df._column_info[col].values
                for k in self._group_dtype_loc.get(dtype, []):
                    count_less += old_loc > k

                new_column_info[col] = utils.Column(
                    new_kind, cur_loc + old_loc - count_less, 0)

        i = len(new_columns)
        j = 0
        for col in self._df._columns:
            if col not in new_column_info:
                continue
            if col in self._group_columns and keep_group_cols:
                new_column_info[col].order = j
                j += 1
                continue

            new_columns.append(col)
            new_column_info[col].order = i
            i += 1

        new_data = utils.concat_stat_arrays(data_dict)

        return DataFrame._construct_from_new(new_data, new_column_info,
                                             new_columns)
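
# _group_agg re-derives each surviving column's location inside the new typed
# block as cur_loc + old_loc - count_less, where count_less is how many
# grouping columns of the same kind sat to its left in the old block. A small
# standalone check of the old_loc - count_less packing (the cur_loc base
# offset is left out), with hypothetical locations:
group_locs = [1, 3]      # locations of the group-by columns within one block
old_locs = [0, 2, 4]     # locations of the remaining columns of that kind
new_locs = [loc - sum(loc > g for g in group_locs) for loc in old_locs]
# new_locs -> [0, 1, 2]: the surviving columns pack into a dense block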