Example #1
    def _create_df_multiple_dtypes(self, arr_new, columns, column_locs, columns_other, locs_other):
        new_data = {}
        dtype_new = arr_new.dtype.kind
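        # number of columns already stored under the new dtype; the converted
        # columns are appended after them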
        try:
            add_loc = self._df._data[dtype_new].shape[1]
        except KeyError:
            add_loc = 0
        for dtype, arr in self._df._data.items():
            if dtype == self._dtype_acc:
                new_data[self._dtype_acc] = arr[:, locs_other]
            elif dtype == dtype_new:
                new_data[dtype_new] = np.asfortranarray(np.column_stack((arr, arr_new)))
            else:
                new_data[dtype] = arr.copy('F')

        if dtype_new not in new_data:
            new_data[dtype_new] = arr_new

        new_column_info = {}
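        # columns of other dtypes keep their existing dtype, location, and order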
        for col, old_dtype, loc, order in self._df._col_info_iter(with_order=True):  # type: str, str, int, int
            if old_dtype != self._dtype_acc:
                new_column_info[col] = utils.Column(old_dtype, loc, order)

        # str columns that have changed type
        for i, (col, loc) in enumerate(zip(columns, column_locs)):
            order = self._df._column_info[col].order
            new_column_info[col] = utils.Column(dtype_new, add_loc + i, order)

        # those that stayed self._dtype_acc
        for i, col in enumerate(columns_other):
            order = self._df._column_info[col].order
            new_column_info[col] = utils.Column(self._dtype_acc, i, order)

        return self._df._construct_from_new(new_data, new_column_info, self._df._columns.copy())
Example #2
    def _create_df_all(self, arr, dtype):
        new_data = {}
        if dtype == self._dtype_acc:
            for old_dtype, old_data in self._df._data.items():
                if old_dtype == self._dtype_acc:
                    new_data[self._dtype_acc] = arr
                else:
                    new_data[old_dtype] = old_data.copy('F')
            new_column_info = self._df._copy_column_info()
        else:
            new_data = {}
            add_loc = 0
            if dtype in self._df._data:
                add_loc = self._df._data[dtype].shape[1]
            for old_dtype, old_data in self._df._data.items():
                if old_dtype != self._dtype_acc:
                    new_data[old_dtype] = old_data.copy('F')

            if dtype in new_data:
                new_data[dtype] = np.asfortranarray(
                    np.column_stack((new_data[dtype], arr)))
            else:
                new_data[dtype] = arr

            new_column_info = {}
            for col, col_obj in self._df._column_info.items():
                old_dtype, loc, order = col_obj.values
                if old_dtype == self._dtype_acc:
                    new_column_info[col] = utils.Column(
                        dtype, loc + add_loc, order)
                else:
                    new_column_info[col] = utils.Column(old_dtype, loc, order)

        return self._df._construct_from_new(new_data, new_column_info,
                                            self._df._columns.copy())
Example #3
    def _create_df_all(self, arr, dtype):
        new_data = {}
        if dtype == 'S':
            for old_dtype, old_data in self._df._data.items():
                if old_dtype == 'S':
                    new_data['S'] = arr
                else:
                    new_data[old_dtype] = old_data.copy('F')
            new_column_info = self._df._copy_column_info()
        else:
            new_data = {}
            add_loc = 0
            if dtype in self._df._data:
                add_loc = self._df._data[dtype].shape[1]
            for old_dtype, old_data in self._df._data.items():
                if old_dtype != 'S':
                    new_data[old_dtype] = old_data.copy('F')

            if dtype in new_data:
                new_data[dtype] = np.asfortranarray(
                    np.column_stack((new_data[dtype], arr)))
            else:
                new_data[dtype] = arr

            new_column_info = {}
            for col, old_dtype, loc, order in self._df._col_info_iter(
                    with_order=True):  # type: str, str, int, int
                if old_dtype == 'S':
                    new_column_info[col] = utils.Column(
                        dtype, loc + add_loc, order)
                else:
                    new_column_info[col] = utils.Column(old_dtype, loc, order)

        return self._df._construct_from_new(new_data, new_column_info,
                                            self._df._columns.copy())
Example #4
    def apply(self, func, *args, **kwargs):
        if not isinstance(func, Callable):
            raise TypeError(
                'The `func` variable must be a function or any callable object'
            )
        labels = self._group_labels
        size = len(self._group_position)
        new_data, new_column_info, new_columns, group_repeats = _gb.apply(
            labels, size, self._df, func, *args, **kwargs)

        grouped_data_dict = self._get_group_col_data()
        grouped_column_info = self._get_new_column_info()
        grouped_columns = self._group_columns.copy()
        order_add = len(grouped_columns)

        new_column_info_final = {}
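        # result columns go to the right of the grouping columns: shift each location past
        # the grouped columns of the same dtype and each order past all grouping columns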
        for col in new_columns:
            dtype, loc, order = new_column_info[col].values
            loc_add = grouped_data_dict.get(dtype, 0)
            if loc_add != 0:
                loc_add = loc_add[0].shape[1]
            new_column_info_final[col] = utils.Column(dtype, loc + loc_add,
                                                      order + order_add)

        new_grouped_columns = []
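        # grouping columns whose names collide with result columns get a '_group' suffix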
        for col in grouped_columns:
            if col in new_column_info_final:
                new_grouped_columns.append(col + '_group')
            else:
                new_grouped_columns.append(col)

        dtype_loc = defaultdict(int)
        for i, col in enumerate(grouped_columns):
            dtype = grouped_column_info[col].dtype
            loc = dtype_loc[dtype]
            new_col = new_grouped_columns[i]
            new_column_info_final[new_col] = utils.Column(dtype, loc, i)
            dtype_loc[dtype] += 1

        new_columns = np.concatenate((new_grouped_columns, new_columns))

        for dtype, data_list in grouped_data_dict.items():
            data = np.concatenate(data_list, 1)
            data = np.repeat(data, group_repeats, axis=0)
            if dtype not in new_data:
                new_data[dtype] = data
            else:
                new_data[dtype] = np.concatenate((data, new_data[dtype]), 1)

        return DataFrame._construct_from_new(new_data, new_column_info_final,
                                             new_columns)
Example #5
    def _cov_corr(self, name: str) -> DataFrame:
        calc_columns: List[str] = []
        calc_dtype_loc: List[Tuple[str, int]] = []
        np_dtype = 'int64'
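        # collect the boolean/int/float columns that take part in the calculation; use
        # float64 for the working array if any float column is present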
        for col in self._df._columns:
            if col in self._group_columns:
                continue
            dtype, loc, order = self._df._column_info[col].values
            if dtype in 'fib':
                if dtype == 'f':
                    np_dtype = 'float64'
                calc_columns.append(col)
                calc_dtype_loc.append((dtype, loc))

        data = self._df._values_number_drop(calc_columns, calc_dtype_loc,
                                            np_dtype)
        dtype_word = utils.convert_kind_to_dtype(data.dtype.kind)
        func = getattr(_gb, name + '_' + dtype_word)
        result = func(self._group_labels, len(self), data, [])

        data_dict = self._get_group_col_data()
        data_dict_final: Dict[str, List[ndarray]] = defaultdict(list)
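        # repeat each group's key values once per calculated column so every row of the
        # result (one per group/column pair) keeps its group labels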
        for dtype, arrs in data_dict.items():
            data_dict_final[dtype] = [
                np.repeat(arrs[0], len(calc_columns), axis=0)
            ]

        new_column_info = self._get_new_column_info()
        num_group_cols = len(self._group_columns)
        new_columns = self._group_columns.copy()

        cur_obj_loc = utils.get_num_cols(data_dict_final.get('O', []))
        column_name_array = np.tile(calc_columns, len(self))[:, np.newaxis]
        data_dict_final['O'].append(column_name_array)
        new_columns.append('Column Name')
        new_column_info['Column Name'] = utils.Column('O', cur_obj_loc,
                                                      num_group_cols)

        cur_loc = utils.get_num_cols(data_dict_final.get('f', []))

        for i, col in enumerate(calc_columns):
            new_column_info[col] = utils.Column('f', i + cur_loc,
                                                i + num_group_cols + 1)
            new_columns.append(col)

        data_dict_final['f'].append(result)
        new_data = utils.concat_stat_arrays(data_dict_final)

        return DataFrame._construct_from_new(new_data, new_column_info,
                                             new_columns)
Example #6
 def _create_df(self, arr, dtype, columns):
     new_data = {dtype: arr}
     new_column_info = {
         col: utils.Column(dtype, i, i)
         for i, col in enumerate(columns)
     }
     return self._df._construct_from_new(new_data, new_column_info, columns)
Example #7
 def size(self):
     name = self._get_agg_name('size')
     new_columns = np.array(self._group_columns + [name], dtype='O')
     size = _gb.size(self._group_labels,
                     len(self._group_position))[:, np.newaxis]
     data_dict = self._get_group_col_data()
     data_dict['i'].append(size)
     new_data = utils.concat_stat_arrays(data_dict)
     new_column_info = self._get_new_column_info()
     new_column_info[name] = utils.Column('i', new_data['i'].shape[1] - 1,
                                          len(new_columns) - 1)
     return DataFrame._construct_from_new(new_data, new_column_info,
                                          new_columns)
Example #8
 def cumcount(self) -> DataFrame:
     # todo: add ascending=False
     name = self._get_agg_name('cumcount')
     new_columns = np.array(self._group_columns + [name], dtype='O')
     cumcount = _gb.cumcount(self._group_labels,
                             len(self._group_position))[:, np.newaxis]
     data_dict = self._get_group_col_data_all()
     data_dict['i'].append(cumcount)
     new_data = utils.concat_stat_arrays(data_dict)
     new_column_info = self._get_new_column_info()
     new_column_info[name] = utils.Column('i', new_data['i'].shape[1] - 1,
                                          len(new_columns) - 1)
     return DataFrame._construct_from_new(new_data, new_column_info,
                                          new_columns)
Example #9
def read_csv(fp, sep=',', header=0, skiprows=None, usecols=None):
    if not isinstance(sep, str):
        raise TypeError('`sep` must be a string')
    if len(sep) != 1:
        raise ValueError('`sep` must only be one character in length')
    if not isinstance(header, int):
        raise TypeError('`header` must be an integer')
    if header < -1:
        raise ValueError('`header` must be greater than or equal to -1')

    if isinstance(usecols, list):
        if len(usecols) == 0:
            raise ValueError('`usecols` must be a non-empty list of integers or column names')
    elif usecols is not None:
        raise TypeError('`usecols` must be a list of integers or column names')

    nrows = _get_file_legnth(fp)

    skiprows_set = set()
    skiprows_int = 0
    if skiprows is None:
        pass
    elif isinstance(skiprows, int):
        if skiprows < 0:
            raise ValueError('`skiprows` must be one or more non-negative integers')
        skiprows_int = skiprows
    else:
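        # `skiprows` is a sequence: translate the requested header index into the physical
        # row number of the file, accounting for any skipped rows that come before it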
        skiprows_arr = np.asarray(skiprows)
        if (skiprows_arr < 0).any():
            raise ValueError('All values in the `skiprows` sequence must be >= 0')
        if header == -1:
            skiprows_set = set(skiprows_arr)
        else:
            max_row = skiprows_arr.max()
            if header > max_row - len(skiprows_arr):
                header += len(skiprows_arr)
            else:
                max_rows = np.arange(max_row)
                kept_rows = max_rows[~np.isin(max_rows, skiprows_arr)]
                header = kept_rows[header]
                skiprows_set = set(skiprows_arr[skiprows_arr > header])

    tuple_return = _rf.read_csv(fp, nrows, ord(sep), header, skiprows_int, skiprows_set, usecols)

    a_bool, a_int, a_float, a_str, columns, dtypes, dtype_loc = tuple_return

    new_column_info = {}
    dtype_map = {1: 'b', 2: 'i', 3: 'f', 4: 'O'}
    final_dtype_locs = defaultdict(list)
    for i, (col, dtype, loc) in enumerate(zip(columns, dtypes, dtype_loc)):
        new_column_info[col] = utils.Column(dtype_map[dtype], loc, i)
        final_dtype_locs[dtype_map[dtype]].append(loc)

    new_data = {}
    loc_order_changed = set()
    for arr, dtype in zip((a_bool, a_int, a_float, a_str), ('b', 'i', 'f', 'O')):
        num_cols = arr.shape[1]
        if num_cols != 0:
            locs = final_dtype_locs[dtype]
            if len(locs) == num_cols:
                new_data[dtype] = arr
            else:
                loc_order_changed.add(dtype)
                new_data[dtype] = arr[:, locs]

    if loc_order_changed:
        cur_dtype_loc = defaultdict(int)
        for col in columns:
            dtype, loc, order = new_column_info[col].values
            if dtype in loc_order_changed:
                new_column_info[col].loc = cur_dtype_loc[dtype]
                cur_dtype_loc[dtype] += 1
    new_columns = np.array(columns, dtype='O')
    return DataFrame._construct_from_new(new_data, new_column_info, new_columns)
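A minimal call sketch for the reader above. The `dexplo` import name, the file name, and the column names are assumptions for illustration; the keyword arguments mirror the signature shown:

    import dexplo as dx

    # row 0 holds the column names; rows 3 and 5 are skipped and only two columns are kept
    df = dx.read_csv('data.csv', sep=',', header=0, skiprows=[3, 5], usecols=['a', 'b'])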
Example #10
    def _roll_generic(self, name, columns, **kwargs):
        if columns is None:
            columns = self._df.columns
        elif isinstance(columns, str):
            columns = [columns]
        elif not isinstance(columns, list):
            raise TypeError(
                '`columns` must either be a string, a list of column names, or None'
            )

        col_order = dict(zip(columns, range(len(columns))))

        dtype_locs = defaultdict(list)
        dtype_cols = defaultdict(list)
        col_info = self._df._column_info
        for i, col in enumerate(columns):
            try:
                dtype, loc, order = col_info[col].values
            except KeyError:
                raise KeyError(f'{col} is not a column name')

            dtype_locs[dtype].append(loc)
            dtype_cols[dtype].append(col)

        kept_dtype_loc = defaultdict(list)
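        # kept (non-rolled) columns are re-packed first and get fresh consecutive
        # per-dtype locations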
        new_col_info = {}
        dtype_ct = defaultdict(int)
        for i, col in enumerate(self._kept_columns):
            dtype, loc, _ = col_info[col].values
            new_loc = len(kept_dtype_loc[dtype])
            kept_dtype_loc[dtype].append(loc)
            new_col_info[col] = utils.Column(dtype, new_loc, i)
            dtype_ct[dtype] += 1

        data_dict = defaultdict(list)
        for dtype, locs in dtype_locs.items():
            func_name = name + '_' + utils.convert_kind_to_dtype_generic(dtype)
            data = self._df._data[dtype]
            result = getattr(_roll, func_name)(data, np.array(locs),
                                               self._left, self._right,
                                               self._min_window, **kwargs)
            result_dtype = result.dtype.kind
            data_dict[result_dtype].append(result)
            for col in dtype_cols[dtype]:
                order = col_order[col]
                new_col = col
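                # a kept column that is also rolled gets a '_rolling' suffix to avoid a name clash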
                if col in self._kept_columns:
                    new_col = col + '_rolling'
                    columns[columns.index(col)] = new_col
                new_col_info[new_col] = utils.Column(
                    result_dtype, dtype_ct[result_dtype],
                    order + len(self._kept_columns))
                dtype_ct[result_dtype] += 1

        new_data = {}
        for dtype, locs in kept_dtype_loc.items():
            data = self._df._data[dtype][:, locs]
            if data.ndim == 1:
                data = data[:, np.newaxis]
            new_data[dtype] = data

        for dtype, data in data_dict.items():
            if dtype not in new_data:
                new_data[dtype] = np.column_stack((*data, ))
            else:
                new_data[dtype] = np.column_stack((new_data[dtype], *data))

        new_columns = np.concatenate((self._kept_columns, columns))
        return DataFrame._construct_from_new(new_data, new_col_info,
                                             new_columns)
Example #11
    def _roll_agg(self,
                  agg_cols: Dict = None,
                  new_names: Dict = None,
                  new_order: Dict = None,
                  num_agg_cols: int = None,
                  func_kwargs: Dict = None):

        col_info = self._df._column_info
        kept_dtype_loc = defaultdict(list)
        new_column_info = {}
        dtype_ct = defaultdict(int)
        for i, col in enumerate(self._kept_columns):
            dtype, loc, _ = col_info[col].values
            new_loc = len(kept_dtype_loc[dtype])
            kept_dtype_loc[dtype].append(loc)
            new_column_info[col] = utils.Column(dtype, new_loc, i)
            dtype_ct[dtype] += 1

        data_dict = defaultdict(list)
        new_columns = self._kept_columns.copy() + [''] * num_agg_cols

        for name, agg_cols in agg_cols.items():
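            # `name` is an aggregation (a function name or a callable); `agg_cols` lists the
            # columns it applies to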

            agg_dtype_locs = defaultdict(list)
            agg_dtype_names = defaultdict(list)
            agg_dtype_new_names = defaultdict(list)
            agg_dtype_order = defaultdict(list)
            non_agg_dtype_locs = defaultdict(list)
            agg_dtype_kwargs = defaultdict(list)

            if isinstance(name, str):
                # name can also be a custom function
                name_kwargs = get_func_kwargs(name)
                ignore_str = name_kwargs.get('ignore_str', True)
                ignore_date = name_kwargs.get('ignore_date', True)
                keep_date_type = name_kwargs.get('keep_date_type', True)
            else:
                ignore_str = False
                ignore_date = False
                keep_date_type = True

            cur_new_names = new_names[name]
            cur_new_order = new_order[name]
            kwargs_list = func_kwargs[name]

            for col in self._df._columns:

                dtype, loc, _ = self._df._column_info[col].values
                try:
                    idx = agg_cols.index(col)
                except ValueError:
                    non_agg_dtype_locs[dtype].append(loc)
                else:
                    agg_dtype_locs[dtype].append(loc)
                    agg_dtype_names[dtype].append(col)
                    agg_dtype_new_names[dtype].append(cur_new_names[idx])
                    agg_dtype_order[dtype].append(cur_new_order[idx])
                    agg_dtype_kwargs[dtype].append(kwargs_list[idx])

            for dtype, data in self._df._data.items():
                if dtype not in agg_dtype_locs:
                    continue
                if ignore_str and dtype == 'O':
                    continue
                if ignore_date and dtype in 'mM':
                    continue

                if dtype in 'mM':
                    data = data.view('int64')

                kwargs = {}
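                # use the first explicit kwargs supplied for any column of this dtype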
                for kw in agg_dtype_kwargs[dtype]:
                    if kw is not None:
                        kwargs = kw
                        break

                if isinstance(name, str):
                    func_name = name + '_' + utils.convert_kind_to_dtype_generic(
                        dtype)
                else:
                    func_name = 'custom_' + utils.convert_kind_to_dtype_generic(
                        dtype)
                    # 'name' is actually a function here
                    kwargs['func'] = name
                    kwargs['col_dict'] = dict(
                        zip(agg_dtype_locs[dtype], agg_dtype_names[dtype]))

                func = getattr(_roll, func_name)

                arr = func(data, np.array(agg_dtype_locs[dtype]), self._left,
                           self._right, self._min_window, **kwargs)

                if dtype in 'mM' and keep_date_type:
                    new_kind = dtype
                    arr = arr.astype(utils.convert_kind_to_dtype(dtype))
                else:
                    new_kind = arr.dtype.kind

                cur_loc = utils.get_num_cols(data_dict.get(
                    new_kind, [])) + dtype_ct[new_kind]
                data_dict[new_kind].append(arr)

                old_locs = agg_dtype_locs[dtype]
                order = np.argsort(old_locs).tolist()

                cur_names = np.array(agg_dtype_new_names[dtype])[order]
                cur_order = len(self._kept_columns) + np.array(
                    agg_dtype_order[dtype])[order]

                for i, cur_name in enumerate(cur_names):
                    new_column_info[cur_name] = utils.Column(
                        new_kind, cur_loc + i, cur_order[i])
                    new_columns[cur_order[i]] = cur_name

        new_data = {}
        for dtype, locs in kept_dtype_loc.items():
            data = self._df._data[dtype][:, locs]
            if data.ndim == 1:
                data = data[:, np.newaxis]
            new_data[dtype] = data

        for dtype, data in data_dict.items():
            if dtype not in new_data:
                new_data[dtype] = np.column_stack((*data, ))
            else:
                new_data[dtype] = np.column_stack((new_data[dtype], *data))

        return DataFrame._construct_from_new(
            new_data, new_column_info, np.asarray(new_columns, dtype='O'))
Example #12
    def _single_agg(self,
                    agg_cols: Dict = None,
                    new_names: Dict = None,
                    new_order: Dict = None,
                    num_agg_cols: int = None,
                    func_kwargs: Dict = None) -> DataFrame:

        labels = self._group_labels
        size = len(self._group_position)

        data_dict = self._get_group_col_data()
        new_column_info = self._get_new_column_info()
        new_columns = self._group_columns.copy() + [''] * num_agg_cols

        for name, agg_cols in agg_cols.items():

            agg_dtype_locs = defaultdict(list)
            agg_dtype_names = defaultdict(list)
            agg_dtype_new_names = defaultdict(list)
            agg_dtype_order = defaultdict(list)
            non_agg_dtype_locs = defaultdict(list)
            agg_dtype_kwargs = defaultdict(list)

            if isinstance(name, str):
                # name can also be a custom function
                name_kwargs = get_func_kwargs(name)
                ignore_str = name_kwargs.get('ignore_str', True)
                add_positions = name_kwargs.get('add_positions', False)
                ignore_date = name_kwargs.get('ignore_date', True)
                keep_date_type = name_kwargs.get('keep_date_type', True)
            else:
                ignore_str = False
                add_positions = False
                ignore_date = False
                keep_date_type = True

            cur_new_names = new_names[name]
            cur_new_order = new_order[name]
            kwargs_list = func_kwargs[name]

            for col in self._df._columns:

                dtype, loc, _ = self._df._column_info[col].values
                try:
                    idx = agg_cols.index(col)
                except ValueError:
                    non_agg_dtype_locs[dtype].append(loc)
                else:
                    agg_dtype_locs[dtype].append(loc)
                    agg_dtype_names[dtype].append(col)
                    agg_dtype_new_names[dtype].append(cur_new_names[idx])
                    agg_dtype_order[dtype].append(cur_new_order[idx])
                    agg_dtype_kwargs[dtype].append(kwargs_list[idx])

            for dtype, data in self._df._data.items():
                if dtype not in agg_dtype_locs:
                    continue
                if ignore_str and dtype == 'O':
                    continue
                if ignore_date and dtype in 'mM':
                    continue

                if dtype in 'mM':
                    data = data.view('int64')

                kwargs = {}
                for kw in agg_dtype_kwargs[dtype]:
                    if kw is not None:
                        kwargs = kw
                        break

                if isinstance(name, str):
                    func_name = name + '_' + utils.convert_kind_to_dtype_generic(
                        dtype)
                else:
                    func_name = 'custom_' + utils.convert_kind_to_dtype_generic(
                        dtype)
                    # 'name' is actually a function here
                    kwargs['func'] = name
                    kwargs['col_dict'] = dict(
                        zip(agg_dtype_locs[dtype], agg_dtype_names[dtype]))

                func = getattr(_gb, func_name)

                if add_positions:
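                    # aggregations flagged with add_positions also receive the groups' row positions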
                    arr = func(labels, size, data, non_agg_dtype_locs[dtype],
                               self._group_position, **kwargs)
                else:
                    arr = func(labels, size, data, non_agg_dtype_locs[dtype],
                               **kwargs)

                if dtype in 'mM' and keep_date_type:
                    new_kind = dtype
                    arr = arr.astype(utils.convert_kind_to_dtype(dtype))
                else:
                    new_kind = arr.dtype.kind

                cur_loc = utils.get_num_cols(data_dict.get(new_kind, []))
                data_dict[new_kind].append(arr)

                old_locs = agg_dtype_locs[dtype]
                order = np.argsort(old_locs).tolist()

                cur_names = np.array(agg_dtype_new_names[dtype])[order]
                cur_order = len(self._group_columns) + np.array(
                    agg_dtype_order[dtype])[order]

                for i, cur_name in enumerate(cur_names):
                    new_column_info[cur_name] = utils.Column(
                        new_kind, cur_loc + i, cur_order[i])
                    new_columns[cur_order[i]] = cur_name

        new_data = utils.concat_stat_arrays(data_dict)
        new_columns = np.array(new_columns, dtype='O')
        return DataFrame._construct_from_new(new_data, new_column_info,
                                             new_columns)
Example #13
    def _str_generic_concat(self, name, column, keep, return_dtype, **kwargs):
        if not isinstance(keep, (bool, np.bool_)):
            raise TypeError('`keep` must be a boolean')

        if column is None:
            columns = []
            locs = []
            for col in self._df._columns:
                dtype, loc, _ = self._df._column_info[col].values
                if dtype == 'O':
                    columns.append(col)
                    locs.append(loc)
        else:
            columns, locs = self._validate_columns(column)

        data = self._df._data['O']

        count = 0
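        # how many columns of the result dtype the frame holds outside the converted
        # columns; forwarded to the string routine through kwargs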
        if return_dtype != 'O':
            if return_dtype in self._df._data:
                count = self._df._data[return_dtype].shape[1]
        else:
            count = self._df._data['O'].shape[1] - len(columns)

        kwargs['count'] = count

        final_arr, final_cols, group_len = getattr(_sf, name)(data[:, locs], **kwargs)
        dtype_new = final_arr.dtype.kind

        if len(columns) > 1:
            final_cols = np.repeat(columns, group_len).astype('O') + '_' + final_cols
        new_column_info = {}
        new_data = {}
        add_loc = 0
        add_order = 0

        if keep:
            df = self._df.drop(columns=columns)

            if dtype_new in df._data:
                add_loc = df._data[dtype_new].shape[1]
            add_order = df.shape[1]

            for dtype, arr in df._data.items():
                if dtype == dtype_new:
                    for i in range(arr.shape[1]):
                        final_arr[:, i] = arr[:, i]
                    new_data[dtype_new] = final_arr
                else:
                    new_data[dtype] = arr

            if dtype_new not in df._data:
                new_data[dtype_new] = final_arr

            new_column_info = df._copy_column_info()
            new_columns = np.concatenate((df._columns, final_cols))
        else:
            new_data = {dtype_new: final_arr}
            new_columns = final_cols

        for i, col in enumerate(final_cols):
            new_column_info[col] = utils.Column(dtype_new, i + add_loc, i + add_order)

        return self._df._construct_from_new(new_data, new_column_info, new_columns)
Example #14
    def _group_agg(self,
                   name: str,
                   ignore_str: bool = True,
                   add_positions: bool = False,
                   keep_group_cols: bool = True,
                   ignore_date: bool = True,
                   keep_date_type: bool = True,
                   **kwargs) -> DataFrame:
        labels = self._group_labels
        size = len(self._group_position)

        old_dtype_col: Dict[str, List[str]] = defaultdict(list)
        for col, col_obj in self._df._column_info.items():
            if col not in self._group_columns:
                old_dtype_col[col_obj.dtype].append(col)

        if keep_group_cols:
            data_dict = self._get_group_col_data()
            new_column_info = self._get_new_column_info()
            new_columns = self._group_columns.copy()
        else:
            data_dict = defaultdict(list)
            new_column_info = {}
            new_columns = []

        for dtype, data in self._df._data.items():
            if ignore_str and dtype == 'O':
                continue
            if ignore_date and dtype in 'mM':
                continue
            # number of grouped columns
            group_locs: list = self._group_dtype_loc.get(dtype, [])
            if len(group_locs) != data.shape[1]:
                func_name = name + '_' + utils.convert_kind_to_dtype_generic(
                    dtype)
                func = getattr(_gb, func_name)
                if dtype in 'mM':
                    data = data.view('int64')

                if add_positions:
                    arr = func(labels, size, data, group_locs,
                               self._group_position, **kwargs)
                else:
                    arr = func(labels, size, data, group_locs, **kwargs)
            else:
                continue

            if dtype in 'mM' and keep_date_type:
                new_kind = dtype
                arr = arr.astype(utils.convert_kind_to_dtype(dtype))
            else:
                new_kind = arr.dtype.kind
            cur_loc = utils.get_num_cols(data_dict.get(new_kind, []))
            data_dict[new_kind].append(arr)

            for col in old_dtype_col[dtype]:
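                # the grouping columns were removed from this dtype block, so shift each
                # remaining column's location left by the number of grouping columns before it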
                count_less = 0
                old_kind, old_loc, old_order = self._df._column_info[
                    col].values
                for k in self._group_dtype_loc.get(dtype, []):
                    count_less += old_loc > k

                new_column_info[col] = utils.Column(
                    new_kind, cur_loc + old_loc - count_less, 0)

        i = len(new_columns)
        j = 0
        for col in self._df._columns:
            if col not in new_column_info:
                continue
            if col in self._group_columns and keep_group_cols:
                new_column_info[col].order = j
                j += 1
                continue

            new_columns.append(col)
            new_column_info[col].order = i
            i += 1

        new_data = utils.concat_stat_arrays(data_dict)

        return DataFrame._construct_from_new(new_data, new_column_info,
                                             new_columns)
Example #15
 def _get_new_column_info(self) -> ColInfoT:
     new_column_info: ColInfoT = {}
     for col, col_obj in self._column_info.items():
         new_column_info[col] = utils.Column(*col_obj.values)
     return new_column_info
Example #16
    def _generic_concat(self, name, column, keep, **kwargs):
        if not isinstance(keep, (bool, np.bool_)):
            raise TypeError('`keep` must be a boolean')

        if column is None:
            columns = []
            locs = []
            for col in self._df._columns:
                dtype, loc, _ = self._df._column_info[col].values
                if dtype == self._dtype_acc:
                    columns.append(col)
                    locs.append(loc)
        else:
            columns, locs = self._validate_columns(column)

        data = self._df._data[self._dtype_acc]
        arrs = []
        all_cols = []
        for loc in locs:
            arr, new_columns = getattr(_sf, name)(data[:, loc], **kwargs)
            arrs.append(arr)
            all_cols.append(new_columns)

        dtype_new = arrs[0].dtype.kind

        if len(arrs) == 1:
            final_arr = arrs[0]
            final_cols = all_cols[0]
        else:
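            # several source columns: stack the results and suffix each generated name with
            # its source column so the new names stay unique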
            final_arr = np.column_stack(arrs)
            all_cols_new = []
            for cols, orig_name in zip(all_cols, columns):
                all_cols_new.append(cols + '_' + orig_name)
            final_cols = np.concatenate(all_cols_new)

        new_column_info = {}
        new_data = {}
        add_loc = 0
        add_order = 0
        if keep:
            df = self._df.drop(columns=columns)
            if dtype_new in df._data:
                add_loc = df._data[dtype_new].shape[1]
            add_order = df.shape[1]

            for dtype, arr in df._data.items():
                if dtype == dtype_new:
                    new_data[dtype_new] = np.column_stack((arr, final_arr))
                else:
                    new_data[dtype] = arr.copy('F')

            if dtype_new not in df._data:
                new_data[dtype_new] = final_arr

            new_column_info = df._copy_column_info()
            new_columns = np.concatenate((df._columns, final_cols))
        else:
            new_data = {dtype_new: final_arr}
            new_columns = final_cols

        for i, col in enumerate(final_cols):
            new_column_info[col] = utils.Column(dtype_new, i + add_loc,
                                                i + add_order)

        return self._df._construct_from_new(new_data, new_column_info,
                                            new_columns)
Example #17
    def _create_groups(
            self, columns: Union[str, List[str]]) -> Tuple[ndarray, ndarray]:
        self._group_dtype_loc: Dict[str, List[int]] = defaultdict(list)
        self._column_info: ColInfoT = {}
        for i, col in enumerate(columns):
            dtype, loc, _ = self._df._column_info[
                col].values  # type: str, int, int
            cur_loc = len(self._group_dtype_loc[dtype])
            self._group_dtype_loc[dtype].append(loc)
            self._column_info[col] = utils.Column(dtype, cur_loc, i)

        if len(columns) == 1:
            # since there is just one column, dtype is from the for-loop
            final_arr = self._df._data[dtype][:, loc]
            if dtype in 'mM':
                final_arr = final_arr.view('int64')
            dtype = final_arr.dtype.kind
            func_name = 'get_group_assignment_' + utils.convert_kind_to_dtype(
                dtype) + '_1d'
            return getattr(_gb, func_name)(final_arr)
        elif len(self._group_dtype_loc
                 ) == 1 or 'O' not in self._group_dtype_loc:
            arrs = []
            for dtype, locs in self._group_dtype_loc.items():
                arr = self._df._data[dtype][:, locs]
                if dtype in 'mM':
                    arr = arr.view('int64')
                arrs.append(arr)
            if len(arrs) == 1:
                final_arr = arrs[0]
            else:
                final_arr = np.column_stack(arrs)

            dtype = final_arr.dtype.kind
            func_name = 'get_group_assignment_' + utils.convert_kind_to_dtype(
                dtype) + '_2d'
            final_arr = np.ascontiguousarray(final_arr)
            return getattr(_gb, func_name)(final_arr)
        else:
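            # grouping by a mix of string and numeric/date columns: build one string array and
            # one numeric array, then dispatch to a combined group-assignment routine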
            arrs = []
            for dtype, locs in self._group_dtype_loc.items():
                if dtype == 'O':
                    arr_str = self._df._data['O'][:, locs]
                else:
                    arr = self._df._data[dtype][:, locs]
                    if dtype in 'mM':
                        arr = arr.view('int64')
                    arrs.append(arr)
            if len(arrs) == 1:
                arr_numbers = arrs[0]
            else:
                arr_numbers = np.column_stack(arrs)

            dtype = arr_numbers.dtype.kind
            if arr_str.shape[1] == 1:
                arr_str = arr_str[:, 0]
            if arr_numbers.shape[1] == 1:
                arr_numbers = arr_numbers[:, 0]

            str_ndim = str(arr_str.ndim) + 'd_'
            num_ndim = str(arr_numbers.ndim) + 'd'
            dtype_str = utils.convert_kind_to_dtype(dtype) + '_'
            func_name = 'get_group_assignment_str_' + str_ndim + dtype_str + num_ndim
            arr_numbers = np.ascontiguousarray(arr_numbers)
            return getattr(_gb, func_name)(arr_str, arr_numbers)