Пример #1
0
def convert_columns(s, drop_first):
    """
    Expand a string or categorical Series into dummy columns.

    Parameters
    ----------
    s : Series
        Column to inspect.  A Series with a string dtype whose values all
        satisfy ``is_string_like`` is first cast to ``'category'``.
    drop_first : bool
        Forwarded to ``pd.get_dummies``.

    Returns
    -------
    Series or DataFrame
        The dummy-variable frame, with columns named
        ``'<series name>.<level>'``, when ``s`` is categorical (possibly
        after the cast above); otherwise ``s`` itself.
    """
    all_strings = is_string_dtype(s.dtype) and s.map(is_string_like).all()
    if all_strings:
        s = s.astype('category')

    # Anything that is not categorical passes through untouched
    if not is_categorical(s):
        return s

    dummies = pd.get_dummies(s, drop_first=drop_first)
    prefix = str(s.name)
    dummies.columns = ['.'.join((prefix, str(level))) for level in dummies]
    return dummies
Пример #2
0
    def __init__(self, x, var_name='x', nobs=None, convert_dummies=True, drop_first=True):
        """
        Store ``x`` as both a float64 ndarray and a labelled DataFrame.

        Parameters
        ----------
        x : {ndarray, Series, DataFrame, DataArray, IVData, None}
            Data to wrap.  An ``IVData`` instance is deep-copied.  ``None``
            is replaced by an empty ``(nobs, 0)`` array.  Inputs must have
            at most 2 dimensions.
        var_name : str
            Base name used to generate column labels when the input does
            not supply usable ones (ndarray columns, an unnamed Series, or
            a DataArray whose second coordinate is numeric).
        nobs : int, optional
            Expected number of rows.  Required when ``x`` is ``None``;
            when given, the stored array must have exactly this many rows.
        convert_dummies : bool
            If true, a pandas input is passed through
            ``expand_categoricals`` (presumably replacing categorical
            columns with dummy variables -- helper defined elsewhere).
        drop_first : bool
            Forwarded to ``expand_categoricals``.

        Raises
        ------
        ValueError
            If ``x`` is ``None`` and ``nobs`` is not given, ``x`` has more
            than 2 dimensions, a DataFrame has duplicate column names, a
            column is neither numeric, string nor categorical, or the
            number of rows differs from ``nobs``.
        TypeError
            If ``x`` is not one of the supported container types.
        """
        if isinstance(x, IVData):
            # Copy constructor: clone the other instance's state wholesale
            self.__dict__.update(copy.deepcopy(x.__dict__))
            return
        if x is None:
            if nobs is None:
                raise ValueError('nobs required when x is None')
            x = np.empty((nobs, 0))

        self.original = x
        # Unsupported containers (lists, scalars, ...) have no ``ndim``;
        # report them with the intended TypeError rather than letting an
        # AttributeError escape.
        xndim = getattr(x, 'ndim', None)
        if xndim is None:
            raise TypeError(type_err)
        if xndim > 2:
            raise ValueError(dim_err.format(var_name, xndim))

        if isinstance(x, np.ndarray):
            # astype returns a new array, so the caller's data is untouched
            x = x.astype(dtype=np.float64)
            if xndim == 1:
                # Explicit single column; using -1 here cannot be inferred
                # for an empty array and would raise
                x = x.reshape((x.shape[0], 1))

            # Separate copy so the array and the DataFrame built from ``x``
            # below do not share a buffer
            self._ndarray = x.astype(np.float64)
            index = list(range(x.shape[0]))
            if x.shape[1] == 1:
                cols = [var_name]
            else:
                cols = [var_name + '.{0}'.format(i) for i in range(x.shape[1])]
            self._pandas = pd.DataFrame(x, index=index, columns=cols)
            self._labels = {0: index, 1: cols}

        elif isinstance(x, (pd.Series, pd.DataFrame)):
            if isinstance(x, pd.Series):
                # Falsy names (None, '', 0) fall back to var_name
                name = var_name if not x.name else x.name
                x = pd.DataFrame({name: x})
            columns = list(x.columns)
            if len(set(columns)) != len(columns):
                raise ValueError('DataFrame contains duplicate column names. '
                                 'All column names must be distinct')

            copied = False
            all_numeric = True
            for col in x:
                c = x[col]
                if is_string_dtype(c.dtype) and c.map(is_string_like).all():
                    c = c.astype('category')
                    if not copied:
                        # Copy lazily, once, so the caller's frame is never
                        # modified in place
                        x = x.copy()
                        copied = True
                    x[col] = c
                dt = c.dtype
                numeric = is_numeric_dtype(dt)
                all_numeric = all_numeric and numeric
                if not (numeric or is_categorical_dtype(dt)):
                    raise ValueError('Only numeric, string  or categorical '
                                     'data permitted')

            if convert_dummies:
                x = expand_categoricals(x, drop_first)

            self._pandas = x
            self._ndarray = self._pandas.values
            # Categorical columns left unexpanded cannot be cast to float
            if all_numeric or convert_dummies:
                self._ndarray = self._ndarray.astype(np.float64)
            self._labels = {axis: list(labels)
                            for axis, labels in enumerate(x.axes)}

        else:
            # xarray is optional; without it nothing else can match
            try:
                import xarray as xr
            except ImportError:
                raise TypeError(type_err)
            if not isinstance(x, xr.DataArray):
                raise TypeError(type_err)

            if x.ndim == 1:
                # Promote to 2-D with var_name as the column dimension
                x = xr.concat([x], dim=var_name).transpose()

            index = list(x.coords[x.dims[0]].values)
            cols = x.coords[x.dims[1]].values
            if is_numeric_dtype(cols.dtype):
                # Numeric coordinates are replaced by generated names
                cols = [var_name + '.{0}'.format(i) for i in range(x.shape[1])]
            cols = list(cols)
            self._ndarray = x.values.astype(np.float64)
            self._pandas = pd.DataFrame(self._ndarray, columns=cols,
                                        index=index)
            self._labels = {0: index, 1: cols}

        if nobs is not None and self._ndarray.shape[0] != nobs:
            msg = 'Array required to have {nobs} obs, has ' \
                  '{act}'.format(nobs=nobs, act=self._ndarray.shape[0])
            raise ValueError(msg)