def test_string_nonconversion():
    """Check that string columns survive when ``convert_dummies=False``.

    A column of random string labels and a column of random floats are
    passed to ``PanelData`` with dummy conversion disabled.  The test then
    verifies that column ``'a'`` still has a string dtype and that its
    values are unchanged, in entity-major order.
    """
    t, n = 3, 1000
    string = np.random.choice(['a', 'b', 'c'], (t, n))
    num = np.random.randn(t, n)

    # ``pd.Panel`` was removed from pandas, so the input is built directly as
    # a long-format frame.  The integer labels mirror the default axes that a
    # Panel built from bare ndarrays would have carried.
    # NOTE(review): assumes PanelData accepts a DataFrame indexed by a
    # 2-level (entity, time) MultiIndex -- confirm against PanelData.
    index = pd.MultiIndex.from_product([np.arange(n), np.arange(t)])
    # ``.T.ravel()`` flattens the (t, n) arrays entity-by-entity, matching
    # the row order produced by ``from_product`` above.
    p = pd.DataFrame({'a': string.T.ravel(), 'b': num.T.ravel()},
                     index=index, columns=['a', 'b'])

    panel = PanelData(p, var_name='OtherEffect', convert_dummies=False)
    assert is_string_dtype(panel.dataframe['a'].dtype)
    assert np.all(panel.dataframe['a'] == string.T.ravel())
def convert_columns(s, drop_first):
    """Expand a string or categorical Series into dummy-variable columns.

    Parameters
    ----------
    s : Series
        Column to inspect.  A Series with a string dtype whose values are all
        string-like is first converted to the ``'category'`` dtype.
    drop_first : bool
        Forwarded to ``pd.get_dummies``; when True the first category is
        dropped from the output.

    Returns
    -------
    Series or DataFrame
        ``s`` itself when it is not categorical (after the optional string
        conversion).  Otherwise a DataFrame of dummies whose columns are
        named ``'<series name>.<category>'``.
    """
    if is_string_dtype(s.dtype) and s.map(is_string_like).all():
        s = s.astype('category')

    # ``is_categorical`` has been removed from pandas; checking the dtype
    # class directly is equivalent here and works across pandas versions.
    if not isinstance(s.dtype, pd.api.types.CategoricalDtype):
        return s

    out = pd.get_dummies(s, drop_first=drop_first)
    # Iterating a DataFrame yields its column labels (the categories).
    out.columns = [str(s.name) + '.' + str(c) for c in out]
    return out
def test_string_nonconversion():
    """String data must be left as strings when ``convert_dummies=False``.

    Builds a frame via ``panel_to_frame`` with a string column ``'a'`` and a
    float column ``'b'``, wraps it in ``PanelData`` with dummy conversion
    disabled, and checks that ``'a'`` keeps a string dtype and its values.
    """
    n_periods, n_entities = 3, 1000
    dims = (n_periods, n_entities)
    # Draw order (choice, then randn) is kept so seeded runs are unaffected.
    labels = np.random.choice(['a', 'b', 'c'], dims)
    floats = np.random.randn(*dims)

    dates = date_range('1-1-2000', periods=n_periods)
    entity_names = list(map('entity.{0}'.format, range(n_entities)))

    frame = panel_to_frame(None,
                           items=['a', 'b'],
                           major_axis=dates,
                           minor_axis=entity_names,
                           swap=True)
    # Transpose before flattening so values are laid out entity-by-entity.
    frame['a'] = labels.T.ravel()
    frame['b'] = floats.T.ravel()

    wrapped = PanelData(frame, var_name='OtherEffect', convert_dummies=False)
    column_a = wrapped.dataframe['a']
    assert is_string_dtype(column_a.dtype)
    expected = labels.T.ravel()
    assert np.all(wrapped.dataframe['a'] == expected)
def __init__(self, x, var_name='x', nobs=None, convert_dummies=True, drop_first=True):
    """
    Store array-like input as both an ndarray and a DataFrame.

    Parameters
    ----------
    x : {IVData, ndarray, Series, DataFrame, DataArray, None}
        Data to wrap.  An ``IVData`` instance is deep-copied.  ``None`` is
        only accepted together with ``nobs`` and yields an empty
        ``(nobs, 0)`` array.
    var_name : str
        Name used for columns when ``x`` carries no usable labels (ndarray
        input, unnamed Series, or DataArray with numeric column coords).
    nobs : int, optional
        Required number of rows.  Checked after conversion when given.
    convert_dummies : bool
        When True, DataFrame input is passed through
        ``expand_categoricals``.
    drop_first : bool
        Forwarded to ``expand_categoricals``.

    Raises
    ------
    ValueError
        If ``x`` is None without ``nobs``, has more than 2 dimensions, has
        duplicate column names, contains columns that are neither numeric,
        string nor categorical, or has a row count different from ``nobs``.
    TypeError
        If ``x`` is not one of the supported types, or xarray cannot be
        imported when it would be needed to recognise ``x``.

    Notes
    -----
    Sets ``self.original``, ``self._ndarray``, ``self._pandas`` and
    ``self._labels``.
    """
    if isinstance(x, IVData):
        # Copy-construct: take over a deep copy of the other instance's state.
        self.__dict__.update(copy.deepcopy(x.__dict__))
        return

    if x is None and nobs is not None:
        x = np.empty((nobs, 0))
    elif x is None:
        raise ValueError('nobs required when x is None')

    self.original = x

    # Objects without ``ndim`` (e.g. plain lists) must fall through to the
    # TypeError raised in the final branch rather than fail here with an
    # AttributeError.
    xndim = getattr(x, 'ndim', None)
    if xndim is not None and xndim > 2:
        raise ValueError(dim_err.format(var_name, xndim))

    if isinstance(x, np.ndarray):
        # astype copies, so the reshape below never touches the caller's array.
        x = x.astype(dtype=np.float64)
        if xndim == 1:
            # Promote to a single column; explicit (n, 1) also works when n == 0.
            x = x.reshape((x.shape[0], 1))

        # A second copy keeps ``_ndarray`` a distinct array object from the
        # one handed to the DataFrame constructor below.
        self._ndarray = x.astype(np.float64)
        index = list(range(x.shape[0]))
        if x.shape[1] == 1:
            cols = [var_name]
        else:
            cols = [var_name + '.{0}'.format(i) for i in range(x.shape[1])]
        self._pandas = pd.DataFrame(x, index=index, columns=cols)
        self._labels = {0: index, 1: cols}

    elif isinstance(x, (pd.Series, pd.DataFrame)):
        if isinstance(x, pd.Series):
            name = var_name if not x.name else x.name
            x = pd.DataFrame({name: x})

        columns = list(x.columns)
        if len(set(columns)) != len(columns):
            raise ValueError('DataFrame contains duplicate column names. '
                             'All column names must be distinct')

        copied = False
        all_numeric = True
        for col in columns:
            c = x[col]
            if is_string_dtype(c.dtype) and c.map(is_string_like).all():
                c = c.astype('category')
                # Copy lazily, and only once, before writing the converted
                # column so the frame bound to ``x`` on entry is not modified.
                if not copied:
                    x = x.copy()
                    copied = True
                x[col] = c
            dt = c.dtype
            all_numeric = all_numeric and is_numeric_dtype(dt)
            if not (is_numeric_dtype(dt) or is_categorical_dtype(dt)):
                raise ValueError('Only numeric, string or categorical '
                                 'data permitted')

        if convert_dummies:
            x = expand_categoricals(x, drop_first)

        self._pandas = x
        self._ndarray = self._pandas.values
        # Only cast to float when every column is known to be numeric,
        # either originally or after dummy expansion.
        if all_numeric or convert_dummies:
            self._ndarray = self._ndarray.astype(np.float64)
        self._labels = {i: list(label) for i, label in enumerate(x.axes)}

    else:
        # xarray is optional; it is imported only when no other type matched.
        try:
            import xarray as xr
        except ImportError:
            raise TypeError(type_err)

        if isinstance(x, xr.DataArray):
            if x.ndim == 1:
                x = xr.concat([x], dim=var_name).transpose()

            index = list(x.coords[x.dims[0]].values)
            cols = x.coords[x.dims[1]].values
            if is_numeric_dtype(cols.dtype):
                # Numeric column coordinates are replaced by generated names.
                cols = [var_name + '.{0}'.format(i) for i in range(x.shape[1])]
            cols = list(cols)
            self._ndarray = x.values.astype(np.float64)
            self._pandas = pd.DataFrame(self._ndarray, columns=cols, index=index)
            self._labels = {0: index, 1: cols}
        else:
            raise TypeError(type_err)

    if nobs is not None:
        if self._ndarray.shape[0] != nobs:
            msg = 'Array required to have {nobs} obs, has ' \
                  '{act}'.format(nobs=nobs, act=self._ndarray.shape[0])
            raise ValueError(msg)