def _check_data(self):
    """
    Validate the categorical and continuous inputs and wrap them in IVData

    Reads ``self._cat``, ``self._cont`` and ``self._nobs`` and sets
    ``self._cat_data`` and ``self._cont_data``. Unless both inputs are None,
    ``self._nobs`` is overwritten with the larger leading dimension of the
    two inputs.

    Raises
    ------
    ValueError
        If both inputs are None and ``self._nobs`` is also None, or if the
        wrapped inputs both have zero columns.
    """
    cat, cont = self._cat, self._cont

    if cat is None and cont is None:
        # Nothing to wrap, so the observation count must already be known
        if self._nobs is None:
            raise ValueError(
                'nobs must be provided when cat and cont are None')
        placeholder = IVData(None, 'none', nobs=self._nobs)
        self._cont_data = placeholder
        self._cat_data = placeholder
        return

    # An input without a ``shape`` attribute (e.g. None) counts as 0 rows
    nobs = max(getattr(cat, 'shape', (0, ))[0],
               getattr(cont, 'shape', (0, ))[0])
    self._nobs = nobs
    self._cat_data = IVData(cat, 'cat', nobs=nobs, convert_dummies=False)
    self._cont_data = IVData(cont, 'cont', nobs=nobs, convert_dummies=False)

    ncat = self._cat_data.shape[1]
    ncont = self._cont_data.shape[1]
    if ncat == 0 and ncont == 0:
        raise ValueError('Both cat and cont are empty arrays')

    cat_frame = self._cat_data.pandas
    if all(is_categorical(cat_frame[name]) for name in cat_frame):
        return

    # At least one column is not categorical: cast every column and rewrap
    cat_frame = DataFrame(
        {name: cat_frame[name].astype('category') for name in cat_frame})
    self._cat_data = IVData(cat_frame, 'cat', convert_dummies=False)
def convert_columns(s, drop_first):
    """
    Expand a string or categorical Series into dummy-variable columns

    Parameters
    ----------
    s : Series
        Column to inspect. If it has a string dtype and every element
        passes ``is_string_like``, it is first cast to a categorical.
    drop_first : bool
        Passed through to ``pd.get_dummies``.

    Returns
    -------
    {Series, DataFrame}
        Dummy columns named ``<series name>.<level>`` when ``s`` is (or was
        cast to) a categorical; otherwise ``s`` itself.
    """
    holds_strings = is_string_dtype(s.dtype) and s.map(is_string_like).all()
    if holds_strings:
        s = s.astype('category')

    if not is_categorical(s):
        return s

    dummies = pd.get_dummies(s, drop_first=drop_first)
    prefix = str(s.name) + '.'
    dummies.columns = [prefix + str(level) for level in dummies]
    return dummies
def category_product(cats: AnyPandas) -> Series:
    """
    Combine several categorical variables into a single categorical

    Parameters
    ----------
    cats : {Series, DataFrame}
        Categorical variables to combine. A Series is returned unmodified.

    Returns
    -------
    cp : Series
        Categorical Series sharing the index of ``cats``, built from one
        integer per row that packs together the codes of every column.

    Raises
    ------
    TypeError
        If any column of ``cats`` is not categorical.
    ValueError
        If the packed codes would require 63 or more bits.

    Notes
    -----
    Each column is given a bit field just wide enough to hold its largest
    code (minimum one bit). The fields are placed side by side, with the
    first column occupying the lowest bits.
    """
    if isinstance(cats, Series):
        return cats

    # Bits reserved for each column, in column order
    widths = []
    for name in cats:
        column = cats[name]
        if not is_categorical(column):
            raise TypeError('cats must contain only categorical variables')
        largest = get_codes(column.cat).max()
        bits = 1
        while largest >= 2 ** bits:
            bits += 1
        widths.append(bits)

    total_bits = sum(widths)
    if total_bits >= 63:
        raise ValueError(
            'There are too many cats with too many states to use this method.')

    # Candidates are ascending, so the first match is the narrowest type
    dtype_size = next(
        width for width in (8, 16, 32, 64) if total_bits < (width - 1))
    dtype_str = 'int{0:d}'.format(dtype_size)
    dtype_val = dtype(dtype_str)

    packed = zeros(cats.shape[0], dtype=dtype_val)
    offset = 0
    for name, bits in zip(cats, widths):
        field = get_codes(cats[name].cat).astype(dtype_val)
        # The shift amount is cast to the same scalar type as the codes
        packed += field << SCALAR_DTYPES[dtype_str](offset)
        offset += bits

    return Series(Categorical(packed), index=cats.index)
def from_frame(frame: DataFrame) -> 'Interaction':
    """
    Convenience constructor that builds an Interaction from a DataFrame

    Parameters
    ----------
    frame : DataFrame
        Frame containing categorical and continuous variables. Every
        categorical column is passed as `cat`; every other column is
        passed as `cont`.

    Returns
    -------
    interaction : Interaction
        Instance using the columns of frame

    Examples
    --------
    >>> import numpy as np
    >>> from linearmodels.iv.absorbing import Interaction
    >>> import pandas as pd
    >>> rs = np.random.RandomState(0)
    >>> n = 100000
    >>> cats = pd.concat([pd.Series(pd.Categorical(rs.randint(i+2,size=n)))
    ...                   for i in range(4)], axis=1)
    >>> cats.columns = ['cat{0}'.format(i) for i in range(4)]
    >>> columns = ['cont{0}'.format(i) for i in range(6)]
    >>> cont = pd.DataFrame(rs.standard_normal((n, 6)), columns=columns)
    >>> frame = pd.concat([cats, cont], axis=1)
    >>> interact = Interaction.from_frame(frame)
    >>> interact.sparse.shape # Cart product of all cats, 5!, times ncont, 6
    (100000, 720)
    """
    cat_cols = []
    cont_cols = []
    # Single pass: route each column label to the matching list
    for name in frame:
        bucket = cat_cols if is_categorical(frame[name]) else cont_cols
        bucket.append(name)
    return Interaction(frame[cat_cols], frame[cont_cols],
                       nobs=frame.shape[0])
def convert_columns(s, drop_first):
    """
    Expand a categorical Series into dummy-variable columns

    Parameters
    ----------
    s : Series
        Column to inspect.
    drop_first : bool
        Passed through to ``pd.get_dummies``.

    Returns
    -------
    {Series, DataFrame}
        Dummy columns named ``<series name>.<level>`` when ``s`` is
        categorical; otherwise ``s`` itself, unchanged.
    """
    if not is_categorical(s):
        return s

    dummies = pd.get_dummies(s, drop_first=drop_first)
    prefix = str(s.name) + '.'
    dummies.columns = [prefix + str(level) for level in dummies]
    return dummies