示例#1
0
    def _check_data(self):
        cat, cont = self._cat, self._cont
        cat_nobs = getattr(cat, 'shape', (0, ))[0]
        cont_nobs = getattr(cont, 'shape', (0, ))[0]
        nobs = max(cat_nobs, cont_nobs)
        if cat is None and cont is None:
            if self._nobs is not None:
                self._cont_data = self._cat_data = IVData(None,
                                                          'none',
                                                          nobs=self._nobs)
            else:
                raise ValueError(
                    'nobs must be provided when cat and cont are None')
            return
        self._nobs = nobs

        self._cat_data = IVData(cat, 'cat', nobs=nobs, convert_dummies=False)
        self._cont_data = IVData(cont,
                                 'cont',
                                 nobs=nobs,
                                 convert_dummies=False)
        if self._cat_data.shape[1] == self._cont_data.shape[1] == 0:
            raise ValueError('Both cat and cont are empty arrays')
        cat_data = self._cat_data.pandas
        convert = [
            col for col in cat_data if not (is_categorical(cat_data[col]))
        ]
        if convert:
            cat_data = DataFrame(
                {col: cat_data[col].astype('category')
                 for col in cat_data})
            self._cat_data = IVData(cat_data, 'cat', convert_dummies=False)
示例#2
0
def convert_columns(s, drop_first):
    if is_string_dtype(s.dtype) and s.map(lambda v: is_string_like(v)).all():
        s = s.astype('category')

    if is_categorical(s):
        out = pd.get_dummies(s, drop_first=drop_first)
        out.columns = [str(s.name) + '.' + str(c) for c in out]
        return out
    return s
示例#3
0
def category_product(cats: AnyPandas) -> Series:
    """
    Construct category from all combination of input categories

    Parameters
    ----------
    cats : {Series, DataFrame}
        DataFrame containing categorical variables.  If cats is a Series, cats
        is returned unmodified.

    Returns
    -------
    cp : Series
        Categorical series containing the cartesian product of the categories
        in cats
    """
    if isinstance(cats, Series):
        return cats

    sizes = []
    for c in cats:
        if not is_categorical(cats[c]):
            raise TypeError('cats must contain only categorical variables')
        col = cats[c]
        max_code = get_codes(col.cat).max()
        size = 1
        while max_code >= 2**size:
            size += 1
        sizes.append(size)
    nobs = cats.shape[0]
    total_size = sum(sizes)
    if total_size >= 63:
        raise ValueError(
            'There are too many cats with too many states to use this method.')
    dtype_size = min(filter(lambda v: total_size < (v - 1), (8, 16, 32, 64)))
    dtype_str = 'int{0:d}'.format(dtype_size)
    dtype_val = dtype(dtype_str)
    codes = zeros(nobs, dtype=dtype_val)
    cum_size = 0
    for i, col in enumerate(cats):
        codes += (get_codes(cats[col].cat).astype(dtype_val) <<
                  SCALAR_DTYPES[dtype_str](cum_size))
        cum_size += sizes[i]
    return Series(Categorical(codes), index=cats.index)
示例#4
0
    def from_frame(frame: DataFrame) -> 'Interaction':
        """
        Convenience function the simplifies using a DataFrame

        Parameters
        ----------
        frame : DataFrame
            Frame containing categorical and continuous variables. All
            categorical variables are passed to `cat` and all other
            variables are passed as `cont`.

        Returns
        -------
        interaction : Interaction
            Instance using the columns of frame

        Examples
        --------
        >>> import numpy as np
        >>> from linearmodels.iv.absorbing import Interaction
        >>> import pandas as pd
        >>> rs = np.random.RandomState(0)
        >>> n = 100000
        >>> cats = pd.concat([pd.Series(pd.Categorical(rs.randint(i+2,size=n)))
        ...                  for i in range(4)],1)
        >>> cats.columns = ['cat{0}'.format(i) for i in range(4)]
        >>> columns = ['cont{0}'.format(i) for i in range(6)]
        >>> cont = pd.DataFrame(rs.standard_normal((n, 6)), columns=columns)
        >>> frame = pd.concat([cats, cont], 1)
        >>> interact = Interaction.from_frame(frame)
        >>> interact.sparse.shape # Cart product of all cats, 5!, times ncont, 6
        (100000, 720)
        """
        cat_cols = [col for col in frame if is_categorical(frame[col])]
        cont_cols = [col for col in frame if col not in cat_cols]
        return Interaction(frame[cat_cols],
                           frame[cont_cols],
                           nobs=frame.shape[0])
示例#5
0
def convert_columns(s, drop_first):
    if is_categorical(s):
        out = pd.get_dummies(s, drop_first=drop_first)
        out.columns = [str(s.name) + '.' + str(c) for c in out]
        return out
    return s