示例#1
0
def test_demean_weighted(data):
    """Check weighted demeaning against residuals of a weighted dummy regression.

    For both the entity and the time grouping, the output of
    ``PanelData.demean`` is compared, in absolute value, with the residuals
    from a least squares projection of the weighted data on weighted group
    dummies.
    """
    x = PanelData(data.x)
    w = PanelData(data.w)
    missing = x.isnull | w.isnull
    x.drop(missing)
    w.drop(missing)

    # Each group name is paired with the index level whose codes define
    # the dummy variables used for the reference calculation.
    for group, level in (("entity", 0), ("time", 1)):
        demeaned = x.demean(group, weights=w)
        dummies = get_dummies(Categorical(get_codes(x.index)[level])).values
        sqrt_w = np.sqrt(w.values2d)
        weighted_x = sqrt_w * x.values2d
        weighted_dummies = dummies * sqrt_w
        coefs = lstsq(weighted_dummies, weighted_x, rcond=None)[0]
        resid = weighted_x - weighted_dummies @ coefs
        # Adding 1 to the absolute values keeps the relative tolerance
        # meaningful when the residuals are close to zero.
        assert_allclose(1 + np.abs(demeaned.values2d), 1 + np.abs(resid))
示例#2
0
def test_mean_weighted(data):
    """Check weighted group means against weighted dummy regression coefficients.

    The entity means are compared with a ``lstsq`` solution and the time
    means with a ``pinv`` solution of the same weighted dummy regression.
    """
    x = PanelData(data.x)
    w = PanelData(data.w)
    missing = x.isnull | w.isnull
    x.drop(missing)
    w.drop(missing)

    entity_mean = x.mean("entity", weights=w)
    entity_labels = x.index.levels[0][get_codes(x.index)[0]]
    entity_dummies = get_dummies(Categorical(entity_labels, ordered=True))
    # Select the dummy columns in the order of the rows of the computed means
    entity_dummies = entity_dummies[entity_mean.index].values
    sqrt_w = np.sqrt(w.values2d)
    weighted_x = sqrt_w * x.values2d
    expected = lstsq(entity_dummies * sqrt_w, weighted_x, rcond=None)[0]
    assert_allclose(entity_mean, expected)

    time_mean = x.mean("time", weights=w)
    time_labels = x.index.levels[1][get_codes(x.index)[1]]
    time_dummies = get_dummies(Categorical(time_labels, ordered=True))
    # Select the dummy columns in the order of the rows of the computed means
    time_dummies = time_dummies[list(time_mean.index)].values
    sqrt_w = np.sqrt(w.values2d)
    weighted_x = sqrt_w * x.values2d
    expected = pinv(time_dummies * sqrt_w) @ weighted_x
    assert_allclose(time_mean, expected)
示例#3
0
 def __init__(self, df: DataFrame):
     """
     Build a 3-d array representation of a 2-level MultiIndex DataFrame

     Parameters
     ----------
     df : DataFrame
         Frame whose columns become the items, whose second index level
         becomes the major axis and whose first index level becomes the
         minor axis
     """
     source_index = df.index
     level_codes = get_codes(source_index)
     self._items = df.columns
     self._major_axis = Index(source_index.levels[1][level_codes[1]]).unique()
     self._minor_axis = Index(source_index.levels[0][level_codes[0]]).unique()
     self._full_index = MultiIndex.from_product(
         [self._minor_axis, self._major_axis])
     # Reindex on the full (minor, major) product so that every combination
     # has a row in the stored frame
     expanded = df.reindex(self._full_index)
     expanded.index.names = df.index.names
     self._frame = expanded
     n_items = len(self._items)
     n_major = len(self._major_axis)
     # minor_axis is read through the public attribute, which is defined
     # outside of this method
     n_minor = len(self.minor_axis)
     self._shape = (n_items, n_major, n_minor)
     # Rows of the expanded frame are ordered minor-first, so reshape to
     # (items, minor, major) and then swap the last two axes
     by_item = np.asarray(expanded).copy().T
     cube = np.reshape(by_item, (n_items, n_minor, n_major))
     self._values = np.swapaxes(cube, 1, 2)
示例#4
0
    def dummies(self,
                group: str = "entity",
                drop_first: bool = False) -> DataFrame:
        """
        Generate entity or time dummies

        Parameters
        ----------
        group : {'entity', 'time'}, optional
            Type of dummies to generate
        drop_first : bool, optional
            Flag indicating that the dummy column corresponding to the first
            entity or time period should be dropped

        Returns
        -------
        DataFrame
            Dummy variables

        Raises
        ------
        ValueError
            If group is not 'entity' or 'time'
        """
        if group not in ("entity", "time"):
            # Report the accepted values and the offending input so that the
            # caller can tell what went wrong from the message alone
            raise ValueError(
                "group must be either 'entity' or 'time', "
                "got {0!r}".format(group))
        # Index level 0 is used for entity dummies and level 1 for time
        axis = 0 if group == "entity" else 1
        labels = get_codes(self._frame.index)
        levels = self._frame.index.levels
        cat = Categorical(levels[axis][labels[axis]])
        dummies = get_dummies(cat, drop_first=drop_first)
        cols = self.entities if group == "entity" else self.time
        # Keep only the columns that survived drop_first, ordered as in cols
        return dummies[[c for c in cols if c in dummies]].astype(np.float64,
                                                                 copy=False)
示例#5
0
def test_absorbing_regressors(cat, cont, interact, weights):
    """Check the regressor matrix and approximate rank of AbsorbingRegressor.

    The expected matrix stacks the categorical dummies, the continuous
    variables and any interaction matrices, scaled by the square root of the
    weights when weights are supplied. Both the sparse structure and the
    dense values must match ``areg.regressors``.
    """
    areg = AbsorbingRegressor(cat=cat,
                              cont=cont,
                              interactions=interact,
                              weights=weights)
    rank = areg.approx_rank
    expected_rank = 0

    expected = []
    for i, col in enumerate(cat):
        # Every categorical after the first contributes one fewer to the
        # expected rank than its number of distinct codes
        expected_rank += pd.Series(get_codes(cat[col].cat)).nunique() - (i > 0)
    expected.append(dummy_matrix(cat, precondition=False)[0])
    expected_rank += cont.shape[1]
    expected.append(csc_matrix(cont))
    if interact is not None:
        for inter in interact:
            interact_mat = inter.sparse
            expected_rank += interact_mat.shape[1]
            expected.append(interact_mat)
    expected = sp.hstack(expected, format="csc")
    if weights is not None:
        expected = (sp.diags(np.sqrt(weights)).dot(expected)).asformat("csc")
    actual = areg.regressors
    assert expected.shape == actual.shape
    assert_array_equal(expected.indptr, actual.indptr)
    assert_array_equal(expected.indices, actual.indices)
    # toarray() replaces the ``.A`` shorthand, which SciPy has deprecated and
    # removed from sparse matrices
    assert_allclose(expected.toarray(), actual.toarray())
    assert expected_rank == rank
示例#6
0
def category_continuous_interaction(cat: AnyPandas,
                                    cont: AnyPandas,
                                    precondition: bool = True) -> csc_matrix:
    """
    Interact a continuous variable with dummies built from categorical data

    Parameters
    ----------
    cat : {Series, DataFrame}
        Categorical data that is combined through category_product and
        whose codes select the column of each observation
    cont : {Series, DataFrame}
        Continuous variable values to use in the dummy interaction
    precondition : bool
        Flag whether dummies should be preconditioned

    Returns
    -------
    csc_matrix
        Sparse matrix of dummy interactions. When precondition is True, the
        first element returned by preconditioner is used in place of the
        raw interaction matrix.
    """
    group_codes = get_codes(category_product(cat).cat)
    values = to_numpy(cont).flat
    row_index = arange(group_codes.shape[0])
    # One entry per observation, placed in the column given by its code
    interaction = csc_matrix((values, (row_index, group_codes)))
    if precondition:
        conditioned = preconditioner(interaction)[0]
        assert isinstance(conditioned, csc_matrix)
        return conditioned
    return interaction
示例#7
0
 def hash(self) -> Tuple[Tuple[str, ...], ...]:
     """
     Sorted tuple of hashes of the components of the regressors

     Each categorical column, each continuous column and the weights
     contribute a one-element tuple holding a hex digest. Interactions
     contribute the entries of their own ``hash`` attribute.
     """
     digests: List[Tuple[str, ...]] = []
     running = hash_func()
     if self._cat is not None:
         for name in self._cat:
             codes = to_numpy(get_codes(self._cat[name].cat))
             running.update(ascontiguousarray(codes.data))
             digests.append((running.hexdigest(), ))
             # Reset so that each column is hashed independently
             running = _reset(running)
     if self._cont is not None:
         for name in self._cont:
             column = to_numpy(self._cont[name])
             running.update(ascontiguousarray(column.data))
             digests.append((running.hexdigest(), ))
             running = _reset(running)
     if self._interactions is not None:
         for term in self._interactions:
             digests.extend(term.hash)
     # The weights, when provided, are hashed with a new hasher
     if self._weights is not None:
         weight_hasher = hash_func()
         weight_hasher.update(ascontiguousarray(self._weights.data))
         digests.append((weight_hasher.hexdigest(), ))
     return tuple(sorted(digests))
示例#8
0
def test_fitted_effects_residuals(both_data_types):
    """Check fitted values, estimated effects and idiosyncratic residuals.

    A BetweenOLS model is fit and each of the three result attributes is
    compared with a directly constructed DataFrame.
    """
    mod = BetweenOLS(both_data_types.y, both_data_types.x)
    res = mod.fit(reweight=True, debiased=False)

    fitted = mod.exog.values2d @ res.params.values
    expected = pd.DataFrame(fitted,
                            index=mod.dependent.index,
                            columns=["fitted_values"])
    assert_allclose(expected, res.fitted_values)
    assert_frame_similar(res.fitted_values, expected)

    # Repeat the residuals for every row that carries the same first-level
    # label, then put them on the full index of the dependent variable
    full_index = mod.dependent.dataframe.index
    first_level_labels = full_index.levels[0][get_codes(full_index)[0]]
    effects = res.resids.copy()
    effects = effects.reindex(first_level_labels)
    effects.index = full_index
    expected = pd.DataFrame(effects)
    expected.columns = ["estimated_effects"]
    assert_allclose(expected, res.estimated_effects)
    assert_frame_similar(res.estimated_effects, expected)

    # The idiosyncratic part is what remains of the dependent variable after
    # removing the fitted values and the estimated effects
    explained = res.fitted_values.values + res.estimated_effects.values
    expected.iloc[:, 0] = mod.dependent.values2d - explained
    expected.columns = ["idiosyncratic"]
    assert_allclose(expected, res.idiosyncratic, atol=1e-8)
    assert_frame_similar(res.idiosyncratic, expected)
示例#9
0
def category_product(cats: AnyPandas) -> Series:
    """
    Combine several categorical variables into a single categorical

    The codes of each column are packed into separate bit ranges of one
    integer, so that every distinct combination of codes maps to a distinct
    value.

    Parameters
    ----------
    cats : {Series, DataFrame}
        DataFrame containing categorical variables.  If cats is a Series, cats
        is returned unmodified.

    Returns
    -------
    Series
        Categorical series built from the combined integer codes, sharing
        the index of cats

    Raises
    ------
    TypeError
        If any column of cats is not categorical
    ValueError
        If the codes require 63 or more bits in total
    """
    if isinstance(cats, Series):
        return cats

    bit_widths = []
    for name in cats:
        if not is_categorical(cats[name]):
            raise TypeError("cats must contain only categorical variables")
        column = cats[name]
        largest_code = get_codes(column.cat).max()
        # Smallest number of bits (at least 1) that can hold the largest code
        width = 1
        while largest_code >= 2**width:
            width += 1
        bit_widths.append(width)
    nobs = cats.shape[0]
    total_bits = sum(bit_widths)
    if total_bits >= 63:
        raise ValueError(
            "There are too many cats with too many states to use this method.")
    # Smallest signed integer type with enough non-sign bits for all codes
    candidates = [bits for bits in (8, 16, 32, 64) if total_bits < (bits - 1)]
    dtype_name = "int{0:d}".format(min(candidates))
    code_dtype = dtype(dtype_name)
    combined = zeros(nobs, dtype=code_dtype)
    offset = 0
    for width, name in zip(bit_widths, cats):
        column_codes = get_codes(cats[name].cat).astype(code_dtype)
        # Shift this column's codes past the bits used by earlier columns
        combined += column_codes << SCALAR_DTYPES[dtype_name](offset)
        offset += width
    return Series(Categorical(combined), index=cats.index)
示例#10
0
    def entity_ids(self) -> NDArray:
        """
        Codes of the first index level of the frame as a column vector

        Returns
        -------
        ndarray
            2d array with a single column containing the entity code of each
            row of the dataframe view
        """
        index_codes = get_codes(self._frame.index)
        entity_codes = np.asarray(index_codes[0])
        return entity_codes[:, np.newaxis]
示例#11
0
    def time_ids(self) -> NDArray:
        """
        Codes of the second index level of the frame as a column vector

        Returns
        -------
        ndarray
            2d array with a single column containing the time code of each
            row of the dataframe view
        """
        index_codes = get_codes(self._frame.index)
        time_codes = np.asarray(index_codes[1])
        return time_codes[:, np.newaxis]
示例#12
0
def absorbed_data(request):
    """Generate panel data with one additional regressor added to ``x``.

    For array input, a slice filled with ``arange`` over the last axis is
    appended along the first axis. For DataFrame input, a column named
    ``x_absorbed`` holding the codes of the first index level is added.
    NOTE(review): presumably the extra regressor is meant to be absorbed by
    effects in the models under test -- confirm against the callers.
    """
    rng = np.random.RandomState(12345)
    data = generate_data(0, request.param, ntk=(131, 4, 3), rng=rng)
    x = data.x
    if isinstance(data.x, np.ndarray):
        pattern = np.tile(np.arange(x.shape[2]), (1, x.shape[1], 1))
        data.x = np.concatenate([data.x, pattern])
    elif isinstance(data.x, pd.DataFrame):
        first_level_codes = get_codes(data.x.index)[0]
        data.x["x_absorbed"] = np.array(first_level_codes).astype(np.double)
    return data
示例#13
0
def category_interaction(cat: Series, precondition: bool = True) -> csc_matrix:
    """
    Build sparse dummy variables from the codes of a categorical

    Parameters
    ----------
    cat : Series
        Categorical series to convert to dummy variables
    precondition : bool
        Flag whether dummies should be preconditioned

    Returns
    -------
    dummies : csc_matrix
        First element returned by dummy_matrix for the column of codes
    """
    combined = category_product(cat)
    codes = get_codes(combined.cat)
    # dummy_matrix is given a 2-d input, so add a trailing axis to the codes
    result = dummy_matrix(codes[:, None], precondition=precondition)
    return result[0]
示例#14
0
def test_absorbing_regressors_hash(cat, cont, interact, weights):
    """Check AbsorbingRegressor.hash against independently computed hashes."""
    areg = AbsorbingRegressor(cat=cat,
                              cont=cont,
                              interactions=interact,
                              weights=weights)
    # One single-element tuple per categorical and per continuous column
    expected = [
        (hasher.single(to_numpy(get_codes(cat[name].cat)).data), )
        for name in cat
    ]
    expected += [(hasher.single(to_numpy(cont[name]).data), ) for name in cont]
    expected = sorted(expected)
    if interact is not None:
        for inter in interact:
            expected.extend(inter.hash)
    if weights is not None:
        expected.append((hasher.single(weights.data), ))
    expected = tuple(sorted(expected))
    assert expected == areg.hash
示例#15
0
    def hash(self):
        """
        Construct a hash that will be invariant for any permutation of
        inputs that produce the same fit when used as regressors

        Returns a sorted list with one tuple per continuous column. Each
        tuple holds the sorted hashes of all categorical columns followed by
        the hash of that continuous column.
        """
        digest = hash_func()
        cat = self.cat
        # Hash every categorical column and sort, so column order is ignored
        cat_digests = []
        for name in cat:
            codes = to_numpy(get_codes(cat[name].cat))
            digest.update(ascontiguousarray(codes.data))
            cat_digests.append(digest.hexdigest())
            digest = _reset(digest)
        cat_key = tuple(sorted(cat_digests))

        cont = self.cont
        combined = []
        for name in cont:
            values = to_numpy(cont[name])
            digest.update(ascontiguousarray(values.data))
            combined.append(cat_key + (digest.hexdigest(),))
            digest = _reset(digest)

        return sorted(combined)
示例#16
0
 def time(self) -> List[Label]:
     """List of the unique second-level (time) index labels of the frame"""
     frame_index = self._frame.index
     time_codes = get_codes(frame_index)[1]
     labels = frame_index.levels[1][time_codes]
     return list(labels.unique())
示例#17
0
 def entities(self) -> List[Label]:
     """List of the unique first-level (entity) index labels of the frame"""
     frame_index = self._frame.index
     entity_codes = get_codes(frame_index)[0]
     labels = frame_index.levels[0][entity_codes]
     return list(labels.unique())