def test_panel_to_midf():
    x = np.random.standard_normal((3, 7, 100))
    df = panel_to_frame(x, list(range(3)), list(range(7)), list(range(100)))
    mi = pd.MultiIndex.from_product([list(range(7)), list(range(100))])
    expected = pd.DataFrame(index=mi, columns=[0, 1, 2])
    for i in range(3):
        expected[i] = x[i].ravel()
    expected.index.names = ["major", "minor"]
    pd.testing.assert_frame_equal(df, expected)

    expected2 = expected.copy()
    expected2 = expected2.sort_index(level=[1, 0])
    expected2.index = expected2.index.swaplevel(0, 1)
    expected2.index.names = ["major", "minor"]
    df2 = panel_to_frame(x, list(range(3)), list(range(7)), list(range(100)), True)
    pd.testing.assert_frame_equal(df2, expected2)

    entities = list(
        map(
            "".join,
            [
                [random.choice(string.ascii_lowercase) for __ in range(10)]
                for _ in range(100)
            ],
        )
    )
    times = pd.date_range("1999-12-31", freq="A-DEC", periods=7)
    var_names = ["x.{0}".format(i) for i in range(1, 4)]
    df3 = panel_to_frame(x, var_names, times, entities, True)
    mi = pd.MultiIndex.from_product([times, entities])
    expected3 = pd.DataFrame(index=mi, columns=var_names)
    for i in range(1, 4):
        expected3["x.{0}".format(i)] = x[i - 1].ravel()
    expected3.index = expected3.index.swaplevel(0, 1)
    mi = pd.MultiIndex.from_product([entities, times])
    expected3 = expected3.loc[mi]
    expected3.index.names = ["major", "minor"]
    pd.testing.assert_frame_equal(df3, expected3)
def test_string_conversion():
    t, n = 3, 1000
    string = np.random.choice(["a", "b", "c"], (t, n))
    num = np.random.randn(t, n)
    time = date_range("1-1-2000", periods=t)
    entities = ["entity.{0}".format(i) for i in range(n)]
    p = panel_to_frame(
        None, items=["a", "b"], major_axis=time, minor_axis=entities, swap=True
    )
    p["a"] = string.T.ravel()
    p["b"] = num.T.ravel()
    p = p[["a", "b"]]
    panel = PanelData(p, var_name="OtherEffect")
    df = panel.dataframe
    assert df.shape == (3000, 3)
    s = string.T.ravel()
    a_locs = np.where(s == "a")
    b_locs = np.where(s == "b")
    c_locs = np.where(s == "c")
    assert np.all(df.loc[:, "a.b"].values[a_locs] == 0.0)
    assert np.all(df.loc[:, "a.b"].values[b_locs] == 1.0)
    assert np.all(df.loc[:, "a.b"].values[c_locs] == 0.0)

    assert np.all(df.loc[:, "a.c"].values[a_locs] == 0.0)
    assert np.all(df.loc[:, "a.c"].values[b_locs] == 0.0)
    assert np.all(df.loc[:, "a.c"].values[c_locs] == 1.0)
def mi_df():
    np.random.seed(12345)
    n, t, k = 11, 7, 3
    x = np.random.standard_normal((k, t, n))
    major = date_range("12-31-1999", periods=7)
    items = ["var.{0}".format(i) for i in range(1, k + 1)]
    minor = ["entities.{0}".format(i) for i in range(1, n + 1)]
    return panel_to_frame(x, items, major, minor, swap=True)
def test_panel_to_midf():
    x = np.random.standard_normal((3, 7, 100))
    expected = pd.Panel(x).to_frame()
    df = panel_to_frame(x, list(range(3)), list(range(7)), list(range(100)))
    pd.testing.assert_frame_equal(df, expected)

    expected = pd.Panel(x).swapaxes(1, 2).to_frame(filter_observations=False)
    df = panel_to_frame(x, list(range(3)), list(range(7)), list(range(100)), True)
    pd.testing.assert_frame_equal(df, expected)

    entities = list(
        map(
            ''.join,
            [
                [random.choice(string.ascii_lowercase) for __ in range(10)]
                for _ in range(100)
            ],
        )
    )
    times = pd.date_range('1999-12-31', freq='A-DEC', periods=7)
    var_names = ['x.{0}'.format(i) for i in range(1, 4)]
    expected = pd.Panel(x, items=var_names, major_axis=times, minor_axis=entities)
    expected = expected.swapaxes(1, 2).to_frame(filter_observations=False)
    df = panel_to_frame(x, var_names, times, entities, True)
    pd.testing.assert_frame_equal(df, expected)
def test_pandas_multiindex_dataframe():
    n, t, k = 11, 7, 3
    x = np.random.random((n, t, k))
    major = date_range('12-31-1999', periods=7)
    minor = ['var.{0}'.format(i) for i in range(1, k + 1)]
    items = ['item.{0}'.format(i) for i in range(1, n + 1)]
    x = panel_to_frame(x, items=items, major_axis=major, minor_axis=minor, swap=True)
    PanelData(x)
def test_incorrect_time_axis():
    x = np.random.randn(3, 3, 1000)
    entities = ['entity.{0}'.format(i) for i in range(1000)]
    time = ['time.{0}'.format(i) for i in range(3)]
    var_names = ['var.{0}'.format(i) for i in range(3)]
    p = panel_to_frame(x, items=var_names, major_axis=time, minor_axis=entities,
                       swap=True)
    with pytest.raises(ValueError):
        PanelData(p)

    time = [1, 2, 3]
    var_names = ['var.{0}'.format(i) for i in range(3)]
    p = panel_to_frame(x, items=var_names, major_axis=time, minor_axis=entities,
                       swap=True)
    p.index = p.index.set_levels([1, datetime(1960, 1, 1), 'a'], 1)
    with pytest.raises(ValueError):
        PanelData(p)
def test_existing_panel_data():
    n, t, k = 11, 7, 3
    x = np.random.random((k, t, n))
    major = date_range('12-31-1999', periods=7)
    items = ['var.{0}'.format(i) for i in range(1, k + 1)]
    minor = ['entities.{0}'.format(i) for i in range(1, n + 1)]
    x = panel_to_frame(x, items=items, major_axis=major, minor_axis=minor, swap=True)
    dh = PanelData(x)
    dh2 = PanelData(dh)
    assert_frame_equal(dh.dataframe, dh2.dataframe)
def first_difference(self):
    """
    Compute first differences of variables

    Returns
    -------
    diffs : PanelData
        Differenced values
    """
    diffs = self.panel.values
    diffs = diffs[:, 1:] - diffs[:, :-1]
    diffs = panel_to_frame(diffs, self.panel.items, self.panel.major_axis[1:],
                           self.panel.minor_axis, True)
    diffs = diffs.reindex(self._frame.index).dropna(how='any')
    return PanelData(diffs)
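# A minimal sketch of the differencing step used above, assuming only NumPy
# and a small made-up cube. Shapes follow the (nvar, ntime, nentity) layout
# used by PanelData.values3d: differencing along axis 1 drops the first period.
import numpy as np

values = np.arange(24, dtype=float).reshape((2, 4, 3))  # 2 vars, 4 periods, 3 entities
diffs = values[:, 1:] - values[:, :-1]  # first difference along the time axis
assert diffs.shape == (2, 3, 3)  # one time period is lost
np.testing.assert_allclose(diffs, np.diff(values, axis=1))  # same as np.diff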
def test_string_nonconversion():
    t, n = 3, 1000
    string = np.random.choice(["a", "b", "c"], (t, n))
    num = np.random.randn(t, n)
    time = date_range("1-1-2000", periods=t)
    entities = ["entity.{0}".format(i) for i in range(n)]
    p = panel_to_frame(
        None, items=["a", "b"], major_axis=time, minor_axis=entities, swap=True
    )
    p["a"] = string.T.ravel()
    p["b"] = num.T.ravel()
    panel = PanelData(p, var_name="OtherEffect", convert_dummies=False)
    assert is_string_dtype(panel.dataframe["a"].dtype)
    assert np.all(panel.dataframe["a"] == string.T.ravel())
def test_demean_both_large_t():
    x = np.random.standard_normal((1, 100, 10))
    time = date_range("1-1-2000", periods=100)
    entities = ["entity.{0}".format(i) for i in range(10)]
    data = panel_to_frame(x, ["x"], time, entities, swap=True)
    data = PanelData(data)
    demeaned = data.demean("both")
    df = data.dataframe
    no_index = df.reset_index()
    cat = Categorical(no_index[df.index.levels[0].name])
    d1 = get_dummies(cat, drop_first=False).astype(np.float64)
    cat = Categorical(no_index[df.index.levels[1].name])
    d2 = get_dummies(cat, drop_first=True).astype(np.float64)
    d = np.c_[d1.values, d2.values]
    dummy_demeaned = df.values - d @ pinv(d) @ df.values
    assert_allclose(1 + np.abs(demeaned.values2d), 1 + np.abs(dummy_demeaned))
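# A self-contained sketch of the dummy-variable benchmark the test above
# relies on, assuming only NumPy and small made-up dimensions: for a balanced
# panel, two-way demeaning equals the residuals from projecting onto entity
# dummies plus time dummies (with one period dropped to avoid collinearity).
import numpy as np

rs = np.random.RandomState(0)
n, t = 5, 4
y = rs.standard_normal((n * t, 1))  # stacked (entity, time) observations
entity = np.repeat(np.arange(n), t)
time = np.tile(np.arange(t), n)
d_entity = (entity[:, None] == np.arange(n)).astype(float)
d_time = (time[:, None] == np.arange(1, t)).astype(float)  # drop first period
d = np.c_[d_entity, d_time]
resid = y - d @ np.linalg.pinv(d) @ y  # annihilate both sets of dummies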
def test_numpy_3d():
    n, t, k = 11, 7, 3
    x = np.random.random((k, t, n))
    dh = PanelData(x)
    assert_equal(x, dh.values3d)
    assert dh.nentity == n
    assert dh.nobs == t
    assert dh.nvar == k
    assert_equal(np.reshape(x.T, (n * t, k)), dh.values2d)
    items = ['entity.{0}'.format(i) for i in range(n)]
    obs = [i for i in range(t)]
    var_names = ['x.{0}'.format(i) for i in range(k)]
    expected_frame = panel_to_frame(np.reshape(x, (k, t, n)), items=var_names,
                                    major_axis=obs, minor_axis=items, swap=True)
    expected_frame.index.levels[0].name = 'entity'
    expected_frame.index.levels[1].name = 'time'
    assert_frame_equal(dh.dataframe, expected_frame)
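# A tiny check of the reshape identity the test above relies on, assuming only
# NumPy: for a (k, t, n) cube, x.T has shape (n, t, k), so reshaping it to
# (n * t, k) stacks observations entity by entity with time varying fastest.
import numpy as np

k, t, n = 2, 3, 4
x = np.arange(k * t * n).reshape((k, t, n))
stacked = np.reshape(x.T, (n * t, k))
# Row (i * t + s) holds the variable values for entity i at time s.
assert stacked[1 * t + 2, 0] == x[0, 2, 1]
assert stacked[3 * t + 0, 1] == x[1, 0, 3]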
def __init__(self, x, var_name='x', convert_dummies=True, drop_first=True,
             copy=True):
    self._var_name = var_name
    self._convert_dummies = convert_dummies
    self._drop_first = drop_first
    self._panel = None
    self._shape = None
    index_names = ['entity', 'time']
    if isinstance(x, PanelData):
        x = x.dataframe
    self._original = x

    if not isinstance(x, (Series, DataFrame, Panel, np.ndarray)):
        try:
            from xarray import DataArray
            if isinstance(x, DataArray):
                if x.ndim not in (2, 3):
                    raise ValueError('Only 2-d or 3-d DataArrays are supported')
                if x.ndim == 2:
                    x = x.to_pandas()
                else:
                    items = x.coords[x.dims[0]].values.tolist()
                    major = x.coords[x.dims[1]].values.tolist()
                    minor = x.coords[x.dims[2]].values.tolist()
                    values = x.values
                    x = panel_to_frame(values, items, major, minor, True)
        except ImportError:
            pass

    if isinstance(x, Series) and isinstance(x.index, MultiIndex):
        x = DataFrame(x)
    elif isinstance(x, Series):
        raise ValueError('Series can only be used with a 2-level MultiIndex')

    if isinstance(x, (Panel, DataFrame)):
        if isinstance(x, DataFrame):
            if isinstance(x.index, MultiIndex):
                if len(x.index.levels) != 2:
                    raise ValueError('DataFrame input must have a '
                                     'MultiIndex with 2 levels')
                if isinstance(self._original, (DataFrame, PanelData, Series)):
                    for i in range(2):
                        index_names[i] = x.index.levels[i].name or index_names[i]
                self._frame = x
                if copy:
                    self._frame = self._frame.copy()
            else:
                self._frame = DataFrame({var_name: x.T.stack(dropna=False)})
        else:
            self._frame = x.swapaxes(1, 2).to_frame(filter_observations=False)
    elif isinstance(x, np.ndarray):
        if x.ndim not in (2, 3):
            raise ValueError('2 or 3-d array required for numpy input')
        if x.ndim == 2:
            x = x[None, :, :]
        k, t, n = x.shape
        var_str = var_name + '.{0:0>' + str(int(np.log10(k) + .01)) + '}'
        variables = [var_name] if k == 1 else [var_str.format(i) for i in range(k)]
        entity_str = 'entity.{0:0>' + str(int(np.log10(n) + .01)) + '}'
        entities = [entity_str.format(i) for i in range(n)]
        time = list(range(t))
        x = x.astype(np.float64, copy=False)
        panel = _Panel.from_array(x, items=variables, major_axis=time,
                                  minor_axis=entities)
        self._fake_panel = panel
        self._frame = panel.to_frame()
    else:
        raise TypeError('Only ndarrays, DataFrames, Panels or DataArrays '
                        'are supported')

    if convert_dummies:
        self._frame = expand_categoricals(self._frame, drop_first)
        self._frame = self._frame.astype(np.float64, copy=False)

    time_index = Series(self._frame.index.levels[1])
    if not (is_numeric_dtype(time_index.dtype) or
            is_datetime64_any_dtype(time_index.dtype)):
        raise ValueError('The index on the time dimension must be either '
                         'numeric or date-like')
    # self._k, self._t, self._n = self.panel.shape
    self._k, self._t, self._n = self.shape
    levels = self._frame.index.levels
    for i in range(2):
        levels[i].name = index_names[i]
def __init__(
    self,
    x: ArrayLike,
    var_name: str = "x",
    convert_dummies: bool = True,
    drop_first: bool = True,
    copy: bool = True,
):
    self._var_name = var_name
    self._convert_dummies = convert_dummies
    self._drop_first = drop_first
    self._panel: Optional[_Panel] = None
    self._shape: Optional[Tuple[int, int, int]] = None
    index_names = ["entity", "time"]
    if isinstance(x, PanelData):
        x = x.dataframe
    self._original = x

    if not isinstance(x, (Series, DataFrame, np.ndarray)):
        try:
            from xarray import DataArray

            if isinstance(x, DataArray):
                if x.ndim not in (2, 3):
                    raise ValueError("Only 2-d or 3-d DataArrays are supported")
                if x.ndim == 2:
                    x = x.to_pandas()
                else:
                    items: List[Hashable] = x.coords[x.dims[0]].values.tolist()
                    major: List[Hashable] = x.coords[x.dims[1]].values.tolist()
                    minor: List[Hashable] = x.coords[x.dims[2]].values.tolist()
                    values = x.values
                    x = panel_to_frame(values, items, major, minor, True)
        except ImportError:
            pass

    if isinstance(x, Series) and isinstance(x.index, MultiIndex):
        x = DataFrame(x)
    elif isinstance(x, Series):
        raise ValueError("Series can only be used with a 2-level MultiIndex")

    if isinstance(x, DataFrame):
        if isinstance(x.index, MultiIndex):
            if len(x.index.levels) != 2:
                raise ValueError("DataFrame input must have a "
                                 "MultiIndex with 2 levels")
            if isinstance(self._original, (DataFrame, PanelData, Series)):
                for i in range(2):
                    index_names[i] = x.index.levels[i].name or index_names[i]
            self._frame = x
            if copy:
                self._frame = self._frame.copy()
        else:
            self._frame = DataFrame({var_name: x.T.stack(dropna=False)})
    elif isinstance(x, np.ndarray):
        if x.ndim not in (2, 3):
            raise ValueError("2 or 3-d array required for numpy input")
        if x.ndim == 2:
            x = x[None, :, :]
        k, t, n = x.shape
        var_str = var_name + ".{0:0>" + str(int(np.log10(k) + 0.01)) + "}"
        variables = [var_name] if k == 1 else [var_str.format(i) for i in range(k)]
        entity_str = "entity.{0:0>" + str(int(np.log10(n) + 0.01)) + "}"
        entities = [entity_str.format(i) for i in range(n)]
        time = list(range(t))
        x = x.astype(np.float64, copy=False)
        panel = _Panel.from_array(x, items=variables, major_axis=time,
                                  minor_axis=entities)
        self._fake_panel = panel
        self._frame = panel.to_frame()
    else:
        raise TypeError("Only ndarrays, DataFrames or DataArrays are "
                        "supported")

    if convert_dummies:
        self._frame = expand_categoricals(self._frame, drop_first)
        self._frame = self._frame.astype(np.float64, copy=False)

    time_index = Series(self._frame.index.levels[1])
    if not (is_numeric_dtype(time_index.dtype) or
            is_datetime64_any_dtype(time_index.dtype)):
        raise ValueError("The index on the time dimension must be either "
                         "numeric or date-like")
    # self._k, self._t, self._n = self.panel.shape
    self._k, self._t, self._n = self.shape
    self._frame.index.set_names(index_names, inplace=True)
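# A short usage sketch for the constructor above, showing the two most common
# input layouts it accepts. The import path is an assumption (adjust to
# wherever PanelData lives in your checkout).
import numpy as np
import pandas as pd
from linearmodels.panel.data import PanelData  # assumed import path

nvar, ntime, nentity = 2, 4, 3
cube = np.random.standard_normal((nvar, ntime, nentity))

# 1. Directly from a (nvar, ntime, nentity) array: variables are named automatically.
from_array = PanelData(cube)
assert from_array.shape == (nvar, ntime, nentity)

# 2. From a DataFrame with a 2-level (entity, time) MultiIndex.
index = pd.MultiIndex.from_product(
    [["e0", "e1", "e2"], pd.date_range("2000-01-01", periods=ntime)],
    names=["entity", "time"],
)
frame = pd.DataFrame(
    {"x.0": cube[0].T.ravel(), "x.1": cube[1].T.ravel()}, index=index
)
from_frame = PanelData(frame)
assert from_frame.shape == (nvar, ntime, nentity)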
def generate_data(missing, datatype, const=False, ntk=(971, 7, 5), other_effects=0,
                  rng=None, num_cats=4):
    if rng is None:
        np.random.seed(12345)
    else:
        np.random.set_state(rng.get_state())

    n, t, k = ntk
    k += const
    x = standard_normal((k, t, n))
    beta = np.arange(1, k + 1)[:, None, None] / k
    y = (x * beta).sum(0) + standard_normal((t, n)) + 2 * standard_normal((1, n))
    w = np.random.chisquare(5, (t, n)) / 5
    c = None
    if other_effects == 1:
        cats = ['Industries']
    else:
        cats = ['cat.' + str(i) for i in range(other_effects)]
    if other_effects:
        if not isinstance(num_cats, list):
            num_cats = [num_cats] * other_effects
        c = []
        for i in range(other_effects):
            nc = num_cats[i]
            c.append(np.random.randint(0, nc, (1, t, n)))
        c = np.concatenate(c, 0)

    vcats = ['varcat.' + str(i) for i in range(2)]
    vc2 = np.ones((2, t, 1)) @ np.random.randint(0, n // 2, (2, 1, n))
    vc1 = vc2[[0]]

    if const:
        x[0] = 1.0

    if missing > 0:
        locs = np.random.choice(n * t, int(n * t * missing))
        y.flat[locs] = np.nan
        locs = np.random.choice(n * t * k, int(n * t * k * missing))
        x.flat[locs] = np.nan

    if datatype in ('pandas', 'xarray'):
        entities = ['firm' + str(i) for i in range(n)]
        time = date_range('1-1-1900', periods=t, freq='A-DEC')
        var_names = ['x' + str(i) for i in range(k)]
        # y = DataFrame(y, index=time, columns=entities)
        y = panel_to_frame(y[None], items=['y'], major_axis=time,
                           minor_axis=entities, swap=True)
        w = panel_to_frame(w[None], items=['w'], major_axis=time,
                           minor_axis=entities, swap=True)
        w = w.reindex(y.index)
        x = panel_to_frame(x, items=var_names, major_axis=time,
                           minor_axis=entities, swap=True)
        x = x.reindex(y.index)
        c = panel_to_frame(c, items=cats, major_axis=time,
                           minor_axis=entities, swap=True)
        c = c.reindex(y.index)
        vc1 = panel_to_frame(vc1, items=vcats[:1], major_axis=time,
                             minor_axis=entities, swap=True)
        vc1 = vc1.reindex(y.index)
        vc2 = panel_to_frame(vc2, items=vcats, major_axis=time,
                             minor_axis=entities, swap=True)
        vc2 = vc2.reindex(y.index)

    if datatype == 'xarray':
        # TODO: This is broken now, need to transform the MultiIndex to xarray 3d
        import xarray as xr
        x = xr.DataArray(PanelData(x).values3d,
                         coords={'entities': entities, 'time': time,
                                 'vars': var_names},
                         dims=['vars', 'time', 'entities'])
        y = xr.DataArray(PanelData(y).values3d,
                         coords={'entities': entities, 'time': time,
                                 'vars': ['y']},
                         dims=['vars', 'time', 'entities'])
        w = xr.DataArray(PanelData(w).values3d,
                         coords={'entities': entities, 'time': time,
                                 'vars': ['w']},
                         dims=['vars', 'time', 'entities'])
        if c.shape[1] > 0:
            c = xr.DataArray(PanelData(c).values3d,
                             coords={'entities': entities, 'time': time,
                                     'vars': c.columns},
                             dims=['vars', 'time', 'entities'])
        vc1 = xr.DataArray(PanelData(vc1).values3d,
                           coords={'entities': entities, 'time': time,
                                   'vars': vc1.columns},
                           dims=['vars', 'time', 'entities'])
        vc2 = xr.DataArray(PanelData(vc2).values3d,
                           coords={'entities': entities, 'time': time,
                                   'vars': vc2.columns},
                           dims=['vars', 'time', 'entities'])

    if rng is not None:
        rng.set_state(np.random.get_state())

    return AttrDict(y=y, x=x, w=w, c=c, vc1=vc1, vc2=vc2)
x[0, :, :] = 1
beta = np.arange(1, k + 2) / (k + 1)
eps = np.random.randn(t, n)
beta.shape = (k + 1, 1, 1)
y = (beta * x).sum(0) + eps
y += np.random.randn(1, n)

w = np.random.chisquare(10, size=(1, n)) / 10.0
w = np.ones((t, 1)) @ w
w = w / w.mean()

items = ['x' + str(i) for i in range(1, k + 1)]
items = ['intercept'] + items
major = pd.date_range('12-31-1999', periods=t, freq='A-DEC')
minor = ['firm.' + str(i) for i in range(1, n + 1)]

x = panel_to_frame(x, items, major, minor, swap=True)
y = panel_to_frame(y[None, :], ['y'], major, minor, swap=True)
w = panel_to_frame(w[None, :], ['w'], major, minor, swap=True)

x = PanelData(x)
y = PanelData(y)
w = PanelData(w)

z = concat([x.dataframe, y.dataframe, w.dataframe], 1)
final_index = pd.MultiIndex.from_product([minor, major])
final_index.levels[0].name = 'firm'
z = z.reindex(final_index)
z.index.levels[0].name = 'firm'
z.index.levels[1].name = 'time'
z = z.reset_index()
def generate_panel_data(
    nentity: int = 971,
    ntime: int = 7,
    nexog: int = 5,
    const: bool = False,
    missing: float = 0,
    other_effects: int = 2,
    ncats: Union[int, List[int]] = 4,
    rng: Optional[np.random.RandomState] = None,
) -> PanelModelData:
    """
    Parameters
    ----------
    nentity : int, default 971
        The number of entities in the panel.
    ntime : int, default 7
        The number of time periods in the panel.
    nexog : int, default 5
        The number of explanatory variables in the dataset.
    const : bool, default False
        Flag indicating that the model should include a constant.
    missing : float, default 0
        The percentage of values that are missing. Should be between 0 and 100.
    other_effects : int, default 2
        The number of other effects generated.
    ncats : Union[int, Sequence[int]], default 4
        The number of categories to use in other_effects and variance clusters.
        If list-like, then it must have as many elements as other_effects.
    rng : RandomState, default None
        A NumPy RandomState instance. If not provided, one is initialized using
        a fixed seed.

    Returns
    -------
    PanelModelData
        A namedtuple derived class containing 4 DataFrames:

        * `data` - A simulated data with variables y and x# for # in 0,...,4.
          If const is True, then also contains a column named const.
        * `weights` - Simulated non-negative weights.
        * `other_effects` - Simulated effects.
        * `clusters` - Simulated data to use in clustered covariance estimation.
    """
    if rng is None:
        rng = np.random.RandomState(
            [
                0xA14E2429,
                0x448D2E51,
                0x91B558E7,
                0x6A3F5CD2,
                0x22B43ABB,
                0xE746C92D,
                0xCE691A7D,
                0x66746EE7,
            ]
        )

    n, t, k = nentity, ntime, nexog
    k += int(const)
    x = rng.standard_normal((k, t, n))
    beta = np.arange(1, k + 1)[:, None, None] / k
    y = (
        (x * beta).sum(0)
        + rng.standard_normal((t, n))
        + 2 * rng.standard_normal((1, n))
    )
    w = rng.chisquare(5, (t, n)) / 5
    c = None
    cats = [f"cat.{i}" for i in range(other_effects)]
    if other_effects:
        if not isinstance(ncats, list):
            ncats = [ncats] * other_effects
        c = []
        for i in range(other_effects):
            nc = ncats[i]
            c.append(rng.randint(0, nc, (1, t, n)))
        c = np.concatenate(c, 0)

    vcats = [f"varcat.{i}" for i in range(2)]
    vc2 = np.ones((2, t, 1)) @ rng.randint(0, n // 2, (2, 1, n))
    vc1 = vc2[[0]]

    if const:
        x[0] = 1.0

    if missing > 0:
        locs = rng.choice(n * t, int(n * t * missing))
        y.flat[locs] = np.nan
        locs = rng.choice(n * t * k, int(n * t * k * missing))
        x.flat[locs] = np.nan

    entities = [f"firm{i}" for i in range(n)]
    time = date_range("1-1-1900", periods=t, freq="A-DEC")
    var_names = [f"x{i}" for i in range(k)]
    if const:
        var_names[1:] = var_names[:-1]
        var_names[0] = "const"
    # y = DataFrame(y, index=time, columns=entities)
    y_df = panel_to_frame(
        y[None], items=["y"], major_axis=time, minor_axis=entities, swap=True
    )
    index = y_df.index
    w_df = panel_to_frame(
        w[None], items=["w"], major_axis=time, minor_axis=entities, swap=True
    )
    w_df = w_df.reindex(index)
    x_df = panel_to_frame(
        x, items=var_names, major_axis=time, minor_axis=entities, swap=True
    )
    x_df = x_df.reindex(index)
    c_df = panel_to_frame(
        c, items=cats, major_axis=time, minor_axis=entities, swap=True
    )
    other_eff = c_df.reindex(index)
    vc1_df = panel_to_frame(
        vc1, items=vcats[:1], major_axis=time, minor_axis=entities, swap=True
    )
    vc1_df = vc1_df.reindex(index)
    vc2_df = panel_to_frame(
        vc2, items=vcats, major_axis=time, minor_axis=entities, swap=True
    )
    vc2_df = vc2_df.reindex(index)
    clusters = concat([vc1_df, vc2_df])
    data = concat([y_df, x_df], axis=1)
    return PanelModelData(data, w_df, other_eff, clusters)
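# A hedged usage sketch for generate_panel_data. The import path and the
# PanelModelData field names (data, weights, other_effects, clusters) are
# taken from the docstring above and may need adjusting to your checkout.
import numpy as np
from linearmodels.panel.utility import generate_panel_data  # assumed path

sim = generate_panel_data(nentity=50, ntime=5, nexog=3, const=True,
                          rng=np.random.RandomState(0))
print(sim.data.head())           # columns: y, const, x0, x1, x2 on an (entity, time) index
print(sim.weights.shape)         # one non-negative weight per observation
print(sim.other_effects.head())  # simulated categorical effects
print(sim.clusters.head())       # simulated cluster identifiers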