def test_labels(panel):
    """Check that PanelData labels agree with the axes of the ``panel`` fixture."""
    wrapped = PanelData(panel)

    # Variables come from the items axis.
    expected_vars = list(panel.items)
    assert wrapped.vars == expected_vars

    # Time labels come from the major axis.
    expected_time = list(panel.major_axis)
    assert wrapped.time == expected_time

    # Entity labels come from the minor axis.
    expected_entities = list(panel.minor_axis)
    assert wrapped.entities == expected_entities
def test_labels(mi_df):
    """Check that PanelData labels agree with the columns and index of ``mi_df``."""
    wrapped = PanelData(mi_df)
    frame_index = mi_df.index

    assert wrapped.vars == list(mi_df.columns)
    # Time labels are compared against the second index level.
    assert wrapped.time == list(frame_index.levels[1])
    # Entity labels are the unique values of the first index level.
    entity_labels = frame_index.get_level_values(0).unique()
    assert wrapped.entities == list(entity_labels)
# Scratch computations. ``x``, ``p`` and ``z`` are not defined in this
# fragment; they must already exist when these statements run.
a = x.T @ p @ x
b = (x.T @ z) @ (x.T @ z).T
# NOTE(review): the bare expressions below have no effect when run as a
# script -- presumably leftover notebook/REPL output; confirm before removing.
a
b
np.linalg.inv(a) @ b
np.trace(np.linalg.inv(a) @ b)
30
30
# Simulated data with one additional effect and no constant column.
data = generate_data(0, 'pandas', ntk=(101, 3, 5), other_effects=1, const=False)
y = PanelData(data.y)
x = PanelData(data.x)
w = PanelData(data.w)
# Overwrite the first regressor column with ones.
x.dataframe.iloc[:, 0] = 1
# Weighted fits: no effects, entity effect, time effect.  The last model
# (both effects) is only constructed here; no fit call follows in this
# fragment.
mod = PanelOLS(data.y, data.x, weights=data.w)
mod.fit()
mod = PanelOLS(y, x, weights=data.w, entity_effect=True)
mod.fit()
mod = PanelOLS(data.y, data.x, weights=data.w, time_effect=True)
mod.fit()
mod = PanelOLS(data.y, data.x, weights=data.w, time_effect=True, entity_effect=True)
def test_first_difference(data):
    """Smoke test: first-differencing the regressors must not raise."""
    regressors = PanelData(data.x)
    regressors.first_difference()
def test_numpy_1d():
    """A one-dimensional NumPy array must be rejected with ValueError."""
    length = 11
    flat_values = np.random.random(length)
    with pytest.raises(ValueError):
        PanelData(flat_values)
def test_demean_invalid(panel):
    """Demeaning by an unrecognised group name must raise ValueError."""
    wrapped = PanelData(panel)
    with pytest.raises(ValueError):
        wrapped.demean('unknown')
def test_series_multiindex(panel):
    """A one-column frame and the matching Series must give equal PanelData frames."""
    swapped = panel.swapaxes(1, 2)
    stacked = swapped.to_frame(filter_observations=False)

    # Same first column, once as a DataFrame and once as a Series.
    frame_based = PanelData(stacked.iloc[:, [0]])
    series_based = PanelData(stacked.iloc[:, 0])

    assert_frame_equal(frame_based.dataframe, series_based.dataframe)
def test_series_multiindex(mi_df):
    """A one-column frame and the matching Series must give equal PanelData frames."""
    # Same first column, once as a DataFrame and once as a Series.
    frame_based = PanelData(mi_df.iloc[:, [0]])
    series_based = PanelData(mi_df.iloc[:, 0])

    assert_frame_equal(frame_based.dataframe, series_based.dataframe)
def test_invalid_seires(mi_df):
    """A Series taken after ``reset_index`` must be rejected with ValueError."""
    flattened = mi_df.reset_index()
    with pytest.raises(ValueError):
        first_column = flattened.iloc[:, 0]
        PanelData(first_column)
def generate_data(missing, datatype, const=False, ntk=(971, 7, 5), other_effects=0, rng=None):
    """
    Simulate panel data (outcome, regressors, weights, effects, clusters).

    Parameters
    ----------
    missing : float
        Fraction used to size the sets of randomly chosen positions in ``y``
        and ``x`` that are overwritten with NaN.  Nothing is overwritten
        unless the value is positive.
    datatype : str
        ``'pandas'`` converts every output with ``panel_to_frame``;
        ``'xarray'`` does the same and then wraps the results in
        ``xarray.DataArray``.  Any other value returns the raw NumPy arrays.
    const : bool
        When true, one extra regressor is generated and the first regressor
        is set to 1.0.
    ntk : tuple
        ``(n, t, k)``: number of entities, time periods and regressors.
    other_effects : int
        Number of additional categorical effects to draw (integers in
        ``[0, 4)``).  With 0 no effect array is drawn.
    rng : object, optional
        Object exposing ``get_state``/``set_state``.  Its state is copied into
        NumPy's global generator on entry and the advanced global state is
        written back before returning.  When omitted, the global generator is
        seeded with 12345.

    Returns
    -------
    AttrDict
        Container with the fields ``y``, ``x``, ``w``, ``c``, ``vc1`` and
        ``vc2``.

    Notes
    -----
    All draws go through NumPy's global generator, so the statement order
    determines the simulated values.
    """
    if rng is None:
        np.random.seed(12345)
    else:
        np.random.set_state(rng.get_state())

    n, t, k = ntk
    # A requested constant occupies one additional regressor slot.
    k += const
    x = standard_normal((k, t, n))
    beta = np.arange(1, k + 1)[:, None, None] / k
    # Linear signal + idiosyncratic noise + a per-entity shock that is
    # broadcast over the time axis.
    y = (x * beta).sum(0) + standard_normal((t, n)) + 2 * standard_normal((1, n))
    w = np.random.chisquare(5, (t, n)) / 5
    c = None
    if other_effects == 1:
        cats = ['Industries']
    else:
        cats = ['cat.' + str(i) for i in range(other_effects)]
    if other_effects:
        c = np.random.randint(0, 4, (other_effects, t, n))

    vcats = ['varcat.' + str(i) for i in range(2)]
    # Two cluster variables; each entity's id is repeated for every period.
    vc2 = np.ones((2, t, 1)) @ np.random.randint(0, n // 2, (2, 1, n))
    vc1 = vc2[[0]]

    if const:
        x[0] = 1.0

    if missing > 0:
        locs = np.random.choice(n * t, int(n * t * missing))
        y.flat[locs] = np.nan
        locs = np.random.choice(n * t * k, int(n * t * k * missing))
        x.flat[locs] = np.nan

    if datatype in ('pandas', 'xarray'):
        entities = ['firm' + str(i) for i in range(n)]
        time = pd.date_range('1-1-1900', periods=t, freq='A-DEC')
        var_names = ['x' + str(i) for i in range(k)]
        # Previous construction, kept for reference:
        # y = pd.DataFrame(y, index=time, columns=entities)
        y = panel_to_frame(y[None], items=['y'], major_axis=time, minor_axis=entities, swap=True)
        w = panel_to_frame(w[None], items=['w'], major_axis=time, minor_axis=entities, swap=True)
        # Every other frame is aligned to the index of y.
        w = w.reindex(y.index)
        x = panel_to_frame(x, items=var_names, major_axis=time, minor_axis=entities, swap=True)
        x = x.reindex(y.index)
        # NOTE(review): c is None here when other_effects == 0 -- assumes
        # panel_to_frame accepts None; confirm against the helper.
        c = panel_to_frame(c, items=cats, major_axis=time, minor_axis=entities, swap=True)
        c = c.reindex(y.index)
        vc1 = panel_to_frame(vc1, items=vcats[:1], major_axis=time, minor_axis=entities, swap=True)
        vc1 = vc1.reindex(y.index)
        vc2 = panel_to_frame(vc2, items=vcats, major_axis=time, minor_axis=entities, swap=True)
        vc2 = vc2.reindex(y.index)

    if datatype == 'xarray':
        # TODO: This is broken now, need to transform MultiIndex to xarray 3d
        import xarray as xr
        x = xr.DataArray(PanelData(x).values3d,
                         coords={'entities': entities, 'time': time,
                                 'vars': var_names},
                         dims=['vars', 'time', 'entities'])
        y = xr.DataArray(PanelData(y).values3d,
                         coords={'entities': entities, 'time': time, 'vars': ['y']},
                         dims=['vars', 'time', 'entities'])
        w = xr.DataArray(PanelData(w).values3d,
                         coords={'entities': entities, 'time': time, 'vars': ['w']},
                         dims=['vars', 'time', 'entities'])
        # Only wrap the effects when at least one effect column exists.
        if c.shape[1] > 0:
            c = xr.DataArray(PanelData(c).values3d,
                             coords={'entities': entities, 'time': time, 'vars': c.columns},
                             dims=['vars', 'time', 'entities'])
        vc1 = xr.DataArray(PanelData(vc1).values3d,
                           coords={'entities': entities, 'time': time, 'vars': vc1.columns},
                           dims=['vars', 'time', 'entities'])
        vc2 = xr.DataArray(PanelData(vc2).values3d,
                           coords={'entities': entities, 'time': time, 'vars': vc2.columns},
                           dims=['vars', 'time', 'entities'])

    # Hand the advanced global state back to the caller's generator.
    if rng is not None:
        rng.set_state(np.random.get_state())

    return AttrDict(y=y, x=x, w=w, c=c, vc1=vc1, vc2=vc2)
def test_incorrect_types_xarray():
    """A one-dimensional DataArray must be rejected with ValueError."""
    with pytest.raises(ValueError):
        one_dimensional = xr.DataArray(np.random.randn(10))
        PanelData(one_dimensional)
std_errs = {} std_errs_no = {} std_errs_u = {} std_errs_u_no = {} std_errs_r = {} std_errs_r_no = {} vals = np.zeros((NUM_REPS, 5, 7)) for b in range(NUM_REPS): if b % 25 == 0: print(key, n, b) data = generate_data(0.00, 'pandas', ntk=(n, 3, 5), other_effects=1, const=False, rng=rs) mo, fo = options[key] mod_type, cluster_type = key.split(':') y = PanelData(data.y) random_effects = np.random.randint(0, n // 3, size=y.dataframe.shape) other_random = np.random.randint(0, n // 5, size=y.dataframe.shape) if mod_type == 'random': effects = y.copy() effects.dataframe.iloc[:, :] = random_effects mo['other_effects'] = effects if cluster_type in ('random', 'other-random', 'entity-nested', 'random-nested'): clusters = y.copy() if cluster_type == 'random': clusters.dataframe.iloc[:, :] = random_effects elif cluster_type == 'other-random': clusters.dataframe.iloc[:, :] = other_random elif cluster_type == 'entity_nested':
import pytest

from linearmodels.panel.data import PanelData
from linearmodels.shared.typed_getters import (
    get_array_like,
    get_bool,
    get_float,
    get_panel_data_like,
    get_string,
)

# Types treated as array-like, and the same set extended with PanelData.
ARRAY_LIKE: Tuple[Type, ...] = (np.ndarray, pd.Series, pd.DataFrame)
PANEL_LIKE: Tuple[Type, ...] = ARRAY_LIKE + (PanelData, )
# One small sample instance per array-like type; PANELS adds a PanelData
# built from a single-element 3-d array.
ARRAYS: Tuple[Any, ...] = (np.array([1.0]), pd.Series([1.0]), pd.DataFrame([[1.0]]))
PANELS: Tuple[Any, ...] = ARRAYS + (PanelData(np.array([[[1.0]]])), )

# xarray is optional: when it imports, DataArray is appended to every tuple;
# otherwise the tuples are left unchanged.
try:
    import xarray as xr

    ARRAY_LIKE += (xr.DataArray, )
    PANEL_LIKE += (xr.DataArray, )
    ARRAYS += (xr.DataArray(ARRAYS[0]), )
    PANELS += (xr.DataArray(ARRAYS[0]), )
except ImportError:
    pass


@pytest.fixture(params=ARRAYS)
def arr(request):
    """Yield each sample in ARRAYS (as it stands after the optional xarray step)."""
    return request.param
複数年の場合。 df.reset_index().query('year in [2000,2002]') 上と同じ結果。 df.reset_index().query('year not in [2001]') ## `linearmodels`の`PanelData` `linearmodels`では`MultiIndex`化された`DataFrame`をそのまま読み込み推定することができる。一方で,`linearmodels`の関数`PanelData`を使い`MultiIndex`化された`DataFrame`を`PanelData`オブジェクトに変換すると分析に必要な計算を簡単にできるようになる。必須ではないが,知っていて損はしない関数である。 まず`df`を`PanelData`オブジェクトに変換する。 dfp = PanelData(df) dfp --- 属性`shape`は,`PanelData`の次元(変数の数,期間数,観察単位の数)を表示する。以下が返り値の内容である。 $$ \left(\text{変数の数},\text{期間数},\text{観察単位の数}\right) $$ dfp.shape * 変数の数:4(列にある変数) * 期間数:3(年) * 観察単位の数:3(国)
def test_missing(panel):
    """``isnull`` must flag exactly the rows of ``values2d`` that contain a NaN."""
    # Blank out every third minor-axis slot of the first item.
    panel.iloc[0, :, ::3] = np.nan
    wrapped = PanelData(panel)

    nan_mask = np.isnan(wrapped.values2d)
    rows_with_nan = np.any(nan_mask, 1)
    assert_equal(wrapped.isnull, rows_with_nan)
def test_repr_html(mi_df):
    """The HTML representation must contain a ``<br/>`` tag."""
    wrapped = PanelData(mi_df)
    rendered = wrapped._repr_html_()
    assert '<br/>' in rendered
def test_str_repr(panel):
    """``str`` must name the class and ``repr`` must include the object's hex id."""
    wrapped = PanelData(panel)

    text = str(wrapped)
    assert 'PanelData' in text

    address = hex(id(wrapped))
    assert address in repr(wrapped)
def generate_data(
    missing: float,
    datatype: Literal["pandas", "xarray", "numpy"],
    const: bool = False,
    ntk: tuple[int, int, int] = (971, 7, 5),
    other_effects: int = 0,
    rng: RandomState | None = None,
    num_cats: int | list[int] = 4,
):
    """
    Simulate panel data (outcome, regressors, weights, effects, clusters).

    Parameters
    ----------
    missing : float
        Fraction used to size the sets of randomly chosen positions in ``y``
        and ``x`` that are overwritten with NaN.  Nothing is overwritten
        unless the value is positive.
    datatype : {"pandas", "xarray", "numpy"}
        Container type of the returned fields: raw NumPy arrays, frames built
        with ``panel_to_frame``, or ``xarray.DataArray`` objects.
    const : bool
        When true, one extra regressor is generated and the first regressor
        is set to 1.0.
    ntk : tuple[int, int, int]
        ``(n, t, k)``: number of entities, time periods and regressors.
    other_effects : int
        Number of additional categorical effects to draw.
    rng : RandomState, optional
        Its state is copied into NumPy's global generator on entry and the
        advanced global state is written back once all draws are done.  When
        omitted, the global generator is seeded with 12345.
    num_cats : int or list[int]
        Exclusive upper bound of the integer codes drawn for each effect.  An
        int is applied to every effect; a list is indexed per effect.

    Returns
    -------
    AttrDict
        Container with the fields ``y``, ``x``, ``w``, ``c``, ``vc1`` and
        ``vc2``.

    Notes
    -----
    All draws go through NumPy's global generator, so the statement order
    determines the simulated values.
    """
    if rng is None:
        np.random.seed(12345)
    else:
        np.random.set_state(rng.get_state())

    n, t, k = ntk
    # A requested constant occupies one additional regressor slot.
    k += const
    x = standard_normal((k, t, n))
    beta = np.arange(1, k + 1)[:, None, None] / k
    y = np.empty((t, n), dtype=np.float64)
    # Linear signal + idiosyncratic noise + a per-entity shock that is
    # broadcast over the time axis.
    y[:, :] = (x * beta).sum(0) + standard_normal(
        (t, n)) + 2 * standard_normal((1, n))
    w = np.random.chisquare(5, (t, n)) / 5
    # Placeholder with zero columns; replaced below when effects are drawn.
    c = np.empty((y.size, 0), dtype=int)
    if other_effects == 1:
        cats = ["Industries"]
    else:
        cats = ["cat." + str(i) for i in range(other_effects)]
    if other_effects:
        if isinstance(num_cats, int):
            num_cats = [num_cats] * other_effects
        oe = []
        for i in range(other_effects):
            nc = num_cats[i]
            oe.append(np.random.randint(0, nc, (1, t, n)))
        c = np.concatenate(oe, 0)

    vcats = ["varcat." + str(i) for i in range(2)]
    # Two cluster variables; each entity's id is repeated for every period.
    vc2 = np.ones((2, t, 1)) @ np.random.randint(0, n // 2, (2, 1, n))
    vc1 = vc2[[0]]

    if const:
        x[0] = 1.0

    if missing > 0:
        locs = np.random.choice(n * t, int(n * t * missing))
        y.flat[locs] = np.nan
        locs = np.random.choice(n * t * k, int(n * t * k * missing))
        x.flat[locs] = np.nan
    # All random draws are done: hand the advanced state back to the caller.
    if rng is not None:
        rng.set_state(np.random.get_state())
    if datatype == "numpy":
        return AttrDict(y=y, x=x, w=w, c=c, vc1=vc1, vc2=vc2)

    entities = ["firm" + str(i) for i in range(n)]
    time = date_range("1-1-1900", periods=t, freq="A-DEC")
    var_names = ["x" + str(i) for i in range(k)]
    # Previous construction, kept for reference:
    # y = DataFrame(y, index=time, columns=entities)
    y_df = panel_to_frame(y[None],
                          items=["y"],
                          major_axis=time,
                          minor_axis=entities,
                          swap=True)
    w_df = panel_to_frame(w[None],
                          items=["w"],
                          major_axis=time,
                          minor_axis=entities,
                          swap=True)
    # Every other frame is aligned to the index of y_df.
    w_df = w_df.reindex(y_df.index)
    x_df = panel_to_frame(x,
                          items=var_names,
                          major_axis=time,
                          minor_axis=entities,
                          swap=True)
    x_df = x_df.reindex(y_df.index)
    # The zero-column placeholder becomes an empty frame on y's index.
    if c.shape[1]:
        c_df = panel_to_frame(c,
                              items=cats,
                              major_axis=time,
                              minor_axis=entities,
                              swap=True)
    else:
        c_df = DataFrame(index=y_df.index)
    c_df = c_df.reindex(y_df.index)
    vc1_df = panel_to_frame(vc1,
                            items=vcats[:1],
                            major_axis=time,
                            minor_axis=entities,
                            swap=True)
    vc1_df = vc1_df.reindex(y_df.index)
    vc2_df = panel_to_frame(vc2,
                            items=vcats,
                            major_axis=time,
                            minor_axis=entities,
                            swap=True)
    vc2_df = vc2_df.reindex(y_df.index)
    if datatype == "pandas":
        return AttrDict(y=y_df, x=x_df, w=w_df, c=c_df, vc1=vc1_df, vc2=vc2_df)

    assert datatype == "xarray"
    # Imported lazily so xarray is only needed when this output is requested.
    import xarray as xr
    from xarray.core.dtypes import NA

    x_xr = xr.DataArray(
        PanelData(x_df).values3d,
        coords={
            "entities": entities,
            "time": time,
            "vars": var_names
        },
        dims=["vars", "time", "entities"],
    )
    y_xr = xr.DataArray(
        PanelData(y_df).values3d,
        coords={
            "entities": entities,
            "time": time,
            "vars": ["y"]
        },
        dims=["vars", "time", "entities"],
    )
    w_xr = xr.DataArray(
        PanelData(w_df).values3d,
        coords={
            "entities": entities,
            "time": time,
            "vars": ["w"]
        },
        dims=["vars", "time", "entities"],
    )
    # Without effect columns the array is filled from xarray's NA marker.
    c_vals = PanelData(c_df).values3d if c.shape[1] else NA
    c_xr = xr.DataArray(
        c_vals,
        coords={
            "entities": entities,
            "time": time,
            "vars": c_df.columns
        },
        dims=["vars", "time", "entities"],
    )
    vc1_xr = xr.DataArray(
        PanelData(vc1_df).values3d,
        coords={
            "entities": entities,
            "time": time,
            "vars": vc1_df.columns
        },
        dims=["vars", "time", "entities"],
    )
    vc2_xr = xr.DataArray(
        PanelData(vc2_df).values3d,
        coords={
            "entities": entities,
            "time": time,
            "vars": vc2_df.columns
        },
        dims=["vars", "time", "entities"],
    )
    return AttrDict(y=y_xr, x=x_xr, w=w_xr, c=c_xr, vc1=vc1_xr, vc2=vc2_xr)
def test_roundtrip_3d(data):
    """``values3d`` must reproduce the raw values of the input regressors."""
    source = data.x
    wrapped = PanelData(source)

    # NumPy input is compared as-is; other containers through ``.values``.
    if isinstance(source, np.ndarray):
        raw_values = source
    else:
        raw_values = source.values

    assert_equal(wrapped.values3d, raw_values)
# Outcome: linear signal plus noise, plus one extra shock per column of y
# that is broadcast over the leading axis.
# ``beta``, ``x``, ``eps``, ``n``, ``t`` and ``k`` are defined before this
# fragment.
y = (beta * x).sum(0) + eps
y += np.random.randn(1, n)
# One weight per entity, repeated for each of the t periods and rescaled to
# have mean one.
w = np.random.chisquare(10, size=(1, n)) / 10.0
w = np.ones((t, 1)) @ w
w = w / w.mean()

items = ["x" + str(i) for i in range(1, k + 1)]
items = ["intercept"] + items
major = pd.date_range("12-31-1999", periods=t, freq="A-DEC")
minor = ["firm." + str(i) for i in range(1, n + 1)]

x = panel_to_frame(x, items, major, minor, swap=True)
y = panel_to_frame(y[None, :], ["y"], major, minor, swap=True)
w = panel_to_frame(w[None, :], ["w"], major, minor, swap=True)

x = PanelData(x)
y = PanelData(y)
w = PanelData(w)

# ``axis`` must be passed by keyword: pandas >= 2.0 rejects it positionally.
z = concat([x.dataframe, y.dataframe, w.dataframe], axis=1)
# Level names are set on the MultiIndex itself.  Assigning to
# ``index.levels[i].name`` raises RuntimeError on pandas >= 1.0.
final_index = pd.MultiIndex.from_product([minor, major], names=["firm", "time"])
z = z.reindex(final_index)
z.index.names = ["firm", "time"]

# reset_index turns the named levels into the "firm" and "time" columns.
z = z.reset_index()
# Integer code for each firm label.
z["firm_id"] = z.firm.astype("category")
z["firm_id"] = z.firm_id.cat.codes

vars = ["y", "x1", "x2", "x3", "x4", "x5"]
def test_invalid_seires(panel):
    """A Series taken after ``to_frame().reset_index()`` must raise ValueError."""
    stacked = panel.to_frame()
    flattened = stacked.reset_index()
    with pytest.raises(ValueError):
        first_column = flattened.iloc[:, 0]
        PanelData(first_column)
def generate_data(
    missing,
    datatype,
    const=False,
    ntk=(971, 7, 5),
    other_effects=0,
    rng=None,
    num_cats=4,
):
    """
    Simulate panel data (outcome, regressors, weights, effects, clusters).

    Parameters
    ----------
    missing : float
        Fraction used to size the sets of randomly chosen positions in ``y``
        and ``x`` that are overwritten with NaN.  Nothing is overwritten
        unless the value is positive.
    datatype : str
        ``"pandas"`` converts every output with ``panel_to_frame``;
        ``"xarray"`` does the same and then wraps the results in
        ``xarray.DataArray``.  Any other value returns the raw NumPy arrays.
    const : bool
        When true, one extra regressor is generated and the first regressor
        is set to 1.0.
    ntk : tuple
        ``(n, t, k)``: number of entities, time periods and regressors.
    other_effects : int
        Number of additional categorical effects to draw.  With 0 no effect
        array is drawn.
    rng : object, optional
        Object exposing ``get_state``/``set_state``.  Its state is copied into
        NumPy's global generator on entry and the advanced global state is
        written back before returning.  When omitted, the global generator is
        seeded with 12345.
    num_cats : int or list
        Exclusive upper bound of the integer codes drawn for each effect.  A
        non-list value is applied to every effect; a list is indexed per
        effect.

    Returns
    -------
    AttrDict
        Container with the fields ``y``, ``x``, ``w``, ``c``, ``vc1`` and
        ``vc2``.

    Notes
    -----
    All draws go through NumPy's global generator, so the statement order
    determines the simulated values.
    """
    if rng is None:
        np.random.seed(12345)
    else:
        np.random.set_state(rng.get_state())

    n, t, k = ntk
    # A requested constant occupies one additional regressor slot.
    k += const
    x = standard_normal((k, t, n))
    beta = np.arange(1, k + 1)[:, None, None] / k
    # Linear signal + idiosyncratic noise + a per-entity shock that is
    # broadcast over the time axis.
    y = (x * beta).sum(0) + standard_normal((t, n)) + 2 * standard_normal(
        (1, n))
    w = np.random.chisquare(5, (t, n)) / 5
    c = None
    if other_effects == 1:
        cats = ["Industries"]
    else:
        cats = ["cat." + str(i) for i in range(other_effects)]
    if other_effects:
        if not isinstance(num_cats, list):
            num_cats = [num_cats] * other_effects
        # One (1, t, n) block of integer codes per effect, stacked on axis 0.
        c = []
        for i in range(other_effects):
            nc = num_cats[i]
            c.append(np.random.randint(0, nc, (1, t, n)))
        c = np.concatenate(c, 0)

    vcats = ["varcat." + str(i) for i in range(2)]
    # Two cluster variables; each entity's id is repeated for every period.
    vc2 = np.ones((2, t, 1)) @ np.random.randint(0, n // 2, (2, 1, n))
    vc1 = vc2[[0]]

    if const:
        x[0] = 1.0

    if missing > 0:
        locs = np.random.choice(n * t, int(n * t * missing))
        y.flat[locs] = np.nan
        locs = np.random.choice(n * t * k, int(n * t * k * missing))
        x.flat[locs] = np.nan

    if datatype in ("pandas", "xarray"):
        entities = ["firm" + str(i) for i in range(n)]
        time = date_range("1-1-1900", periods=t, freq="A-DEC")
        var_names = ["x" + str(i) for i in range(k)]
        # Previous construction, kept for reference:
        # y = DataFrame(y, index=time, columns=entities)
        y = panel_to_frame(y[None],
                           items=["y"],
                           major_axis=time,
                           minor_axis=entities,
                           swap=True)
        w = panel_to_frame(w[None],
                           items=["w"],
                           major_axis=time,
                           minor_axis=entities,
                           swap=True)
        # Every other frame is aligned to the index of y.
        w = w.reindex(y.index)
        x = panel_to_frame(x,
                           items=var_names,
                           major_axis=time,
                           minor_axis=entities,
                           swap=True)
        x = x.reindex(y.index)
        # NOTE(review): c is None here when other_effects == 0 -- assumes
        # panel_to_frame accepts None; confirm against the helper.
        c = panel_to_frame(c,
                           items=cats,
                           major_axis=time,
                           minor_axis=entities,
                           swap=True)
        c = c.reindex(y.index)
        vc1 = panel_to_frame(vc1,
                             items=vcats[:1],
                             major_axis=time,
                             minor_axis=entities,
                             swap=True)
        vc1 = vc1.reindex(y.index)
        vc2 = panel_to_frame(vc2,
                             items=vcats,
                             major_axis=time,
                             minor_axis=entities,
                             swap=True)
        vc2 = vc2.reindex(y.index)

    if datatype == "xarray":
        # TODO: This is broken now, need to transform MultiIndex to xarray 3d
        import xarray as xr

        x = xr.DataArray(
            PanelData(x).values3d,
            coords={
                "entities": entities,
                "time": time,
                "vars": var_names
            },
            dims=["vars", "time", "entities"],
        )
        y = xr.DataArray(
            PanelData(y).values3d,
            coords={
                "entities": entities,
                "time": time,
                "vars": ["y"]
            },
            dims=["vars", "time", "entities"],
        )
        w = xr.DataArray(
            PanelData(w).values3d,
            coords={
                "entities": entities,
                "time": time,
                "vars": ["w"]
            },
            dims=["vars", "time", "entities"],
        )
        # Only wrap the effects when at least one effect column exists.
        if c.shape[1] > 0:
            c = xr.DataArray(
                PanelData(c).values3d,
                coords={
                    "entities": entities,
                    "time": time,
                    "vars": c.columns
                },
                dims=["vars", "time", "entities"],
            )
        vc1 = xr.DataArray(
            PanelData(vc1).values3d,
            coords={
                "entities": entities,
                "time": time,
                "vars": vc1.columns
            },
            dims=["vars", "time", "entities"],
        )
        vc2 = xr.DataArray(
            PanelData(vc2).values3d,
            coords={
                "entities": entities,
                "time": time,
                "vars": vc2.columns
            },
            dims=["vars", "time", "entities"],
        )

    # Hand the advanced global state back to the caller's generator.
    if rng is not None:
        rng.set_state(np.random.get_state())

    return AttrDict(y=y, x=x, w=w, c=c, vc1=vc1, vc2=vc2)
def test_repr_html(panel):
    """The HTML representation must contain a ``<br/>`` tag."""
    wrapped = PanelData(panel)
    rendered = wrapped._repr_html_()
    assert '<br/>' in rendered
def test_valid_weight_shape(data):
    """
    Weights given per observation, per time period, per entity, or in the
    shape of ``data.y`` must all yield the same normalised model weights:
    the weights of the non-missing observations divided by their mean.
    """

    def fitted_weights(raw_weights):
        # Fit a weighted model and return the weights it stores.
        model = PanelOLS(data.y, data.x, weights=raw_weights)
        model.fit()
        return model.weights.values2d

    def normalised(flat_weights):
        # Drop observations flagged as missing, then scale to mean one.
        retained = flat_weights[~missing.squeeze()][:, None]
        return retained / retained.mean()

    def axis_lengths():
        # (number of periods, number of entities) for each supported input.
        if isinstance(data.x, pd.DataFrame):
            return len(data.y.index.levels[1]), len(data.y.index.levels[0])
        if isinstance(data.x, np.ndarray):
            return data.y.shape[0], data.y.shape[1]
        return data.y.shape[1], data.y.shape[2]

    # Same size
    total = np.prod(data.y.shape)
    weights = 1 + np.random.random_sample(total)
    actual = fitted_weights(weights)
    missing = PanelData(data.y).isnull | PanelData(data.x).isnull
    assert_equal(actual, normalised(weights))

    # Per time
    n_time, n_entity = axis_lengths()
    weights = 1 + np.random.random_sample(n_time)
    actual = fitted_weights(weights)
    spread = weights[:, None] @ np.ones((1, n_entity))
    assert_equal(actual, normalised(spread.T.ravel()))

    # Per entity
    weights = 1 + np.random.random_sample(n_entity)
    actual = fitted_weights(weights)
    spread = np.ones((n_time, 1)) @ weights[None, :]
    assert_equal(actual, normalised(spread.T.ravel()))

    # Same shape as the dependent variable
    weights = 1 + np.random.random_sample(data.y.shape)
    actual = fitted_weights(weights)
    assert_equal(actual, normalised(weights.T.ravel()))
def test_dimensions(mi_df):
    """Entity, variable and observation counts must match the shape of ``mi_df``."""
    wrapped = PanelData(mi_df)
    index_levels = mi_df.index.levels

    # First index level -> entities, second index level -> observations.
    assert wrapped.nentity == len(index_levels[0])
    assert wrapped.nvar == mi_df.shape[1]
    assert wrapped.nobs == len(index_levels[1])
def test_dimensions(panel):
    """Entity, variable and observation counts must match ``panel.shape``."""
    wrapped = PanelData(panel)

    # panel.shape is read as (variables, observations, entities).
    assert wrapped.nentity == panel.shape[2]
    assert wrapped.nvar == panel.shape[0]
    assert wrapped.nobs == panel.shape[1]
def test_incorrect_types():
    """A plain Python list must be rejected with TypeError."""
    with pytest.raises(TypeError):
        as_list = list(np.random.randn(10))
        PanelData(as_list)
# Outcome: linear signal plus noise, plus one extra shock per column of y
# that is broadcast over the leading axis.
# ``beta``, ``x``, ``eps``, ``n``, ``t`` and ``k`` are defined before this
# fragment.
y = (beta * x).sum(0) + eps
y += np.random.randn(1, n)
# One weight per entity, repeated for each of the t periods and rescaled to
# have mean one.
w = np.random.chisquare(10, size=(1, n)) / 10.0
w = np.ones((t, 1)) @ w
w = w / float(w.mean())

items = ["x" + str(i) for i in range(1, k + 1)]
items = ["intercept"] + items
major = pd.date_range("12-31-1999", periods=t, freq="A-DEC")
minor = ["firm." + str(i) for i in range(1, n + 1)]

x = panel_to_frame(x, items, major, minor, swap=True)
y = panel_to_frame(y[None, :], ["y"], major, minor, swap=True)
w = panel_to_frame(w[None, :], ["w"], major, minor, swap=True)

x_panel_data = PanelData(x)
y_panel_data = PanelData(y)
w_panel_data = PanelData(w)

# ``axis`` must be passed by keyword: pandas >= 2.0 rejects it positionally.
z = pd.concat(
    [x_panel_data.dataframe, y_panel_data.dataframe, w_panel_data.dataframe],
    axis=1,
    sort=False,
)
# Level names are set on the MultiIndex itself.  Assigning to
# ``index.levels[i].name`` raises RuntimeError on pandas >= 1.0.
final_index = pd.MultiIndex.from_product([minor, major], names=["firm", "time"])
z = z.reindex(final_index)
z.index.names = ["firm", "time"]

# reset_index turns the named levels into the "firm" and "time" columns.
z = z.reset_index()