def test_emstep_methods_missing(matlab_results, k_factors, factor_orders,
                                factor_multiplicities, idiosyncratic_ar1):
    # Test that in the case of missing data, the direct and optimized EM step
    # methods for the observation equation give identical results across a
    # variety of parameterizations
    endog_M = matlab_results[0].iloc[:, :10]
    endog_Q = matlab_results[1].iloc[:, :10]

    # Construct the model
    mod = dynamic_factor_mq.DynamicFactorMQ(
        endog_M, endog_quarterly=endog_Q, factors=k_factors,
        factor_orders=factor_orders,
        factor_multiplicities=factor_multiplicities,
        idiosyncratic_ar1=idiosyncratic_ar1, standardize=True)
    mod.ssm.filter_univariate = True

    params0 = mod.start_params
    _, params1 = mod._em_iteration(params0, mstep_method='missing')

    # Now double-check the observation equation M step for identical H and
    # Lambda directly
    mod.update(params1)
    res = mod.ssm.smooth()

    a = res.smoothed_state.T[..., None]
    cov_a = res.smoothed_state_cov.transpose(2, 0, 1)
    Eaa = cov_a + np.matmul(a, a.transpose(0, 2, 1))

    Lambda, H = mod._em_maximization_obs_missing(res, Eaa, a, compute_H=True)


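# A minimal standalone sketch (illustrative helper, not statsmodels API) of
# the second-moment computation used in the M-step checks above and below:
# for smoothed states a_t with smoothed covariances V_t, the identity
# E[a_t a_t'] = V_t + a_t a_t' is evaluated for all t at once via batched
# matrix multiplication. The function name and argument shapes mirror
# res.smoothed_state and res.smoothed_state_cov above.
def _second_moments_sketch(smoothed_state, smoothed_state_cov):
    # smoothed_state: (k_states, nobs); smoothed_state_cov: (k, k, nobs)
    a = smoothed_state.T[..., None]                    # (nobs, k_states, 1)
    cov_a = smoothed_state_cov.transpose(2, 0, 1)      # (nobs, k, k)
    return cov_a + np.matmul(a, a.transpose(0, 2, 1))  # (nobs, k, k)

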
def test_k_factor1(reset_randomstate):
    # Fitted parameters for np.random.seed(1234) replicate the true
    # parameters pretty well (flipped signs on loadings are just from the
    # usual factor sign identification issue):
    #                  True  Fitted
    #   loading.0->0   1.00   -0.98
    #   loading.0->1  -0.75    0.75
    #   loading.0->2   0.25   -0.24
    #   loading.0->3  -0.30    0.31
    #   L1.0->0        0.50    0.50
    #   sigma2.0      10.00   10.07
    #   sigma2.1      10.00   10.06
    #   sigma2.2      10.00    9.94
    #   sigma2.3      10.00   11.60
    np.random.seed(1234)
    endog_M, endog_Q, _, _, true_params, _ = simulate_k_factor1(nobs=100000)

    mod = dynamic_factor_mq.DynamicFactorMQ(endog_M, endog_quarterly=endog_Q,
                                            factors=1, factor_orders=1,
                                            idiosyncratic_ar1=False)
    # Fit the model with L-BFGS. Because the model doesn't impose identifying
    # assumptions on the factors, here we force identification by fixing the
    # factor error variance to be unity
    with mod.fix_params({'fb(0).cov.chol[1,1]': 1.}):
        mod.fit(method='lbfgs', disp=False)


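# A hypothetical helper (a sketch, not used by the test above) illustrating
# how the sign-identification ambiguity documented in the comment table can
# be resolved before comparing fitted loadings to true loadings: the factor
# and its loadings can be jointly flipped, so we normalize the loading
# vector so that its first entry is positive.
def _normalize_loading_sign(loadings):
    loadings = np.asarray(loadings, dtype=float)
    # Flip the entire vector if the first loading is negative; a zero first
    # loading leaves the vector unchanged
    sign = -1.0 if loadings[0] < 0 else 1.0
    return sign * loadings

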
def test_emstep1(matlab_results, run):
    # Test that our EM step gets params2 from params1
    # Uses our default method for the observation equation, which is an
    # optimized version of the method presented in Bańbura and Modugno (2014)
    # (e.g. our version doesn't require the loop over T or the Kronecker
    # product)
    endog_M, endog_Q = matlab_results[:2]
    results1 = matlab_results[2][f'{run}1']
    results2 = matlab_results[2][f'{run}2']

    # Construct the model
    mod = dynamic_factor_mq.DynamicFactorMQ(
        endog_M.iloc[:, :results1['k_endog_M']], endog_quarterly=endog_Q,
        factors=results1['factors'],
        factor_orders=results1['factor_orders'],
        factor_multiplicities=results1['factor_multiplicities'],
        idiosyncratic_ar1=True, init_t0=True, obs_cov_diag=True,
        standardize=True)
    init = initialization.Initialization(
        mod.k_states, 'known', constant=results1['initial_state'],
        stationary_cov=results1['initial_state_cov'])
    res2, params2 = mod._em_iteration(results1['params'], init=init,
                                      mstep_method='missing')

    # Test parameters
    true2 = results2['params']
    assert_allclose(params2[mod._p['loadings']], true2[mod._p['loadings']])
    assert_allclose(params2[mod._p['factor_ar']], true2[mod._p['factor_ar']])
    assert_allclose(params2[mod._p['factor_cov']],
                    true2[mod._p['factor_cov']])
    assert_allclose(params2[mod._p['idiosyncratic_ar1']],
                    true2[mod._p['idiosyncratic_ar1']])
    assert_allclose(params2[mod._p['idiosyncratic_var']],
                    true2[mod._p['idiosyncratic_var']])


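# For reference, a minimal sketch (an illustrative restatement, not the
# statsmodels implementation) of the complete-data loadings M-step from
# Bańbura and Modugno (2014) that the optimized method reproduces:
#   Lambda = (sum_t y_t E[a_t]') (sum_t E[a_t a_t'])^{-1}
def _loadings_mstep_sketch(y, a, Eaa):
    # y: (nobs, k_endog) observations with no missing entries
    # a: (nobs, k_states, 1) smoothed state means, as computed in the tests
    # Eaa: (nobs, k_states, k_states) smoothed second moments
    num = (y[:, :, None] @ a.transpose(0, 2, 1)).sum(axis=0)
    den = Eaa.sum(axis=0)
    # den is symmetric, so solve(den, num.T).T is equivalent to
    # num @ inv(den) but avoids forming the explicit inverse
    return np.linalg.solve(den, num.T).T

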
def test_known(matlab_results, run):
    endog_M, endog_Q = matlab_results[:2]
    results = matlab_results[2][run]

    # Construct the model
    mod = dynamic_factor_mq.DynamicFactorMQ(
        endog_M.iloc[:, :results['k_endog_M']], endog_quarterly=endog_Q,
        factors=results['factors'], factor_orders=results['factor_orders'],
        factor_multiplicities=results['factor_multiplicities'],
        idiosyncratic_ar1=True, init_t0=True, obs_cov_diag=True,
        standardize=True)
    mod.initialize_known(results['initial_state'],
                         results['initial_state_cov'])
    res = mod.smooth(results['params'], cov_type='none')

    assert_allclose(res.llf - mod.loglike_constant, results['llf'])
    assert_allclose(res.filter_results.smoothed_forecasts.T[1:],
                    results['smoothed_forecasts'][:-1])
    assert_allclose(
        res.forecast(1, original_scale=False).iloc[0],
        results['smoothed_forecasts'][-1])


def test_news(matlab_results, run):
    endog_M, endog_Q = matlab_results[:2]
    results = matlab_results[2][run]
    updated_M, updated_Q = matlab_results[-2:]

    # Construct the base model
    mod1 = dynamic_factor_mq.DynamicFactorMQ(
        endog_M.iloc[:, :results['k_endog_M']], endog_quarterly=endog_Q,
        factors=results['factors'], factor_orders=results['factor_orders'],
        factor_multiplicities=results['factor_multiplicities'],
        idiosyncratic_ar1=True, init_t0=True, obs_cov_diag=True,
        standardize=True)
    mod1.initialize_known(results['initial_state'],
                          results['initial_state_cov'])
    res1 = mod1.smooth(results['params'], cov_type='none')

    # Construct the updated model
    res2 = res1.apply(updated_M.iloc[:, :results['k_endog_M']],
                      endog_quarterly=updated_Q,
                      retain_standardization=True)

    # Compute the news
    news = res2.news(res1, impact_date='2016-09', comparison_type='previous')

    assert_allclose(news.revision_impacts.loc['2016-09', 'GDPC1'],
                    results['revision_impacts'])

    columns = ['forecast (prev)', 'observed', 'weight', 'impact']
    actual = news.details_by_impact.loc['2016-09', 'GDPC1'][columns]

    assert_allclose(actual.loc[('2016-06', 'CPIAUCSL')],
                    results['news_table'][0])
    assert_allclose(actual.loc[('2016-06', 'UNRATE')],
                    results['news_table'][1])
    assert_allclose(actual.loc[('2016-06', 'PAYEMS')],
                    results['news_table'][2])
    if mod1.k_endog_M == 6:
        i = 6
        assert_allclose(actual.loc[('2016-06', 'RSAFS')],
                        results['news_table'][3])
        assert_allclose(actual.loc[('2016-05', 'TTLCONS')],
                        results['news_table'][4])
        assert_allclose(actual.loc[('2016-06', 'TCU')],
                        results['news_table'][5])
    else:
        i = 3
    assert_allclose(actual.loc[('2016-06', 'GDPC1')],
                    results['news_table'][i])


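# A hypothetical helper (a sketch, not statsmodels API) restating the
# identity behind the news-table columns checked above: each data release's
# impact on the target is its weight times the surprise, i.e.
# impact = weight * (observed - forecast (prev)).
def _news_impact_sketch(forecast_prev, observed, weight):
    return np.asarray(weight) * (np.asarray(observed)
                                 - np.asarray(forecast_prev))

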
def test_emstep_methods_nonmissing(matlab_results, k_factors, factor_orders,
                                   factor_multiplicities, idiosyncratic_ar1):
    # Test that in the case of non-missing data, our three EM step methods
    # for the observation equation (nonmissing, missing_direct, missing)
    # give identical results across a variety of parameterizations
    # Note that including quarterly series will always imply missing values,
    # so we have to only provide monthly series
    dta_M = matlab_results[0].iloc[:, :8]
    dta_M = (dta_M - dta_M.mean()) / dta_M.std()
    endog_M = dta_M.interpolate().bfill()

    # Remove the quarterly endog->factor maps
    if isinstance(k_factors, dict):
        if 'GDPC1' in k_factors:
            del k_factors['GDPC1']
        if 'ULCNFB' in k_factors:
            del k_factors['ULCNFB']

    # Construct the model
    mod = dynamic_factor_mq.DynamicFactorMQ(
        endog_M, factors=k_factors, factor_orders=factor_orders,
        factor_multiplicities=factor_multiplicities,
        idiosyncratic_ar1=idiosyncratic_ar1)
    mod.ssm.filter_univariate = True

    params0 = mod.start_params
    _, params1 = mod._em_iteration(params0, mstep_method='missing')
    _, params1_nonmissing = mod._em_iteration(params0,
                                              mstep_method='nonmissing')

    assert_allclose(params1_nonmissing, params1, atol=1e-13)

    # Now double-check the observation equation M step for identical H and
    # Lambda directly
    mod.update(params1)
    res = mod.ssm.smooth()

    a = res.smoothed_state.T[..., None]
    cov_a = res.smoothed_state_cov.transpose(2, 0, 1)
    Eaa = cov_a + np.matmul(a, a.transpose(0, 2, 1))

    Lambda, H = mod._em_maximization_obs_missing(res, Eaa, a, compute_H=True)
    Lambda_nonmissing, H_nonmissing = mod._em_maximization_obs_nonmissing(
        res, Eaa, a, compute_H=True)

    assert_allclose(Lambda_nonmissing, Lambda, atol=1e-13)
    assert_allclose(H_nonmissing, H, atol=1e-13)


def test_em_nonstationary(reset_randomstate):
    # Test that when the EM algorithm estimates non-stationary parameters,
    # it warns the user and switches to a diffuse initialization.
    ix = pd.period_range(start='2000', periods=20, freq='M')
    endog_M = pd.Series(np.arange(20), index=ix)
    endog_M.iloc[10:12] += [0.4, -0.2]  # add in a little noise
    ix = pd.period_range(start='2000', periods=5, freq='Q')
    endog_Q = pd.Series(np.arange(5), index=ix)

    mod = dynamic_factor_mq.DynamicFactorMQ(
        endog_M, endog_quarterly=endog_Q, idiosyncratic_ar1=False,
        standardize=False, factors=['global'])
    msg = ('Non-stationary parameters found at EM iteration 1, which is not'
           ' compatible with stationary initialization. Initialization was'
           r' switched to diffuse for the following: \["factor block:'
           r' \(\'global\',\)"\], and fitting was restarted.')
    with pytest.warns(UserWarning, match=msg):
        mod.fit(maxiter=2, em_initialization=False)


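# Usage note (a sketch, assuming the toy data constructed in the test
# above): the warning path exercised above applies with
# em_initialization=False, where a single stationary initialization is used.
# With em_initialization=True, the initialization is re-derived at each EM
# iteration, so non-stationary intermediate parameters should not trigger
# the diffuse-switch warning and restart:
#
#     mod.fit(maxiter=2, em_initialization=True)

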
def test_k_factor1_factor_order_6(reset_randomstate):
    # This tests that the model is correctly set up when the lag order of the
    # factor is longer than 5 and we have a single factor. This is important
    # because 5 lags are always present when there is quarterly data, but we
    # want to check that, for example, we haven't accidentally relied on
    # there being exactly 5 lags available.
    # Note: as of 2020/07/25, the FRBNY code does not seem to work for 6
    # lags, so we can't test against their code
    # Note: the case with only 100 nobs leads to issues with the EM
    # algorithm and a decrease in the log-likelihood; a run with nobs=10000
    # gives a better indication of the model finding the correct parameters.
    endog_M, endog_Q, _ = gen_k_factor1(nobs=100, idiosyncratic_var=0.0)

    # Construct and fit the model
    mod = dynamic_factor_mq.DynamicFactorMQ(
        endog_M, endog_quarterly=endog_Q, factor_orders=6,
        idiosyncratic_ar1=False, standardize=False)
    mod.fit()


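# Background sketch for the "5 lags" comments above and below (the helper
# name is ours; the weights are the standard monthly-to-quarterly
# aggregation pattern that also appears in the Z2 checks further down): a
# quarterly observation loads on the current and previous four months of
# the factor,
#   y^Q_t = f_t + 2 f_{t-1} + 3 f_{t-2} + 2 f_{t-3} + f_{t-4},
# which is why at least 5 lags of the factor are always kept in the state
# vector whenever quarterly data is present.
def _quarterly_aggregation_weights():
    return np.array([1., 2., 3., 2., 1.])

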
def test_two_blocks_factor_orders_6(reset_randomstate):
    # This tests that the model is correctly set up when the lag order of the
    # factor is longer than 5 and we have two blocks of factors, one block
    # with a single factor and one block with two factors.

    # For the results below, we use nobs=1000, since nobs=10000 takes a very
    # long time and a large amount of memory. As a result, the results below
    # are noisier than they could be, although they still provide pretty good
    # evidence that the model is performing as it should
    nobs = 1000
    idiosyncratic_ar1 = True
    k1 = 3
    k2 = 10
    endog1_M, endog1_Q, f1 = gen_k_factor1(
        nobs, k=k1, idiosyncratic_ar1=idiosyncratic_ar1)
    endog2_M, endog2_Q, f2 = gen_k_factor2(
        nobs, k=k2, idiosyncratic_ar1=idiosyncratic_ar1)
    endog_M = pd.concat([endog1_M, f2, endog2_M], axis=1)
    endog_Q = pd.concat([endog1_Q, endog2_Q], axis=1)

    factors = {f'yM{i + 1}_f1': ['a'] for i in range(k1)}
    factors.update({f'yQ{i + 1}_f1': ['a'] for i in range(k1)})
    factors.update({f'f{i + 1}': ['b'] for i in range(2)})
    factors.update({f'yM{i + 1}_f2': ['b'] for i in range(k2)})
    factors.update({f'yQ{i + 1}_f2': ['b'] for i in range(k2)})
    factor_multiplicities = {'b': 2}

    mod = dynamic_factor_mq.DynamicFactorMQ(
        endog_M, endog_quarterly=endog_Q, factors=factors,
        factor_multiplicities=factor_multiplicities, factor_orders=6,
        idiosyncratic_ar1=idiosyncratic_ar1, standardize=False)
    mod.fit()

    # For the 1-factor block:
    # From one run, the following fitted coefficients were estimated:
    #                          True  Fitted
    #   loading.a->yM1_f1      1.00   -0.86
    #   loading.a->yM2_f1      1.00   -0.85
    #   loading.a->yM3_f1      1.00   -0.86
    #   loading.a->yQ1_f1      1.00   -0.71
    #   loading.a->yQ2_f1      1.00   -0.47
    #   loading.a->yQ3_f1      1.00   -0.48
    #   L1.a->a                0.00    0.05
    #   L2.a->a                0.00    0.06
    #   L3.a->a                0.00    0.04
    #   L4.a->a                0.00   -0.03
    #   L5.a->a                0.00   -0.06
    #   L6.a->a                0.50    0.46
    #   fb(0).cov.chol[1,1]    1.00    1.63
    #   L1.eps_M.yM1_f1        0.70    0.65
    #   L1.eps_M.yM2_f1        0.70    0.67
    #   L1.eps_M.yM3_f1        0.70    0.68
    #   L1.eps_Q.yQ1_f1       0.70    0.76
    #   L1.eps_Q.yQ2_f1       0.70    0.59
    #   L1.eps_Q.yQ3_f1       0.70    0.62
    #   sigma2.yM1_f1          0.40    0.39
    #   sigma2.yM2_f1          0.40    0.41
    #   sigma2.yM3_f1          0.40    0.40
    #   sigma2.yQ1_f1          0.40    0.43
    #   sigma2.yQ2_f1          0.40    0.60
    #   sigma2.yQ3_f1          0.40    0.59
    # These are pretty good:
    # 1. When we normalize the factor by the first loading, the monthly
    #    variables then all have loading close to 1.0. However, the factor
    #    standard deviation is 0.86 * 1.63 = 1.4, which is higher than it
    #    should be (this is largely due to the existence of the quarterly
    #    variables, which only have 1/3 the number of observations and make
    #    the estimation more noisy)
    # 2. Similarly, the idiosyncratic AR(1) and error variance are pretty
    #    well estimated, although more so for monthly than quarterly
    #    variables
    # 3. The factor transition is pretty good, although there is some noise
    #    in the estimates of the (true zero) coefficients at lags 1-5

    # For the 2-factor block:
    # The identification for the VAR system means that it is harder to
    # visually check that the estimation procedure produced good estimates.
    # This is the invertible matrix that we'll use to transform the factors
    # and parameter matrices into the original form
    from scipy.linalg import block_diag
    M1 = np.kron(np.eye(6), mod['design', 3:5, :2])
    M2 = np.kron(np.eye(6), mod['design', 0:1, 12:13])
    M = block_diag(M1, M2)
    Mi = np.linalg.inv(M)

    # Get the estimated parameter matrices
    Z = mod['design', :, :18]
    A = mod['transition', :18, :18]
    R = mod['selection', :18, :3]
    Q = block_diag(mod['state_cov', :2, :2],
                   mod['state_cov', 12:13, 12:13])
    RQR = R @ Q @ R.T

    # Create the transformed matrices
    Z2 = Z @ Mi
    A2 = M @ A @ Mi
    Q2 = M @ RQR @ M.T

    # In this example, both endog_M and endog_Q are equal to the factors, so
    # we expect the loading matrix to look like the following, which can be
    # confirmed (up to some numerical precision) by printing Z2
    # (where the 1's, 2's, etc. are actually vectors of 1's and 2's, etc.):
    # [ 0 0 0 0 0 0 0 0 0 0 | 1 0 0 0 0 ]
    # [ 0 0 0 0 0 0 0 0 0 0 | 1 2 3 2 1 ]
    # [ I 0 0 0 0 0 0 0 0   | 0 0 0 0 0 ]
    # [ 1 1 0 0 0 0 0 0 0 0 | 0 0 0 0 0 ]
    # [ 1 1 2 2 3 3 2 2 1 1 | 0 0 0 0 0 ]
    print(Z2.round(2))

    # Confirm that for the first factor block this is approximately:
    # [ 0 0 0 0 0 0 0 0 0 0 0.5 -0.2 ]
    # [ 0 0 0 0 0 0 0 0 0 0 0.1  0.3 ]
    # and for the second factor block this is approximately:
    # [ 0 0 0 0 0 0.5 ]
    print(A2.round(2))

    # Confirm that this is approximately:
    # [ 1.5 0.2 ]
    # [ 0.2 0.5 ]
    # for the first factor block, and
    # [ 1.0 ]
    # for the second factor block (note: actually, this seems to be about
    # [0.3], underestimating this factor's error variance)
    print(Q2.round(2))


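# A minimal standalone sketch of the state rotation used above and in
# test_k_factor2_factor_order_6 below (illustrative helper, not statsmodels
# API): if the state vector is redefined as alpha2_t = M alpha_t for an
# invertible M, an observationally equivalent system is obtained with
#   Z2 = Z M^{-1},   A2 = M A M^{-1},   Var(M R eta_t) = M (R Q R') M'.
def _similarity_transform_sketch(Z, A, RQR, M):
    Mi = np.linalg.inv(M)
    # Rotated design, transition, and state-innovation covariance matrices
    return Z @ Mi, M @ A @ Mi, M @ RQR @ M.T

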
def test_k_factor2_factor_order_6(reset_randomstate):
    # This tests that the model is correctly set up when the lag order of the
    # factor is longer than 5 and we have two factors. This is important
    # because 5 lags are always present when there is quarterly data, but we
    # want to check that, for example, we haven't accidentally relied on
    # there being exactly 5 lags available.
    # Note: as of 2020/07/25, the FRBNY code does not seem to work for 6
    # lags, so we can't test against their code
    endog_M, endog_Q, factors = gen_k_factor2()

    # Add the factors in to endog_M, which will allow us to identify them,
    # since endog_M and endog_Q are all the same linear combination of the
    # factors
    endog_M_aug = pd.concat([factors, endog_M], axis=1)

    mod = dynamic_factor_mq.DynamicFactorMQ(
        endog_M_aug, endog_quarterly=endog_Q, factor_multiplicities=2,
        factor_orders=6, idiosyncratic_ar1=False, standardize=False)
    res = mod.fit()

    # The identification for the VAR system means that it is harder to
    # visually check that the estimation procedure produced good estimates.

    # This is the invertible matrix that we'll use to transform the factors
    # and parameter matrices into the original form
    M = np.kron(np.eye(6), mod['design', :2, :2])
    Mi = np.linalg.inv(M)

    # Get the estimated parameter matrices
    Z = mod['design', :, :12]
    A = mod['transition', :12, :12]
    R = mod['selection', :12, :2]
    Q = mod['state_cov', :2, :2]
    RQR = R @ Q @ R.T

    # Create the transformed matrices
    Z2 = Z @ Mi
    A2 = M @ A @ Mi
    Q2 = M @ RQR @ M.T

    # In this example, both endog_M and endog_Q are equal to the factors, so
    # we expect the loading matrix to look like the following, which can be
    # confirmed (up to some numerical precision) by printing Z2:
    # [ I  0   0   0   0 ]
    # [ I  2I  3I  2I  I ]
    print(Z2.round(2))
    desired = np.array([
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 2, 2, 3, 3, 2, 2, 1, 1, 0, 0],
        [1, 1, 2, 2, 3, 3, 2, 2, 1, 1, 0, 0]])
    assert_allclose(Z2, desired, atol=0.1)

    # Confirm that this is approximately:
    # [ 0 0 0 0 0 0 0 0 0 0 0.5 -0.2 ]
    # [ 0 0 0 0 0 0 0 0 0 0 0.1  0.3 ]
    print(A2.round(2))
    desired = np.array(
        [[0, 0, 0.02, 0, 0.01, -0.03, 0.01, 0.02, 0, -0.01, 0.5, -0.2],
         [0, 0, 0, 0.02, 0, -0.01, 0, 0, 0, 0.01, 0.1, 0.3],
         [1., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 1., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 1., 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 1., 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 1., 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 1., 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 1., 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 1., 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 1., 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 1., 0, 0]])
    assert_allclose(A2, desired, atol=1e-2)

    # Confirm that this is approximately:
    # [ 1.5 0.2 ]
    # [ 0.2 0.5 ]
    # in the top left corner, and then zeros elsewhere
    print(Q2.round(2))
    desired = np.array([[1.49, 0.21],
                        [0.21, 0.49]])
    assert_allclose(Q2[:2, :2], desired, atol=1e-2)
    assert_allclose(Q2[:2, 2:], 0, atol=1e-2)
    assert_allclose(Q2[2:, :2], 0, atol=1e-2)
    assert_allclose(Q2[2:, 2:], 0, atol=1e-2)

    # Finally, check that after the transformation, the factors are equal to
    # endog_M
    a = res.states.smoothed
    a2 = (M @ a.T.iloc[:12]).T
    assert_allclose(endog_M.values, a2.iloc[:, :2].values, atol=1e-10)


def test_smoothed_decomposition_dfm_mq():
    # Create the datasets
    index_M = pd.period_range(start='2000', periods=12, freq='M')
    index_Q = pd.period_range(start='2000', periods=4, freq='Q')

    dta_M = pd.DataFrame(np.zeros((12, 2)), index=index_M,
                         columns=['M0', 'M1'])
    dta_Q = pd.DataFrame(np.zeros((4, 2)), index=index_Q,
                         columns=['Q0', 'Q1'])
    # Add some noise so the variables aren't constants
    dta_M.iloc[0] = 1.
    dta_Q.iloc[1] = 1.
    # TODO: remove this once we have the intercept contributions figured out
    dta_M -= dta_M.mean()
    dta_Q -= dta_Q.mean()

    # Create the model instance
    mod = dynamic_factor_mq.DynamicFactorMQ(
        dta_M, endog_quarterly=dta_Q, factors=1, factor_orders=1,
        idiosyncratic_ar1=True)
    params = [
        0.1, -0.4, 0.2, 0.3,   # loadings
        0.95, 1.0,             # factor
        0.5, 0.55, 0.6, 0.65,  # idio ar(1)
        1.1, 1.2, 1.0, 0.9,    # idio variances
    ]
    res = mod.smooth(params)

    # Check smoothed state

    # Get the decomposition of the smoothed state
    cd, coi, csi, cp = res.get_smoothed_decomposition(
        decomposition_of='smoothed_state')

    # Sum across contributions (i.e. from observations at each time period
    # and from the initial state)
    css = ((cd + coi).sum(axis=1) + csi.sum(axis=1) + cp.sum(axis=1))
    css = css.unstack(level='state_to')[mod.state_names].values

    # Summing up all contributions should yield the actual smoothed state,
    # so the smoothed state vector is the desired result of this test
    ss = np.array(res.states.smoothed)
    assert_allclose(css, ss, atol=1e-12)

    # Check smoothed signal

    # Use the summed state contributions and multiply by the design matrix
    # to get the smoothed signal
    csf = (css.T * mod['design'][:, :, None]).sum(axis=1).T
    # Reverse the standardization
    csf = (csf.T * mod._endog_std.values[:, None]
           + mod._endog_mean.values[:, None]).T

    # Summing up all contributions should yield the smoothed prediction of
    # the observed variables
    s_sig = res.predict(information_set='smoothed', signal_only=True)
    sf = res.predict(information_set='smoothed', signal_only=False)
    assert_allclose(csf, sf, atol=1e-12)

    # Now check the smoothed signal against the sum computed from the
    # decomposed smoothed signal
    cd, coi, csi, cp = res.get_smoothed_decomposition(
        decomposition_of='smoothed_signal')

    # Sum across contributions (i.e. from observations and intercepts at
    # each time period and from the initial state) to get the smoothed
    # signal
    cs_sig = ((cd + coi).sum(axis=1) + csi.sum(axis=1) + cp.sum(axis=1))
    cs_sig = cs_sig.unstack(level='variable_to')[mod.endog_names].values
    assert_allclose(cs_sig, s_sig, atol=1e-12)

    # Add in the observation intercept to get the smoothed forecast
    csf = cs_sig + mod['obs_intercept'].T
    assert_allclose(csf, sf)
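

# A hypothetical helper summarizing the identity verified above (a sketch,
# not statsmodels API): summing the contributions of the data, observation
# intercepts, state intercepts, and the prior returned by
# get_smoothed_decomposition reconstructs the decomposed quantity exactly.
def _sum_smoothed_decomposition(res, decomposition_of, names):
    cd, coi, csi, cp = res.get_smoothed_decomposition(
        decomposition_of=decomposition_of)
    total = (cd + coi).sum(axis=1) + csi.sum(axis=1) + cp.sum(axis=1)
    # The index level holding the target dimension differs by decomposition
    level = ('state_to' if decomposition_of == 'smoothed_state'
             else 'variable_to')
    return total.unstack(level=level)[names].values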