import numpy as np

from statsmodels.compat.python import lmap


def prob_mv_grid(bins, cdf, axis=-1):
    '''helper function for probability of a rectangle grid in a
    multivariate distribution

    how does this generalize to more than 2 variates ?

    bins : tuple
        tuple of bin edges, currently it is assumed that they broadcast
        correctly
    '''
    if not isinstance(bins, np.ndarray):
        bins = lmap(np.asarray, bins)
        n_dim = len(bins)
        bins_ = []
        # broadcast if bin edges are 1d
        if all(lmap(np.ndim, bins) == np.ones(n_dim)):
            for d in range(n_dim):
                sl = [None] * n_dim
                sl[d] = slice(None)
                # index with a tuple; indexing with a list is an error
                # in current numpy
                bins_.append(bins[d][tuple(sl)])
    else:  # assume it is already correctly broadcasted
        n_dim = bins.shape[0]
        bins_ = bins

    cdf_values = cdf(bins_)
    probs = cdf_values.copy()
    for d in range(n_dim):
        probs = np.diff(probs, axis=d)

    return probs
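# Usage sketch (an assumption, not from the original source): the `cdf`
# callable must accept the list of broadcast bin-edge arrays and return the
# joint CDF evaluated on the implied grid.  `mvn_cdf` and `edges` below are
# hypothetical names; a bivariate standard normal from scipy is used for
# illustration.
from scipy import stats


def mvn_cdf(bins_):
    # evaluate the joint CDF on the full grid spanned by the edge arrays
    x, y = np.broadcast_arrays(*bins_)
    points = np.column_stack([x.ravel(), y.ravel()])
    rv = stats.multivariate_normal(mean=[0., 0.], cov=np.eye(2))
    return rv.cdf(points).reshape(x.shape)

edges = (np.array([-10., 0., 10.]), np.array([-10., 0., 10.]))
probs = prob_mv_grid(edges, mvn_cdf)  # each quadrant gets ~0.25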
def date_range_str(start, end=None, length=None):
    """
    Returns a list of abbreviated date strings.

    Parameters
    ----------
    start : str
        The first abbreviated date, for instance, '1965q1' or '1965m1'
    end : str, optional
        The last abbreviated date if length is None.
    length : int, optional
        The length of the returned array of dates if end is None.

    Returns
    -------
    date_range : list
        List of strings
    """
    flags = re.IGNORECASE | re.VERBOSE
    #_check_range_inputs(end, length, freq)
    start = start.lower()
    if re.search(_m_pattern, start, flags):
        annual_freq = 12
        split = 'm'
    elif re.search(_q_pattern, start, flags):
        annual_freq = 4
        split = 'q'
    elif re.search(_y_pattern, start, flags):
        annual_freq = 1
        start += 'a1'  # hack
        if end:
            end += 'a1'
        split = 'a'
    else:
        raise ValueError("Date %s not understood" % start)
    yr1, offset1 = lmap(int, start.replace(":", "").split(split))
    if end is not None:
        end = end.lower()
        yr2, offset2 = lmap(int, end.replace(":", "").split(split))
        length = (yr2 - yr1) * annual_freq + offset2
    elif length:
        yr2 = yr1 + length // annual_freq
        offset2 = length % annual_freq + (offset1 - 1)
    years = [str(yr) for yr in np.repeat(lrange(yr1 + 1, yr2), annual_freq)]
    # tack on first year
    years = [(str(yr1))] * (annual_freq + 1 - offset1) + years
    # tack on last year
    years = years + [(str(yr2))] * offset2
    if split != 'a':
        offset = np.tile(np.arange(1, annual_freq + 1),
                         yr2 - yr1 - 1).astype("a2")
        offset = np.r_[np.arange(offset1, annual_freq + 1).astype('a2'),
                       offset]
        offset = np.r_[offset, np.arange(1, offset2 + 1).astype('a2')]
        date_arr_range = [''.join([i, split, asstr(j)])
                          for i, j in zip(years, offset)]
    else:
        date_arr_range = years  # already a plain list, no .tolist() needed
    return date_arr_range
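# Quick check of the expected output (hypothetical calls, assuming the
# module's _m_pattern/_q_pattern/_y_pattern regexes and the lmap/lrange
# helpers are in scope as in statsmodels.tsa.base.datetools):
# date_range_str('1965q1', length=4)
#   -> ['1965q1', '1965q2', '1965q3', '1965q4']
# date_range_str('1965m11', end='1966m2')
#   -> ['1965m11', '1965m12', '1966m1', '1966m2']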
def handle_missing(cls, endog, exog, missing, **kwargs):
    """
    This returns a dictionary with keys endog, exog and the keys of
    kwargs. It preserves Nones.
    """
    none_array_names = []

    if exog is not None:
        combined = (endog, exog)
        combined_names = ['endog', 'exog']
    else:
        combined = (endog,)
        combined_names = ['endog']
        none_array_names += ['exog']

    # deal with other arrays
    combined_2d = ()
    combined_2d_names = []
    if len(kwargs):
        for key, value_array in iteritems(kwargs):
            if value_array is None or value_array.ndim == 0:
                none_array_names += [key]
                continue
            # grab 1d arrays
            if value_array.ndim == 1:
                combined += (value_array,)
                combined_names += [key]
            elif value_array.squeeze().ndim == 1:
                combined += (value_array,)
                combined_names += [key]
            # grab 2d arrays that are _assumed_ to be symmetric
            elif value_array.ndim == 2:
                combined_2d += (value_array,)
                combined_2d_names += [key]
            else:
                raise ValueError("Arrays with more than 2 dimensions "
                                 "aren't yet handled")

    nan_mask = _nan_rows(*combined)
    if combined_2d:
        nan_mask = _nan_rows(*(nan_mask[:, None],) + combined_2d)

    if missing == 'raise' and np.any(nan_mask):
        raise MissingDataError("NaNs were encountered in the data")
    elif missing == 'drop':
        nan_mask = ~nan_mask
        drop_nans = lambda x: cls._drop_nans(x, nan_mask)
        drop_nans_2d = lambda x: cls._drop_nans_2d(x, nan_mask)
        combined = dict(zip(combined_names, lmap(drop_nans, combined)))
        if combined_2d:
            combined.update(dict(zip(combined_2d_names,
                                     lmap(drop_nans_2d, combined_2d))))
        if none_array_names:
            combined.update(dict(zip(none_array_names,
                                     [None] * len(none_array_names))))
        return combined, np.where(~nan_mask)[0].tolist()
    else:
        raise ValueError("missing option %s not understood" % missing)
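# Minimal sketch of the row-wise NaN masking this method relies on.  This
# assumes a helper with the same contract as _nan_rows in
# statsmodels.base.data: OR an isnan row-mask across all passed arrays.
import numpy as np


def _nan_rows_sketch(*arrs):
    # True for any row that has a NaN in any of the 1d/2d arrays
    mask = np.zeros(len(arrs[0]), dtype=bool)
    for arr in arrs:
        arr = np.asarray(arr, dtype=float)
        mask |= np.isnan(arr).reshape(len(arr), -1).any(axis=1)
    return mask

endog_demo = np.array([1., np.nan, 3.])
exog_demo = np.array([[1., 2.], [3., 4.], [np.nan, 6.]])
# rows 1 and 2 would be dropped under missing='drop'
# _nan_rows_sketch(endog_demo, exog_demo) -> array([False,  True,  True])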
def test_plot_month():
    dta = sm.datasets.elnino.load_pandas().data
    dta['YEAR'] = dta.YEAR.astype(int).apply(str)
    dta = dta.set_index('YEAR').T.unstack()
    dates = lmap(lambda x: pd.tseries.tools.parse_time_string('1 ' + ' '.join(x))[0],
                 dta.index.values)

    # test dates argument
    fig = month_plot(dta.values, dates=dates, ylabel='el nino')
    plt.close(fig)

    # test with a TimeSeries DatetimeIndex with no freq
    dta.index = pd.DatetimeIndex(dates)
    fig = month_plot(dta)
    plt.close(fig)

    # w freq
    dta.index = pd.DatetimeIndex(dates, freq='MS')
    fig = month_plot(dta)
    plt.close(fig)

    # test with a TimeSeries PeriodIndex
    dta.index = pd.PeriodIndex(dates, freq='M')
    fig = month_plot(dta)
    plt.close(fig)
def setup_class(cls):
    XLISTEXOG2 = 'aget aget2 educyr actlim totchr'.split()

    endog_name = 'docvis'
    exog_names = 'private medicaid'.split() + XLISTEXOG2 + ['const']
    instrument_names = 'income ssiratio'.split() + XLISTEXOG2 + ['const']

    endog = DATA[endog_name]
    exog = DATA[exog_names]
    instrument = DATA[instrument_names]

    asarray = lambda x: np.asarray(x, float)
    endog, exog, instrument = lmap(asarray, [endog, exog, instrument])

    cls.bse_tol = [5e-6, 5e-7]
    q_tol = [0.04, 0]
    # compare to Stata default options, iterative GMM
    # with const at end
    start = OLS(np.log(endog + 1), exog).fit().params
    nobs, k_instr = instrument.shape
    w0inv = np.dot(instrument.T, instrument) / nobs

    mod = gmm.NonlinearIVGMM(endog, exog, instrument,
                             moment_exponential_add)
    res0 = mod.fit(start, maxiter=0, inv_weights=w0inv,
                   optim_method='bfgs',
                   optim_args={'gtol': 1e-8, 'disp': 0},
                   wargs={'centered': False})
    cls.res1 = res0

    from .results_gmm_poisson import results_addonestep as results
    cls.res2 = results
def dataset(self, as_dict=False):
    """
    Returns a Python generator object for iterating over the dataset.

    Parameters
    ----------
    as_dict : bool, optional
        If as_dict is True, yield each row of observations as a dict.
        If False, yields each row of observations as a list.

    Returns
    -------
    Generator object for iterating over the dataset.  Yields each row of
    observations as a list by default.

    Notes
    -----
    If missing_values is True during instantiation of StataReader then
    observations with _StataMissingValue(s) are not filtered and should
    be handled by your application.
    """
    try:
        self._file.seek(self._data_location)
    except Exception:
        pass

    if as_dict:
        vars = lmap(str, self.variables())
        for i in range(len(self)):
            yield dict(zip(vars, self._next()))
    else:
        for i in range(self._header['nobs']):
            yield self._next()
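# Typical iteration pattern (hypothetical file name, assuming the StataReader
# class this method belongs to):
# reader = StataReader(open('example.dta', 'rb'))
# for row in reader.dataset(as_dict=True):
#     ...  # row maps variable name -> value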
def setup_class(cls):
    # compare to Stata default options, twostep GMM
    XLISTEXOG2 = 'aget aget2 educyr actlim totchr'.split()

    endog_name = 'docvis'
    exog_names = 'private medicaid'.split() + XLISTEXOG2 + ['const']
    instrument_names = 'income medicaid ssiratio'.split() + XLISTEXOG2 + ['const']

    endog = DATA[endog_name]
    exog = DATA[exog_names]
    instrument = DATA[instrument_names]

    asarray = lambda x: np.asarray(x, float)
    endog, exog, instrument = lmap(asarray, [endog, exog, instrument])

    # Need to add all data into exog
    endog_ = np.zeros(len(endog))
    exog_ = np.column_stack((endog, exog))

    cls.bse_tol = [5e-6, 5e-7]
    # compare to Stata default options, iterative GMM
    # with const at end
    start = OLS(endog, exog).fit().params
    nobs, k_instr = instrument.shape
    w0inv = np.dot(instrument.T, instrument) / nobs

    mod = gmm.NonlinearIVGMM(endog_, exog_, instrument,
                             moment_exponential_mult)
    res0 = mod.fit(start, maxiter=2, inv_weights=w0inv,
                   optim_method='bfgs',
                   optim_args={'gtol': 1e-8, 'disp': 0},
                   wargs={'centered': False},
                   has_optimal_weights=False)
    cls.res1 = res0

    from .results_gmm_poisson import results_multtwostep as results
    cls.res2 = results
def test_plot_quarter():
    dta = sm.datasets.macrodata.load_pandas().data
    dates = lmap('Q'.join, zip(dta.year.astype(int).apply(str),
                               dta.quarter.astype(int).apply(str)))

    # test dates argument
    quarter_plot(dta.unemp.values, dates)
    plt.close('all')

    # test with a DatetimeIndex with no freq
    parser = pd.datetools.parse_time_string
    dta.set_index(pd.DatetimeIndex((x[0] for x in map(parser, dates))),
                  inplace=True)
    quarter_plot(dta.unemp)
    plt.close('all')

    # w freq
    # see pandas #6631
    dta.index = pd.DatetimeIndex((x[0] for x in map(parser, dates)),
                                 freq='QS-Oct')
    quarter_plot(dta.unemp)
    plt.close('all')

    # w PeriodIndex
    dta.index = pd.PeriodIndex((x[0] for x in map(parser, dates)), freq='Q')
    quarter_plot(dta.unemp)
    plt.close('all')
def anova_oneway(y, x, seq=0):
    # new version to match NIST
    # no generalization or checking of arguments, tested only for 1d
    yrvs = y[:, np.newaxis]  #- min(y)
    # subtracting mean increases numerical accuracy for NIST test data sets
    xrvs = x[:, np.newaxis] - x.mean()  # for 1d  #- 1e12 trick for 'SmLs09.dat'

    meang, varg, xdevmeangr, countg = groupsstats_dummy(yrvs[:, :1],
                                                        xrvs[:, :1])  #, seq=0)
    # the following does not work as replacement
    #gcount, gmean, meanarr, withinvar, withinvararr = groupstatsbin(y, x)  #, seq=0)
    sswn = np.dot(xdevmeangr.T, xdevmeangr)
    ssbn = np.dot((meang - xrvs.mean())**2, countg.T)
    nobs = yrvs.shape[0]
    ncat = meang.shape[1]
    dfbn = ncat - 1
    dfwn = nobs - ncat
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)
    f = msb / msw
    prob = stats.f.sf(f, dfbn, dfwn)
    R2 = (ssbn / (sswn + ssbn))  # R-squared
    resstd = np.sqrt(msw)  # residual standard deviation
    #print(f, prob)

    def _fix2scalar(z):  # return number
        if np.shape(z) == (1, 1):
            return z[0, 0]
        else:
            return z
    f, prob, R2, resstd = lmap(_fix2scalar, (f, prob, R2, resstd))
    return f, prob, R2, resstd
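# Hypothetical usage sketch: anova_oneway expects 1d arrays, with x holding
# the group labels.  It depends on groupsstats_dummy from the sandbox, so the
# values here are only indicative of what a well-separated sample returns.
# y = np.array([1., 2., 3., 6., 7., 8.])
# x = np.array([0., 0., 0., 1., 1., 1.])
# f, prob, R2, resstd = anova_oneway(y, x)  # large f, small prob expected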
def bootstrap(distr, args=(), nobs=200, nrep=100, value=None, batch_size=None):
    '''Monte Carlo (or parametric bootstrap) p-values for gof

    currently hardcoded for A^2 only

    assumes vectorized fit_vec method,
    builds and analyses (nobs, nrep) sample in one step

    rename function to less generic

    this works also with nrep=1
    '''
    #signature similar to kstest ?
    #delegate to fn ?

    #rvs_kwds = {'size':(nobs, nrep)}
    #rvs_kwds.update(kwds)

    #it will be better to build a separate batch function that calls bootstrap
    #keep batch if value is true, but batch iterate from outside if stat is
    #returned
    if batch_size is not None:
        if value is None:
            raise ValueError('using batching requires a value')
        n_batch = int(np.ceil(nrep / float(batch_size)))
        count = 0
        for irep in range(n_batch):
            rvs = distr.rvs(args, **{'size': (batch_size, nobs)})
            params = distr.fit_vec(rvs, axis=1)
            params = lmap(lambda x: np.expand_dims(x, 1), params)
            cdfvals = np.sort(distr.cdf(rvs, params), axis=1)
            stat = asquare(cdfvals, axis=1)
            count += (stat >= value).sum()
        return count / float(n_batch * batch_size)
    else:
        #rvs = distr.rvs(args, **kwds)  #extension to distribution kwds ?
        rvs = distr.rvs(args, **{'size': (nrep, nobs)})
        params = distr.fit_vec(rvs, axis=1)
        params = lmap(lambda x: np.expand_dims(x, 1), params)
        cdfvals = np.sort(distr.cdf(rvs, params), axis=1)
        stat = asquare(cdfvals, axis=1)
        if value is None:  #return all bootstrap results
            stat_sorted = np.sort(stat)
            return stat_sorted
        else:  #calculate and return specific p-value
            return (stat >= value).mean()
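# The core idea, stripped to a self-contained sketch (not the distr/fit_vec
# API above, and mc_pvalue_sketch is a hypothetical name): simulate nrep
# samples under the null, compute the statistic on each, and report the
# fraction at least as extreme as the observed value.
import numpy as np


def mc_pvalue_sketch(stat_observed, rvs_func, stat_func, nrep=1000):
    stats_null = np.array([stat_func(rvs_func()) for _ in range(nrep)])
    return (stats_null >= stat_observed).mean()

rng = np.random.RandomState(0)
# |sqrt(n) * mean| of a standard normal sample; P(>= 2) is roughly 0.046
pval = mc_pvalue_sketch(2.0,
                        lambda: rng.standard_normal(200),
                        lambda s: abs(s.mean()) * np.sqrt(len(s)))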
def _col_size(self, k=None):
    """Calculate size of a data record."""
    if len(self._col_sizes) == 0:
        self._col_sizes = lmap(lambda x: self._calcsize(x),
                               self._header["typlist"])
    if k is None:
        return self._col_sizes
    else:
        return self._col_sizes[k]
def variables(self):
    """
    Returns a list of the dataset's StataVariables objects.
    """
    return lmap(_StataVariable, zip(lrange(self._header['nvar']),
                                    self._header['typlist'],
                                    self._header['varlist'],
                                    self._header['srtlist'],
                                    self._header['fmtlist'],
                                    self._header['lbllist'],
                                    self._header['vlblist']))
def test_panel_robust_cov():
    import statsmodels.datasets.grunfeld as gr
    from .results.results_panelrobust import results as res_stata

    dtapa = gr.data.load_pandas()
    # Stata example/data seems to miss last firm
    dtapa_endog = dtapa.endog[:200]
    dtapa_exog = dtapa.exog[:200]
    res = OLS(dtapa_endog,
              add_constant(dtapa_exog[['value', 'capital']],
                           prepend=False)).fit()

    # time indicator in range(max Ti)
    time = np.asarray(dtapa_exog[['year']])
    time -= time.min()
    time = np.squeeze(time).astype(int)

    # sw.cov_nw_panel requires bounds instead of index
    tidx = [(i * 20, 20 * (i + 1)) for i in range(10)]

    # firm index in range(n_firms)
    firm_names, firm_id = np.unique(np.asarray(dtapa_exog[['firm']], 'S20'),
                                    return_inverse=True)

    # panel newey west standard errors
    cov = sw.cov_nw_panel(res, 0, tidx, use_correction='hac')
    # dropping numpy 1.4 soon
    #np.testing.assert_allclose(cov, res_stata.cov_pnw0_stata, rtol=1e-6)
    assert_almost_equal(cov, res_stata.cov_pnw0_stata, decimal=4)

    cov = sw.cov_nw_panel(res, 1, tidx, use_correction='hac')
    #np.testing.assert_allclose(cov, res_stata.cov_pnw1_stata, rtol=1e-6)
    assert_almost_equal(cov, res_stata.cov_pnw1_stata, decimal=4)

    cov = sw.cov_nw_panel(res, 4, tidx)  # check default
    #np.testing.assert_allclose(cov, res_stata.cov_pnw4_stata, rtol=1e-6)
    assert_almost_equal(cov, res_stata.cov_pnw4_stata, decimal=4)

    # cluster robust standard errors
    cov_clu = sw.cov_cluster(res, firm_id)
    assert_almost_equal(cov_clu, res_stata.cov_clu_stata, decimal=4)

    # cluster robust standard errors, non-int groups
    cov_clu = sw.cov_cluster(res, lmap(str, firm_id))
    assert_almost_equal(cov_clu, res_stata.cov_clu_stata, decimal=4)

    # Driscoll and Kraay panel robust standard errors
    rcov = sw.cov_nw_groupsum(res, 0, time, use_correction=0)
    assert_almost_equal(rcov, res_stata.cov_dk0_stata, decimal=4)

    rcov = sw.cov_nw_groupsum(res, 1, time, use_correction=0)
    assert_almost_equal(rcov, res_stata.cov_dk1_stata, decimal=4)

    rcov = sw.cov_nw_groupsum(res, 4, time)  # check default
    assert_almost_equal(rcov, res_stata.cov_dk4_stata, decimal=4)
import numpy as np

from statsmodels.compat.python import lmap


def data2proddummy(x):
    '''creates product dummy variables from 2 columns of 2d array

    drops last dummy variable, but not from each category
    singular with simple dummy variable but not with constant

    quickly written, no safeguards
    '''
    #brute force, assumes x is 2d
    #replace with encoding if possible
    # np.unique flattens a 2d array, so build the unique rows explicitly
    groups = np.array(sorted(set(lmap(tuple, x.tolist()))))
    #includes singularity with additive factors
    return (x == groups[:, None, :]).all(-1).T.astype(int)[:, :-1]
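# Small worked example (values assumed, not from the original source): four
# distinct (col1, col2) combinations give one indicator column each, with the
# last combination dropped.
# x = np.array([[0, 0], [0, 1], [1, 0], [1, 1], [0, 0]])
# data2proddummy(x).shape -> (5, 3)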
def _next(self):
    typlist = self._header["typlist"]
    if self._has_string_data:
        data = [None] * self._header["nvar"]
        for i in range(len(data)):
            if isinstance(typlist[i], int):
                data[i] = self._null_terminate(self._file.read(typlist[i]),
                                               self._encoding)
            else:
                data[i] = self._unpack(typlist[i],
                                       self._file.read(self._col_size(i)))
        return data
    else:
        return lmap(lambda i: self._unpack(typlist[i],
                                           self._file.read(self._col_size(i))),
                    lrange(self._header["nvar"]))
def dates_from_str(dates):
    """
    Turns a sequence of date strings into a list of datetime.

    Parameters
    ----------
    dates : array_like
        A sequence of abbreviated dates as string. For instance,
        '1996m1' or '1996Q1'. The datetime dates are at the end of the
        period.

    Returns
    -------
    date_list : list
        A list of datetime types.
    """
    return lmap(date_parser, dates)
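# Example (assuming date_parser from statsmodels.tsa.base.datetools is in
# scope): the result is end-of-period datetimes,
# dates_from_str(['1996m1', '1996m2'])
#   -> e.g. [datetime(1996, 1, 31, 0, 0), datetime(1996, 2, 29, 0, 0)]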
def print_summary(self, stats, orientation='auto'):
    # TODO: need to specify a table formatting for the numbers, using default
    title = 'Summary Statistics'
    header = stats
    stubs = self.univariate['obs'][1]
    data = [[self.univariate[astat][2][col] for astat in stats]
            for col in range(len(self.univariate['obs'][2]))]

    if (orientation == 'varcols') or \
            (orientation == 'auto' and len(stubs) < len(header)):
        # swap rows and columns
        data = lmap(lambda *row: list(row), *data)
        header, stubs = stubs, header

    part_fmt = dict(data_fmts=["%#8.4g"] * (len(header) - 1))
    table = SimpleTable(data, header, stubs, title=title, txt_fmt=part_fmt)

    return table
def test_plot_quarter(close_figures):
    dta = sm.datasets.macrodata.load_pandas().data
    dates = lmap('Q'.join, zip(dta.year.astype(int).apply(str),
                               dta.quarter.astype(int).apply(str)))

    # test dates argument
    quarter_plot(dta.unemp.values, dates)

    # test with a DatetimeIndex with no freq
    dta.set_index(pd.to_datetime(dates), inplace=True)
    quarter_plot(dta.unemp)

    # w freq
    # see pandas #6631
    dta.index = pd.DatetimeIndex(pd.to_datetime(dates), freq='QS-Oct')
    quarter_plot(dta.unemp)

    # w PeriodIndex
    dta.index = pd.PeriodIndex(pd.to_datetime(dates), freq='Q')
    quarter_plot(dta.unemp)
def summary_return(tables, return_fmt='text'):
    # join table parts then print
    if return_fmt == 'text':
        # convert to string, drop last line
        strdrop = lambda x: str(x).rsplit('\n', 1)[0]
        return '\n'.join(lmap(strdrop, tables[:-1]) + [str(tables[-1])])
    elif return_fmt == 'tables':
        return tables
    elif return_fmt == 'csv':
        return '\n'.join(x.as_csv() for x in tables)
    elif return_fmt == 'latex':
        # TODO: insert \hline after updating SimpleTable
        table = copy.deepcopy(tables[0])
        for part in tables[1:]:
            table.extend(part)
        return table.as_latex_tabular()
    elif return_fmt == 'html':
        return "\n".join(table.as_html() for table in tables)
    else:
        raise ValueError('available output formats are text, csv, latex, html')
def _get_colwidths(self, output_format, **fmt_dict):
    """Return list, the calculated widths of each column."""
    output_format = get_output_format(output_format)
    fmt = self.output_formats[output_format].copy()
    fmt.update(fmt_dict)
    ncols = max(len(row) for row in self)
    request = fmt.get('colwidths')
    if request == 0:  # assume no extra space desired (e.g, CSV)
        return [0] * ncols
    elif request is None:  # assume no extra space desired (e.g, CSV)
        request = [0] * ncols
    elif isinstance(request, (int, long)):
        request = [request] * ncols
    elif len(request) < ncols:
        request = [request[i % len(request)] for i in range(ncols)]
    min_widths = []
    for col in zip(*self):
        maxwidth = max(len(c.format(0, output_format, **fmt)) for c in col)
        min_widths.append(maxwidth)
    result = lmap(max, min_widths, request)
    return result
import scikits.timeseries as ts

d1 = ts.Date(year=1700, freq='A')
#NOTE: have to have yearBegin offset for annual data until parser rewrite
#should this be up to the user, or should it be done in TSM init?
#NOTE: not anymore, it's end of year now
ts_dr = ts.date_array(start_date=d1, length=len(sunspots.endog))
pandas_dr = pandas.DateRange(start=d1.datetime,
                             periods=len(sunspots.endog), timeRule='A@DEC')
#pandas_dr = pandas_dr.shift(-1, pandas.datetools.yearBegin)

dates = np.arange(1700, 1700 + len(sunspots.endog))
dates = ts.date_array(dates, freq='A')
#sunspots = pandas.Series(sunspots.endog, index=dates)

#NOTE: pandas only does business days for dates it looks like
import datetime
dt_dates = np.asarray(lmap(datetime.datetime.fromordinal,
                           ts_dr.toordinal().astype(int)))
sunspots = pandas.Series(sunspots.endog, index=dt_dates)

#NOTE: pandas can't handle pre-1900 dates
mod = AR(sunspots, freq='A')
res = mod.fit(method='mle', maxlag=9)

# some data for an example in Box Jenkins
IBM = np.asarray([460, 457, 452, 459, 462, 459, 463, 479, 493, 490.])
w = np.diff(IBM)
theta = .5
def categorical(data, col=None, dictnames=False, drop=False):
    '''
    Returns a dummy matrix given an array of categorical variables.

    Parameters
    ----------
    data : array
        A structured array, recarray, or array.  This can be either a 1d
        vector of the categorical variable or a 2d array with the column
        specifying the categorical variable specified by the col argument.
    col : 'string', int, or None
        If data is a structured array or a recarray, `col` can be a string
        that is the name of the column that contains the variable.  For all
        arrays `col` can be an int that is the (zero-based) column index
        number.  `col` can only be None for a 1d array.  The default is None.
    dictnames : bool, optional
        If True, a dictionary mapping the column number to the categorical
        name is returned.  Used to have information about plain arrays.
    drop : bool
        Whether or not keep the categorical variable in the returned matrix.

    Returns
    -------
    dummy_matrix, [dictnames, optional]
        A matrix of dummy (indicator/binary) float variables for the
        categorical data.  If dictnames is True, then the dictionary is
        returned as well.

    Notes
    -----
    This returns a dummy variable for EVERY distinct variable.  If a
    structured or recarray is provided, the names for the new variables are
    the old variable name - underscore - category name.  So if the variable
    'vote' had answers as 'yes' or 'no' then the returned array would have
    two new variables -- 'vote_yes' and 'vote_no'.  There is currently no
    name checking.

    Examples
    --------
    >>> import numpy as np
    >>> import statsmodels.api as sm

    Univariate examples

    >>> import string
    >>> string_var = [string.lowercase[0:5], string.lowercase[5:10],
    ...               string.lowercase[10:15], string.lowercase[15:20],
    ...               string.lowercase[20:25]]
    >>> string_var *= 5
    >>> string_var = np.asarray(sorted(string_var))
    >>> design = sm.tools.categorical(string_var, drop=True)

    Or for a numerical categorical variable

    >>> instr = np.floor(np.arange(10,60, step=2)/10)
    >>> design = sm.tools.categorical(instr, drop=True)

    With a structured array

    >>> num = np.random.randn(25,2)
    >>> struct_ar = np.zeros((25,1),
    ...                      dtype=[('var1', 'f4'), ('var2', 'f4'),
    ...                             ('instrument', 'f4'), ('str_instr', 'a5')])
    >>> struct_ar['var1'] = num[:,0][:,None]
    >>> struct_ar['var2'] = num[:,1][:,None]
    >>> struct_ar['instrument'] = instr[:,None]
    >>> struct_ar['str_instr'] = string_var[:,None]
    >>> design = sm.tools.categorical(struct_ar, col='instrument', drop=True)

    Or

    >>> design2 = sm.tools.categorical(struct_ar, col='str_instr', drop=True)
    '''
    if isinstance(col, (list, tuple)):
        try:
            assert len(col) == 1
            col = col[0]
        except:
            raise ValueError("Can only convert one column at a time")

    # TODO: add a NameValidator function
    # catch recarrays and structured arrays
    if data.dtype.names or data.__class__ is np.recarray:
        if not col and np.squeeze(data).ndim > 1:
            raise IndexError("col is None and the input array is not 1d")
        if isinstance(col, int):
            col = data.dtype.names[col]
        if col is None and data.dtype.names and len(data.dtype.names) == 1:
            col = data.dtype.names[0]

        tmp_arr = np.unique(data[col])

        # if the cols are shape (#,) vs (#,1) need to add an axis and flip
        _swap = True
        if data[col].ndim == 1:
            tmp_arr = tmp_arr[:, None]
            _swap = False
        tmp_dummy = (tmp_arr == data[col]).astype(float)
        if _swap:
            tmp_dummy = np.squeeze(tmp_dummy).swapaxes(1, 0)

        if not tmp_arr.dtype.names:  # how do we get to this code path?
            tmp_arr = [asstr2(item) for item in np.squeeze(tmp_arr)]
        elif tmp_arr.dtype.names:
            tmp_arr = [asstr2(item) for item in np.squeeze(tmp_arr.tolist())]

        # prepend the varname and underscore, if col is numeric attribute
        # lookup is lost for recarrays...
        if col is None:
            try:
                col = data.dtype.names[0]
            except:
                col = 'var'
        # TODO: the above needs to be made robust because there could be many
        # var_yes, var_no variables for instance.
        tmp_arr = [col + '_' + item for item in tmp_arr]
        # TODO: test this for rec and structured arrays!!!

        if drop is True:
            if len(data.dtype) <= 1:
                if tmp_dummy.shape[0] < tmp_dummy.shape[1]:
                    tmp_dummy = np.squeeze(tmp_dummy).swapaxes(1, 0)
                dt = lzip(tmp_arr, [tmp_dummy.dtype.str] * len(tmp_arr))
                # preserve array type
                return np.array(lmap(tuple, tmp_dummy.tolist()),
                                dtype=dt).view(type(data))

            data = nprf.drop_fields(data, col, usemask=False,
                                    asrecarray=type(data) is np.recarray)
        data = nprf.append_fields(data, tmp_arr, data=tmp_dummy,
                                  usemask=False,
                                  asrecarray=type(data) is np.recarray)
        return data

    # handle ndarrays and catch array-like for an error
    elif data.__class__ is np.ndarray or not isinstance(data, np.ndarray):
        if not isinstance(data, np.ndarray):
            raise NotImplementedError("Array-like objects are not supported")

        if isinstance(col, int):
            offset = data.shape[1]  # need error catching here?
            tmp_arr = np.unique(data[:, col])
            tmp_dummy = (tmp_arr[:, np.newaxis] == data[:, col]).astype(float)
            tmp_dummy = tmp_dummy.swapaxes(1, 0)
            if drop is True:
                offset -= 1
                data = np.delete(data, col, axis=1).astype(float)
            data = np.column_stack((data, tmp_dummy))
            if dictnames is True:
                col_map = _make_dictnames(tmp_arr, offset)
                return data, col_map
            return data
        elif col is None and np.squeeze(data).ndim == 1:
            tmp_arr = np.unique(data)
            tmp_dummy = (tmp_arr[:, None] == data).astype(float)
            tmp_dummy = tmp_dummy.swapaxes(1, 0)
            if drop is True:
                if dictnames is True:
                    col_map = _make_dictnames(tmp_arr)
                    return tmp_dummy, col_map
                return tmp_dummy
            else:
                data = np.column_stack((data, tmp_dummy))
                if dictnames is True:
                    col_map = _make_dictnames(tmp_arr, offset=1)
                    return data, col_map
                return data
        else:
            raise IndexError("The index %s is not understood" % col)
def lstsq(a, b, cond=None, overwrite_a=0, overwrite_b=0):
    """Compute least-squares solution to equation :m:`a x = b`

    Compute a vector x such that the 2-norm :m:`|b - a x|` is minimised.

    Parameters
    ----------
    a : array, shape (M, N)
    b : array, shape (M,) or (M, K)
    cond : float
        Cutoff for 'small' singular values; used to determine effective
        rank of a. Singular values smaller than rcond*largest_singular_value
        are considered zero.
    overwrite_a : boolean
        Discard data in a (may enhance performance)
    overwrite_b : boolean
        Discard data in b (may enhance performance)

    Returns
    -------
    x : array, shape (N,) or (N, K) depending on shape of b
        Least-squares solution
    residues : array, shape () or (1,) or (K,)
        Sums of residues, squared 2-norm for each column in :m:`b - a x`
        If rank of matrix a is < N or > M this is an empty array.
        If b was 1-d, this is an (1,) shape array, otherwise the shape is
        (K,)
    rank : integer
        Effective rank of matrix a
    s : array, shape (min(M,N),)
        Singular values of a. The condition number of a is abs(s[0]/s[-1]).

    Raises LinAlgError if computation does not converge
    """
    a1, b1 = lmap(asarray_chkfinite, (a, b))
    if a1.ndim != 2:
        raise ValueError('expected matrix')
    m, n = a1.shape
    if b1.ndim == 2:
        nrhs = b1.shape[1]
    else:
        nrhs = 1
    if m != b1.shape[0]:
        raise ValueError('incompatible dimensions')
    gelss, = get_lapack_funcs(('gelss',), (a1, b1))
    if n > m:
        # need to extend b matrix as it will be filled with
        # a larger solution matrix
        b2 = zeros((n, nrhs), dtype=gelss.dtype)
        if b1.ndim == 2:
            b2[:m, :] = b1
        else:
            b2[:m, 0] = b1
        b1 = b2
    overwrite_a = overwrite_a or (a1 is not a and not hasattr(a, '__array__'))
    overwrite_b = overwrite_b or (b1 is not b and not hasattr(b, '__array__'))
    if gelss.module_name[:7] == 'flapack':
        lwork = calc_lwork.gelss(gelss.prefix, m, n, nrhs)[1]
        v, x, s, rank, info = gelss(a1, b1, cond=cond, lwork=lwork,
                                    overwrite_a=overwrite_a,
                                    overwrite_b=overwrite_b)
    else:
        raise NotImplementedError('calling gelss from %s'
                                  % gelss.module_name)
    if info > 0:
        raise LinAlgError("SVD did not converge in Linear Least Squares")
    if info < 0:
        raise ValueError('illegal value in %d-th argument of '
                         'internal gelss' % -info)
    resids = asarray([], dtype=x.dtype)
    if n < m:
        x1 = x[:n]
        if rank == n:
            resids = sum(x[n:]**2, axis=0)
        x = x1
    return x, resids, rank, s
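# Quick check (hypothetical data): solve an overdetermined system and compare
# with numpy's reference implementation.
# import numpy as np
# A = np.array([[1., 1.], [1., 2.], [1., 3.]])
# b = np.array([1., 2., 2.])
# x, resids, rank, s = lstsq(A, b)
# np.allclose(x, np.linalg.lstsq(A, b, rcond=None)[0])  # -> True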
                     prepend=True)  # for R comparison
                     #prepend=False)  # for Stata comparison

    Z = add_constant(griliches76_data[['expr', 'tenure', 'rns', 'smsa',
                                       'D_67', 'D_68', 'D_69', 'D_70',
                                       'D_71', 'D_73',
                                       'med', 'kww', 'age', 'mrt']])
    Y = griliches76_data['lw']

    return Y, X, Z


# use module global to load only once
yg_df, xg_df, zg_df = get_griliches76_data()

endog = np.asarray(yg_df, dtype=float)  # TODO: why is yg_df float32
exog, instrument = lmap(np.asarray, [xg_df, zg_df])

assert exog.dtype == np.float64
assert instrument.dtype == np.float64

# from R
#-----------------

varnames = np.array(["(Intercept)", "s", "iq", "expr", "tenure", "rns",
                     "smsa", "D_67", "D_68", "D_69", "D_70", "D_71",
                     "D_73"])
params = np.array([4.03350989, 0.17242531, -0.00909883, 0.04928949,
                   0.04221709, -0.10179345, 0.12611095, -0.05961711,
                   0.04867956, 0.15281763, 0.17443605, 0.09166597,
                   0.09323977])
def genfromdta(fname, missing_flt=-999., encoding=None, pandas=False,
               convert_dates=True):
    """
    Returns an ndarray or DataFrame from a Stata .dta file.

    Parameters
    ----------
    fname : str or filehandle
        Stata .dta file.
    missing_flt : numeric
        The numeric value to replace missing values with. Will be used for
        any numeric value.
    encoding : string, optional
        Used for Python 3 only. Encoding to use when reading the .dta file.
        Defaults to `locale.getpreferredencoding`
    pandas : bool
        Optionally return a DataFrame instead of an ndarray
    convert_dates : bool
        If convert_dates is True, then Stata formatted dates will be converted
        to datetime types according to the variable's format.
    """
    warnings.warn(
        "genfromdta is deprecated as of 0.10.0 and will be removed in a "
        "future version. Use pandas.read_stata instead.",
        FutureWarning)

    if isinstance(fname, string_types):
        fhd = StataReader(open(fname, 'rb'), missing_values=False,
                          encoding=encoding)
    elif not hasattr(fname, 'read'):
        raise TypeError("The input should be a string or a filehandle. "
                        "(got %s instead)" % type(fname))
    else:
        fhd = StataReader(fname, missing_values=False, encoding=encoding)
    #validate_names = np.lib._iotools.NameValidator(excludelist=excludelist,
    #                                               deletechars=deletechars,
    #                                               case_sensitive=case_sensitive)

    #TODO: This needs to handle the byteorder?
    header = fhd.file_headers()
    types = header['dtyplist']
    nobs = header['nobs']
    numvars = header['nvar']
    varnames = header['varlist']
    fmtlist = header['fmtlist']
    dataname = header['data_label']
    labels = header['vlblist']  # labels are thrown away unless DataArray
                                # type is used
    data = np.zeros((nobs, numvars))
    stata_dta = fhd.dataset()

    dt = np.dtype(lzip(varnames, types))
    data = np.zeros((nobs), dtype=dt)  # init final array

    for rownum, line in enumerate(stata_dta):
        # doesn't handle missing value objects, just casts
        # None will only work without missing value object.
        if None in line:
            for i, val in enumerate(line):
                #NOTE: This will only be scalar types because missing strings
                # are empty not None in Stata
                if val is None:
                    line[i] = missing_flt

        data[rownum] = tuple(line)

    if pandas:
        from pandas import DataFrame
        data = DataFrame.from_records(data)
        if convert_dates:
            cols = np.where(lmap(lambda x: x in _date_formats, fmtlist))[0]
            for col in cols:
                i = col
                col = data.columns[col]
                data[col] = data[col].apply(_stata_elapsed_date_to_datetime,
                                            args=(fmtlist[i],))
    elif convert_dates:
        #date_cols = np.where(map(lambda x: x in _date_formats,
        #                         fmtlist))[0]
        # make the dtype for the datetime types
        cols = np.where(lmap(lambda x: x in _date_formats, fmtlist))[0]
        dtype = data.dtype.descr
        dtype = [(sub_dtype[0], object) if i in cols else sub_dtype
                 for i, sub_dtype in enumerate(dtype)]
        data = data.astype(dtype)  # have to copy

        for col in cols:
            def convert(x):
                return _stata_elapsed_date_to_datetime(x, fmtlist[col])
            data[data.dtype.names[col]] = lmap(convert,
                                               data[data.dtype.names[col]])
    return data
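# As the deprecation warning says, the supported replacement is
# pandas.read_stata; a minimal equivalent of the pandas=True path
# (hypothetical file name):
# import pandas as pd
# df = pd.read_stata('example.dta')  # dates are converted automatically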
print(shannonentropy(Y))

p = [1e-5, 1e-4, .001, .01, .1, .15, .2, .25, .3, .35, .4, .45, .5]

plt.subplot(111)
plt.ylabel("Information")
plt.xlabel("Probability")
x = np.linspace(0, 1, 100001)
plt.plot(x, shannoninfo(x))
#plt.show()

plt.subplot(111)
plt.ylabel("Entropy")
plt.xlabel("Probability")
x = np.linspace(0, 1, 101)
plt.plot(x, lmap(shannonentropy, lzip(x, 1 - x)))
#plt.show()

# define a joint probability distribution
# from Golan (2008) table 3.3
w = np.array([[0, 0, 1. / 3],
              [1 / 9., 1 / 9., 1 / 9.],
              [1 / 18., 1 / 9., 1 / 6.]])

# table 3.4
px = w.sum(0)
py = w.sum(1)
H_X = shannonentropy(px)
H_Y = shannonentropy(py)
H_XY = shannonentropy(w)
H_XgivenY = condentropy(px, py, w)
H_YgivenX = condentropy(py, px, w)

# note that cross-entropy is not a distance measure as the following shows
D_YX = logbasechange(2, np.e) * stats.entropy(px, py)
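# sanity check, assuming shannonentropy defaults to base-2 logs as in the
# sandbox infotheo module: a fair coin carries one bit
# shannonentropy([0.5, 0.5]) -> 1.0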
def seasonal_decompose(x, model="additive", filt=None, freq=None):
    """
    Parameters
    ----------
    x : array-like
        Time series
    model : str {"additive", "multiplicative"}
        Type of seasonal component. Abbreviations are accepted.
    filt : array-like
        The filter coefficients for filtering out the seasonal component.
        The default is a symmetric moving average.
    freq : int, optional
        Frequency of the series. Must be used if x is not a pandas
        object with a timeseries index.

    Returns
    -------
    results : obj
        A object with seasonal, trend, and resid attributes.

    Notes
    -----
    This is a naive decomposition. More sophisticated methods should
    be preferred.

    The additive model is Y[t] = T[t] + S[t] + e[t]

    The multiplicative model is Y[t] = T[t] * S[t] * e[t]

    The seasonal component is first removed by applying a convolution
    filter to the data. The average of this smoothed series for each
    period is the returned seasonal component.

    See Also
    --------
    statsmodels.tsa.filters.convolution_filter
    """
    _pandas_wrapper, pfreq = _maybe_get_pandas_wrapper_freq(x)
    x = np.asanyarray(x).squeeze()
    nobs = len(x)

    if not np.all(np.isfinite(x)):
        raise ValueError("This function does not handle missing values")
    if model.startswith('m'):
        if np.any(x <= 0):
            raise ValueError("Multiplicative seasonality is not appropriate "
                             "for zero and negative values")

    if pfreq is not None:
        pfreq = freq_to_period(pfreq)
        if freq and pfreq != freq:
            raise ValueError("Inferred frequency of index and frequency "
                             "don't match. This function does not re-sample")
        else:
            freq = pfreq
    elif freq is None:
        raise ValueError("You must specify a freq or x must be a "
                         "pandas object with a timeseries index")

    if filt is None:
        if freq % 2 == 0:  # split weights at ends
            filt = np.array([.5] + [1] * (freq - 1) + [.5]) / freq
        else:
            filt = np.repeat(1. / freq, freq)

    trend = convolution_filter(x, filt)

    # nan pad for conformability - convolve doesn't do it
    if model.startswith('m'):
        detrended = x / trend
    else:
        detrended = x - trend

    period_averages = seasonal_mean(detrended, freq)

    if model.startswith('m'):
        period_averages /= np.mean(period_averages)
    else:
        period_averages -= np.mean(period_averages)

    seasonal = np.tile(period_averages, nobs // freq + 1)[:nobs]

    if model.startswith('m'):
        resid = x / seasonal / trend
    else:
        resid = detrended - seasonal

    results = lmap(_pandas_wrapper, [seasonal, trend, resid, x])
    return DecomposeResult(seasonal=results[0], trend=results[1],
                           resid=results[2], observed=results[3])
def seasonal_decompose(x, model="additive", filt=None, freq=None,
                       two_sided=True, extrapolate_trend=0):
    """
    Seasonal decomposition using moving averages

    Parameters
    ----------
    x : array-like
        Time series. If 2d, individual series are in columns.
    model : str {"additive", "multiplicative"}
        Type of seasonal component. Abbreviations are accepted.
    filt : array-like
        The filter coefficients for filtering out the seasonal component.
        The concrete moving average method used in filtering is determined
        by two_sided.
    freq : int, optional
        Frequency of the series. Must be used if x is not a pandas object.
        Overrides default periodicity of x if x is a pandas object with a
        timeseries index.
    two_sided : bool
        The moving average method used in filtering. If True (default), a
        centered moving average is computed using the filt. If False, the
        filter coefficients are for past values only.
    extrapolate_trend : int or 'freq', optional
        If set to > 0, the trend resulting from the convolution is linear
        least-squares extrapolated on both ends (or the single one if
        two_sided is False) considering this many (+1) closest points.
        If set to 'freq', use `freq` closest points. Setting this parameter
        results in no NaN values in trend or resid components.

    Returns
    -------
    results : obj
        A object with seasonal, trend, and resid attributes.

    Notes
    -----
    This is a naive decomposition. More sophisticated methods should
    be preferred.

    The additive model is Y[t] = T[t] + S[t] + e[t]

    The multiplicative model is Y[t] = T[t] * S[t] * e[t]

    The seasonal component is first removed by applying a convolution
    filter to the data. The average of this smoothed series for each
    period is the returned seasonal component.

    See Also
    --------
    statsmodels.tsa.filters.bk_filter.bkfilter
    statsmodels.tsa.filters.cf_filter.cffilter
    statsmodels.tsa.filters.hp_filter.hpfilter
    statsmodels.tsa.filters.convolution_filter
    """
    if freq is None:
        _pandas_wrapper, pfreq = _maybe_get_pandas_wrapper_freq(x)
    else:
        _pandas_wrapper = _maybe_get_pandas_wrapper(x)
        pfreq = None
    x = np.asanyarray(x).squeeze()
    nobs = len(x)

    if not np.all(np.isfinite(x)):
        raise ValueError("This function does not handle missing values")
    if model.startswith('m'):
        if np.any(x <= 0):
            raise ValueError("Multiplicative seasonality is not appropriate "
                             "for zero and negative values")

    if freq is None:
        if pfreq is not None:
            pfreq = freq_to_period(pfreq)
            freq = pfreq
        else:
            raise ValueError("You must specify a freq or x must be a "
                             "pandas object with a timeseries index with "
                             "a freq not set to None")

    if filt is None:
        if freq % 2 == 0:  # split weights at ends
            filt = np.array([.5] + [1] * (freq - 1) + [.5]) / freq
        else:
            filt = np.repeat(1. / freq, freq)

    nsides = int(two_sided) + 1
    trend = convolution_filter(x, filt, nsides)

    if extrapolate_trend == 'freq':
        extrapolate_trend = freq - 1
    if extrapolate_trend > 0:
        trend = _extrapolate_trend(trend, extrapolate_trend + 1)

    if model.startswith('m'):
        detrended = x / trend
    else:
        detrended = x - trend

    period_averages = seasonal_mean(detrended, freq)

    if model.startswith('m'):
        period_averages /= np.mean(period_averages, axis=0)
    else:
        period_averages -= np.mean(period_averages, axis=0)

    seasonal = np.tile(period_averages.T, nobs // freq + 1).T[:nobs]

    if model.startswith('m'):
        resid = x / seasonal / trend
    else:
        resid = detrended - seasonal

    results = lmap(_pandas_wrapper, [seasonal, trend, resid, x])
    return DecomposeResult(seasonal=results[0], trend=results[1],
                           resid=results[2], observed=results[3])
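# Usage sketch on a synthetic monthly series (freq=12).  The series below is
# assumed data for illustration; for the additive model, seasonal + trend +
# resid re-adds to the observed series (up to NaN padding at the trend ends
# unless extrapolate_trend is set).
import numpy as np

nobs_demo = 120
t = np.arange(nobs_demo)
x_demo = (10 + 0.05 * t + 2 * np.sin(2 * np.pi * t / 12)
          + np.random.RandomState(12345).standard_normal(nobs_demo) * 0.1)
# res = seasonal_decompose(x_demo, model='additive', freq=12)
# res.seasonal, res.trend, res.resid, res.observed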
# Seemingly Unrelated Regressions (SUR) Model

# This example uses the subset of the Grunfeld data in Greene's Econometric
# Analysis Chapter 14 (5th Edition)

grun_data = sm.datasets.grunfeld.load()

firms = ['General Motors', 'Chrysler', 'General Electric', 'Westinghouse',
         'US Steel']
#for Python 3 compatibility
firms = lmap(asbytes, firms)

grun_exog = grun_data.exog
grun_endog = grun_data.endog

# Right now SUR takes a list of arrays
# The list alternates between the LHS of an equation and the RHS of an
# equation
# This is very likely to change

grun_sys = []
for i in firms:
    index = grun_exog['firm'] == i
    grun_sys.append(grun_endog[index])
    exog = grun_exog[index][['value', 'capital']].view(float).reshape(-1, 2)
    exog = sm.add_constant(exog, prepend=True)
    grun_sys.append(exog)
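# The alternating endog/exog list then feeds the sandbox SUR estimator
# (assumes statsmodels.sandbox.sysreg.SUR as in the original example script):
# from statsmodels.sandbox.sysreg import SUR
# grun_mod = SUR(grun_sys)
# grun_res = grun_mod.fit()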
def anova_lm(*args, **kwargs):
    """
    Anova table for one or more fitted linear models.

    Parameters
    ----------
    args : fitted linear model results instance
        One or more fitted linear models
    scale : float
        Estimate of variance. If None, will be estimated from the largest
        model. Default is None.
    test : str {"F", "Chisq", "Cp"} or None
        Test statistics to provide. Default is "F".
    typ : str or int {"I","II","III"} or {1,2,3}
        The type of Anova test to perform. See notes.
    robust : {None, "hc0", "hc1", "hc2", "hc3"}
        Use heteroscedasticity-corrected coefficient covariance matrix.
        If robust covariance is desired, it is recommended to use `hc3`.

    Returns
    -------
    anova : DataFrame
        A DataFrame containing the Anova table.

    Notes
    -----
    Model statistics are given in the order of args. Models must have
    been fit using the formula api.

    See Also
    --------
    model_results.compare_f_test, model_results.compare_lm_test

    Examples
    --------
    >>> import statsmodels.api as sm
    >>> from statsmodels.formula.api import ols
    >>> moore = sm.datasets.get_rdataset("Moore", "carData", cache=True)  # load
    >>> data = moore.data
    >>> data = data.rename(columns={"partner.status":
    ...                             "partner_status"})  # make name pythonic
    >>> moore_lm = ols('conformity ~ C(fcategory, Sum)*C(partner_status, Sum)',
    ...                data=data).fit()
    >>> table = sm.stats.anova_lm(moore_lm, typ=2)  # Type 2 Anova DataFrame
    >>> print(table)
    """
    typ = kwargs.get('typ', 1)

    ### Farm Out Single model Anova Type I, II, III, and IV ###

    if len(args) == 1:
        model = args[0]
        return anova_single(model, **kwargs)

    if typ not in [1, "I"]:
        raise ValueError("Multiple models only supported for type I. "
                         "Got type %s" % str(typ))

    ### COMPUTE ANOVA TYPE I for multiple fitted models ###

    test = kwargs.get("test", "F")
    scale = kwargs.get("scale", None)
    n_models = len(args)
    pr_test = "Pr(>%s)" % test
    names = ['df_resid', 'ssr', 'df_diff', 'ss_diff', test, pr_test]
    table = DataFrame(np.zeros((n_models, 6)), columns=names)

    if not scale:  # assume biggest model is last
        scale = args[-1].scale

    table["ssr"] = lmap(getattr, args, ["ssr"] * n_models)
    table["df_resid"] = lmap(getattr, args, ["df_resid"] * n_models)
    table.loc[table.index[1:], "df_diff"] = -np.diff(table["df_resid"].values)
    table["ss_diff"] = -table["ssr"].diff()
    if test == "F":
        table["F"] = table["ss_diff"] / table["df_diff"] / scale
        table[pr_test] = stats.f.sf(table["F"], table["df_diff"],
                                    table["df_resid"])
        # for earlier scipy - stats.f.sf(np.nan, 10, 2) -> 0 not nan
        table[pr_test][table['F'].isnull()] = np.nan

    return table
def handle_missing(cls, endog, exog, missing, **kwargs):
    """
    This returns a dictionary with keys endog, exog and the keys of
    kwargs. It preserves Nones.
    """
    none_array_names = []

    # patsy's already dropped NaNs in y/X
    missing_idx = kwargs.pop('missing_idx', None)

    if missing_idx is not None:
        # y, X already handled by patsy. add back in later.
        combined = ()
        combined_names = []
        if exog is None:
            none_array_names += ['exog']
    elif exog is not None:
        combined = (endog, exog)
        combined_names = ['endog', 'exog']
    else:
        combined = (endog,)
        combined_names = ['endog']
        none_array_names += ['exog']

    # deal with other arrays
    combined_2d = ()
    combined_2d_names = []
    if len(kwargs):
        for key, value_array in iteritems(kwargs):
            if value_array is None or value_array.ndim == 0:
                none_array_names += [key]
                continue
            # grab 1d arrays
            if value_array.ndim == 1:
                combined += (np.asarray(value_array),)
                combined_names += [key]
            elif value_array.squeeze().ndim == 1:
                combined += (np.asarray(value_array),)
                combined_names += [key]
            # grab 2d arrays that are _assumed_ to be symmetric
            elif value_array.ndim == 2:
                combined_2d += (np.asarray(value_array),)
                combined_2d_names += [key]
            else:
                raise ValueError("Arrays with more than 2 dimensions "
                                 "aren't yet handled")

    if missing_idx is not None:
        nan_mask = missing_idx
        updated_row_mask = None
        if combined:  # there were extra arrays not handled by patsy
            combined_nans = _nan_rows(*combined)
            if combined_nans.shape[0] != nan_mask.shape[0]:
                raise ValueError("Shape mismatch between endog/exog "
                                 "and extra arrays given to model.")
            # for going back and updated endog/exog
            updated_row_mask = combined_nans[~nan_mask]
            nan_mask |= combined_nans  # for updating extra arrays only
        if combined_2d:
            combined_2d_nans = _nan_rows(combined_2d)
            if combined_2d_nans.shape[0] != nan_mask.shape[0]:
                raise ValueError("Shape mismatch between endog/exog "
                                 "and extra 2d arrays given to model.")
            if updated_row_mask is not None:
                updated_row_mask |= combined_2d_nans[~nan_mask]
            else:
                updated_row_mask = combined_2d_nans[~nan_mask]
            nan_mask |= combined_2d_nans
    else:
        nan_mask = _nan_rows(*combined)
        if combined_2d:
            nan_mask = _nan_rows(*(nan_mask[:, None],) + combined_2d)

    if not np.any(nan_mask):  # no missing don't do anything
        combined = dict(zip(combined_names, combined))
        if combined_2d:
            combined.update(dict(zip(combined_2d_names, combined_2d)))
        if none_array_names:
            combined.update(dict(zip(none_array_names,
                                     [None] * len(none_array_names))))
        if missing_idx is not None:
            combined.update({'endog': endog})
            if exog is not None:
                combined.update({'exog': exog})
        return combined, []

    elif missing == 'raise':
        raise MissingDataError("NaNs were encountered in the data")

    elif missing == 'drop':
        nan_mask = ~nan_mask
        drop_nans = lambda x: cls._drop_nans(x, nan_mask)
        drop_nans_2d = lambda x: cls._drop_nans_2d(x, nan_mask)
        combined = dict(zip(combined_names, lmap(drop_nans, combined)))

        if missing_idx is not None:
            if updated_row_mask is not None:
                updated_row_mask = ~updated_row_mask
                # update endog/exog with this new information
                endog = cls._drop_nans(endog, updated_row_mask)
                if exog is not None:
                    exog = cls._drop_nans(exog, updated_row_mask)
            combined.update({'endog': endog})
            if exog is not None:
                combined.update({'exog': exog})

        if combined_2d:
            combined.update(dict(zip(combined_2d_names,
                                     lmap(drop_nans_2d, combined_2d))))
        if none_array_names:
            combined.update(dict(zip(none_array_names,
                                     [None] * len(none_array_names))))

        return combined, np.where(~nan_mask)[0].tolist()
    else:
        raise ValueError("missing option %s not understood" % missing)
def categorical(data, col=None, dictnames=False, drop=False):
    '''
    Returns a dummy matrix given an array of categorical variables.

    Parameters
    ----------
    data : array
        A structured array, recarray, array, Series or DataFrame. This can be
        either a 1d vector of the categorical variable or a 2d array with the
        column specifying the categorical variable specified by the col
        argument.
    col : {str, int, None}
        If data is a DataFrame, col must be a column of data. If data is a
        Series, col must be either the name of the Series or None. If data is
        a structured array or a recarray, `col` can be a string that is the
        name of the column that contains the variable. For all other arrays
        `col` can be an int that is the (zero-based) column index number.
        `col` can only be None for a 1d array. The default is None.
    dictnames : bool, optional
        If True, a dictionary mapping the column number to the categorical
        name is returned. Used to have information about plain arrays.
    drop : bool
        Whether or not to keep the categorical variable in the returned
        matrix.

    Returns
    -------
    dummy_matrix, [dictnames, optional]
        A matrix of dummy (indicator/binary) float variables for the
        categorical data. If dictnames is True, then the dictionary is
        returned as well.

    Notes
    -----
    This returns a dummy variable for EVERY distinct variable. If a
    structured array or recarray is provided, the names for the new
    variables are the old variable name - underscore - category name. So if
    the variable 'vote' had answers of 'yes' or 'no' then the returned array
    would have two new variables -- 'vote_yes' and 'vote_no'. There is
    currently no name checking.

    Examples
    --------
    >>> import numpy as np
    >>> import statsmodels.api as sm

    Univariate examples

    >>> import string
    >>> string_var = [string.ascii_lowercase[0:5],
    ...               string.ascii_lowercase[5:10],
    ...               string.ascii_lowercase[10:15],
    ...               string.ascii_lowercase[15:20],
    ...               string.ascii_lowercase[20:25]]
    >>> string_var *= 5
    >>> string_var = np.asarray(sorted(string_var))
    >>> design = sm.tools.categorical(string_var, drop=True)

    Or for a numerical categorical variable

    >>> instr = np.floor(np.arange(10, 60, step=2) / 10)
    >>> design = sm.tools.categorical(instr, drop=True)

    With a structured array

    >>> num = np.random.randn(25, 2)
    >>> struct_ar = np.zeros((25, 1),
    ...                      dtype=[('var1', 'f4'), ('var2', 'f4'),
    ...                             ('instrument', 'f4'), ('str_instr', 'a5')])
    >>> struct_ar['var1'] = num[:, 0][:, None]
    >>> struct_ar['var2'] = num[:, 1][:, None]
    >>> struct_ar['instrument'] = instr[:, None]
    >>> struct_ar['str_instr'] = string_var[:, None]
    >>> design = sm.tools.categorical(struct_ar, col='instrument', drop=True)

    Or

    >>> design2 = sm.tools.categorical(struct_ar, col='str_instr', drop=True)
    '''
    # TODO: add a NameValidator function
    if isinstance(col, (list, tuple)):
        if len(col) == 1:
            col = col[0]
        else:
            raise ValueError("Can only convert one column at a time")
    if (not isinstance(data, (pd.DataFrame, pd.Series)) and
            not isinstance(col, (string_types, int)) and col is not None):
        raise TypeError('col must be a str, int or None')

    # Pull out a Series from a DataFrame if provided
    if isinstance(data, pd.DataFrame):
        if col is None:
            raise TypeError('col must be a str or int when using a DataFrame')
        elif col not in data:
            raise ValueError('Column \'{0}\' not found in data'.format(col))
        data = data[col]
        # Set col to None since we now have a Series
        col = None

    if isinstance(data, pd.Series):
        if col is not None and data.name != col:
            raise ValueError('data.name does not match col '
                             '\'{0}\''.format(col))
        data_cat = data.astype('category')
        dummies = pd.get_dummies(data_cat)
        col_map = {i: cat for i, cat in enumerate(data_cat.cat.categories)
                   if cat in dummies}
        if not drop:
            dummies.columns = list(dummies.columns)
            dummies = pd.concat([dummies, data], axis=1)
        if dictnames:
            return dummies, col_map
        return dummies
    # catch recarrays and structured arrays
    elif data.dtype.names or data.__class__ is np.recarray:
        if not col and np.squeeze(data).ndim > 1:
            raise IndexError("col is None and the input array is not 1d")
        if isinstance(col, (int, long)):
            col = data.dtype.names[col]
        if col is None and data.dtype.names and len(data.dtype.names) == 1:
            col = data.dtype.names[0]

        tmp_arr = np.unique(data[col])

        # if the cols are shape (#,) vs (#,1) need to add an axis and flip
        _swap = True
        if data[col].ndim == 1:
            tmp_arr = tmp_arr[:, None]
            _swap = False
        tmp_dummy = (tmp_arr == data[col]).astype(float)
        if _swap:
            tmp_dummy = np.squeeze(tmp_dummy).swapaxes(1, 0)

        if not tmp_arr.dtype.names:  # how do we get to this code path?
            tmp_arr = [asstr2(item) for item in np.squeeze(tmp_arr)]
        elif tmp_arr.dtype.names:
            tmp_arr = [asstr2(item) for item in np.squeeze(tmp_arr.tolist())]

        # prepend the varname and underscore; if col is numeric, attribute
        # lookup is lost for recarrays...
        if col is None:
            try:
                col = data.dtype.names[0]
            except (TypeError, IndexError):
                col = 'var'
        # TODO: the above needs to be made robust because there could be many
        # var_yes, var_no variables for instance.
        tmp_arr = [col + '_' + item for item in tmp_arr]
        # TODO: test this for rec and structured arrays!!!

        if drop is True:
            if len(data.dtype) <= 1:
                if tmp_dummy.shape[0] < tmp_dummy.shape[1]:
                    tmp_dummy = np.squeeze(tmp_dummy).swapaxes(1, 0)
                dt = lzip(tmp_arr, [tmp_dummy.dtype.str] * len(tmp_arr))
                # preserve array type
                return np.array(lmap(tuple, tmp_dummy.tolist()),
                                dtype=dt).view(type(data))

            data = nprf.drop_fields(data, col, usemask=False,
                                    asrecarray=type(data) is np.recarray)
        data = nprf.append_fields(data, tmp_arr, data=tmp_dummy,
                                  usemask=False,
                                  asrecarray=type(data) is np.recarray)
        return data

    # Catch array-like for an error
    elif not isinstance(data, np.ndarray):
        raise NotImplementedError("Array-like objects are not supported")
    else:
        if isinstance(col, (int, long)):
            offset = data.shape[1]  # need error catching here?
            tmp_arr = np.unique(data[:, col])
            tmp_dummy = (tmp_arr[:, np.newaxis] == data[:, col]).astype(float)
            tmp_dummy = tmp_dummy.swapaxes(1, 0)
            if drop is True:
                offset -= 1
                data = np.delete(data, col, axis=1).astype(float)
            data = np.column_stack((data, tmp_dummy))
            if dictnames is True:
                col_map = _make_dictnames(tmp_arr, offset)
                return data, col_map
            return data
        elif col is None and np.squeeze(data).ndim == 1:
            tmp_arr = np.unique(data)
            tmp_dummy = (tmp_arr[:, None] == data).astype(float)
            tmp_dummy = tmp_dummy.swapaxes(1, 0)
            if drop is True:
                if dictnames is True:
                    col_map = _make_dictnames(tmp_arr)
                    return tmp_dummy, col_map
                return tmp_dummy
            else:
                data = np.column_stack((data, tmp_dummy))
                if dictnames is True:
                    col_map = _make_dictnames(tmp_arr, offset=1)
                    return data, col_map
                return data
        else:
            raise IndexError("The index %s is not understood" % col)
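# A hedged sketch of the plain-ndarray path of `categorical` above: a 1d code
# vector expands to one indicator column per distinct level, mirroring the
# broadcast-compare-and-swap logic in the function body. Data is illustrative.
import numpy as np

codes = np.array([1.0, 1.0, 2.0, 3.0, 2.0])
levels = np.unique(codes)                                  # [1., 2., 3.]
dummies = (levels[:, None] == codes).astype(float).swapaxes(1, 0)
print(dummies)                          # 5x3 indicator matrix, one col/level
# With drop=False the original column is appended alongside the indicators:
print(np.column_stack((codes, dummies)))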
def lstsq(a, b, cond=None, overwrite_a=0, overwrite_b=0):
    """Compute the least-squares solution to the equation :m:`a x = b`.

    Compute a vector x such that the 2-norm :m:`|b - a x|` is minimised.

    Parameters
    ----------
    a : array, shape (M, N)
    b : array, shape (M,) or (M, K)
    cond : float
        Cutoff for 'small' singular values; used to determine the effective
        rank of a. Singular values smaller than
        rcond*largest_singular_value are considered zero.
    overwrite_a : boolean
        Discard data in a (may enhance performance)
    overwrite_b : boolean
        Discard data in b (may enhance performance)

    Returns
    -------
    x : array, shape (N,) or (N, K) depending on shape of b
        Least-squares solution
    residues : array, shape () or (1,) or (K,)
        Sums of residues, squared 2-norm for each column in :m:`b - a x`.
        If the rank of matrix a is < N or > M this is an empty array.
        If b was 1-d, this is a (1,) shape array; otherwise the shape is (K,).
    rank : integer
        Effective rank of matrix a
    s : array, shape (min(M,N),)
        Singular values of a. The condition number of a is abs(s[0]/s[-1]).

    Raises LinAlgError if the computation does not converge.
    """
    a1, b1 = lmap(asarray_chkfinite, (a, b))
    if a1.ndim != 2:
        raise ValueError('expected matrix')
    m, n = a1.shape
    if b1.ndim == 2:
        nrhs = b1.shape[1]
    else:
        nrhs = 1
    if m != b1.shape[0]:
        raise ValueError('incompatible dimensions')
    gelss, = get_lapack_funcs(('gelss',), (a1, b1))
    if n > m:
        # need to extend b matrix as it will be filled with
        # a larger solution matrix
        b2 = zeros((n, nrhs), dtype=gelss.dtype)
        if b1.ndim == 2:
            b2[:m, :] = b1
        else:
            b2[:m, 0] = b1
        b1 = b2
    overwrite_a = overwrite_a or (a1 is not a and not hasattr(a, '__array__'))
    overwrite_b = overwrite_b or (b1 is not b and not hasattr(b, '__array__'))
    if gelss.module_name[:7] == 'flapack':
        # get optimal work array
        work = gelss(a1, b1, lwork=-1)[4]
        lwork = int(work[0].real)
        v, x, s, rank, work, info = gelss(a1, b1, cond=cond, lwork=lwork,
                                          overwrite_a=overwrite_a,
                                          overwrite_b=overwrite_b)
    else:
        raise NotImplementedError('calling gelss from %s' %
                                  gelss.module_name)
    if info > 0:
        raise LinAlgError("SVD did not converge in Linear Least Squares")
    if info < 0:
        raise ValueError('illegal value in %d-th argument of '
                         'internal gelss' % -info)
    resids = asarray([], dtype=x.dtype)
    if n < m:
        x1 = x[:n]
        if rank == n:
            resids = sum(x[n:]**2, axis=0)
        x = x1
    return x, resids, rank, s
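# A quick, hedged check of `lstsq` above on an overdetermined system (M > N,
# full rank), identical in spirit to numpy.linalg.lstsq. Data is illustrative.
import numpy as np

A = np.array([[1.0, 1.0], [1.0, 2.0], [1.0, 3.0], [1.0, 4.0]])
b = np.array([6.0, 5.0, 7.0, 10.0])
x, resids, rank, s = lstsq(A, b)
print(x)       # fitted intercept and slope of the least-squares line
print(resids)  # squared 2-norm of the residual, since rank == N < M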
from statsmodels.compat.python import asbytes, lmap  # for Python 3 compatibility
import statsmodels.api as sm
from statsmodels.sandbox.sysreg import *

# Seemingly Unrelated Regressions (SUR) Model

# This example uses the subset of the Grunfeld data in Greene's Econometric
# Analysis Chapter 14 (5th Edition)
grun_data = sm.datasets.grunfeld.load()

firms = ['General Motors', 'Chrysler', 'General Electric', 'Westinghouse',
         'US Steel']
# for Python 3 compatibility
firms = lmap(asbytes, firms)

grun_exog = grun_data.exog
grun_endog = grun_data.endog

# Right now SUR takes a list of arrays. The list alternates between the LHS
# of an equation and the RHS of that equation. This is very likely to change.
grun_sys = []
for i in firms:
    index = grun_exog['firm'] == i
    grun_sys.append(grun_endog[index])
    exog = grun_exog[index][['value', 'capital']].view(float).reshape(-1, 2)
    exog = sm.add_constant(exog, prepend=True)
    grun_sys.append(exog)
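# A hedged sketch of what typically follows: fitting the system. `SUR` comes
# from the star import above; since the sandbox API is noted as likely to
# change, treat these calls as an assumption rather than a stable interface.
grun_mod = SUR(grun_sys)
grun_res = grun_mod.fit()
print(grun_res.params)  # stacked coefficient vector, equation by equation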
def fit_fr(self, data, *args, **kwds):
    '''estimate distribution parameters by MLE taking some parameters as fixed

    Parameters
    ----------
    data : array, 1d
        data for which the distribution parameters are estimated
    args : list ? check
        starting values for the optimization
    kwds :
        - 'frozen' : array_like
            values for frozen distribution parameters; for elements that are
            np.nan, the corresponding parameter will be estimated

    Returns
    -------
    argest : array
        estimated parameters

    Examples
    --------
    generate random sample

    >>> np.random.seed(12345)
    >>> x = stats.gamma.rvs(2.5, loc=0, scale=1.2, size=200)

    estimate all parameters

    >>> stats.gamma.fit(x)
    array([ 2.0243194 ,  0.20395655,  1.44411371])
    >>> stats.gamma.fit_fr(x, frozen=[np.nan, np.nan, np.nan])
    array([ 2.0243194 ,  0.20395655,  1.44411371])

    keep loc fixed, estimate shape and scale parameters

    >>> stats.gamma.fit_fr(x, frozen=[np.nan, 0.0, np.nan])
    array([ 2.45603985,  1.27333105])

    keep loc and scale fixed, estimate shape parameter

    >>> stats.gamma.fit_fr(x, frozen=[np.nan, 0.0, 1.0])
    array([ 3.00048828])
    >>> stats.gamma.fit_fr(x, frozen=[np.nan, 0.0, 1.2])
    array([ 2.57792969])

    estimate only scale parameter for fixed shape and loc

    >>> stats.gamma.fit_fr(x, frozen=[2.5, 0.0, np.nan])
    array([ 1.25087891])

    Notes
    -----
    self is an instance of a distribution class. This can be attached to
    scipy.stats.distributions.rv_continuous

    *Todo*

    * check if docstring is correct
    * more input checking, is args a list? might also apply to the current
      fit method
    '''
    loc0, scale0 = lmap(kwds.get, ['loc', 'scale'], [0.0, 1.0])
    Narg = len(args)
    if Narg == 0 and hasattr(self, '_fitstart'):
        x0 = self._fitstart(data)
    elif Narg > self.numargs:
        raise ValueError("Too many input arguments.")
    else:
        args += (1.0,) * (self.numargs - Narg)
        # location and scale are at the end
        x0 = args + (loc0, scale0)

    if 'frozen' in kwds:
        frmask = np.array(kwds['frozen'])
        if len(frmask) != self.numargs + 2:
            raise ValueError("Incorrect number of frozen arguments.")
        else:
            # keep starting values for parameters that are not frozen
            x0 = np.array(x0)[np.isnan(frmask)]
    else:
        frmask = None

    return optimize.fmin(self.nnlf_fr, x0,
                         args=(np.ravel(data), frmask), disp=0)
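# A hedged usage sketch: attaching `fit_fr` to scipy's continuous
# distributions, as the Notes section suggests. Monkey-patching the base class
# is an assumption of this sketch, not a documented scipy API, and the
# companion objective `nnlf_fr` (negative log-likelihood with frozen-parameter
# support) must be attached the same way before `fit_fr` can run.
import numpy as np
from scipy import stats, optimize

stats.distributions.rv_continuous.fit_fr = fit_fr

np.random.seed(12345)
x = stats.gamma.rvs(2.5, loc=0, scale=1.2, size=200)
# estimate shape and scale with loc frozen at 0, as in the docstring example
print(stats.gamma.fit_fr(x, frozen=[np.nan, 0.0, np.nan]))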