def test_16(self, obs_lognorm_obscorr):
    """Joint likelihood over the sample equals per-row likelihoods."""
    loc = [-0.02, 1.95]
    scale = [1.2, 2.9]
    shape = np.array([0.5, 1.44]).reshape(2, 1)
    corr = 0.54
    R = np.array([[1., corr], [corr, 1.]])

    # Likelihood of all rows computed in a single call.
    like_all = leopy.Likelihood(obs_lognorm_obscorr, p_true='lognorm',
                                p_cond='norm')
    p_joint = like_all.p(loc, scale, shape_true=shape, R_true=R,
                         pool=self.pool)

    # Likelihood of every row evaluated one at a time.
    n_rows = obs_lognorm_obscorr.df.shape[0]
    p_single = np.zeros(n_rows)
    for row in range(n_rows):
        obs_row = leopy.Observation(
            obs_lognorm_obscorr.df.iloc[row:row + 1], 'test', verbosity=0)
        like_row = leopy.Likelihood(obs_row, p_true='lognorm', p_cond='norm')
        p_single[row] = like_row.p(loc, scale, shape_true=shape, R_true=R,
                                   pool=self.pool)

    assert np.all(np.isclose(p_joint.reshape(n_rows), p_single))
def obs_lognorm_obscorr():
    """Bivariate lognormal sample with row-wise correlated observational errors."""
    np.random.seed(19)
    dist = scipy.stats.lognorm
    n_data = 100
    rho = 0.5
    R = np.array([[1., rho], [rho, 1.]])
    loc_true = np.array([0., 2.])
    scale_true = np.array([1., 3.])
    shape_true = np.array([0.5, 1.5])

    # Draw correlated standard normals and map them onto lognormal marginals.
    gauss = scipy.stats.multivariate_normal.rvs(cov=R, size=n_data)
    y = dist.ppf(scipy.stats.norm.cdf(gauss), shape_true,
                 loc=loc_true, scale=scale_true)

    sigma_c = [0.1, 0.2]
    ey = np.zeros_like(y)
    ey[:, 0] = sigma_c[0]
    ey[:, 1] = sigma_c[1]

    # Each row gets its own error correlation and a matching noise draw.
    # (One rand() plus one rvs() per row — order matters for reproducibility.)
    rho_c = np.zeros(n_data)
    error_y = np.zeros_like(y)
    for row in range(n_data):
        rho_c[row] = 0.99 * 2 * (np.random.rand() - 0.5)
        R_c = np.array([[1., rho_c[row]], [rho_c[row], 1.]])
        cov_c = np.diag(sigma_c).dot(R_c.dot(np.diag(sigma_c)))
        error_y[row, :] = scipy.stats.multivariate_normal.rvs(cov=cov_c)
    y += error_y

    df = pd.DataFrame(
        np.array([y[:, 0], y[:, 1], ey[:, 0], ey[:, 1], rho_c]).T,
        columns=['v0', 'v1', 'e_v0', 'e_v1', 'r_v0_v1'])
    return leopy.Observation(df, 'test', verbosity=0)
def obs_lognorm_MAR():
    """Bivariate lognormal sample with values missing at random (MAR).

    Missingness of each column is driven by the observed value of the
    other column via a logistic model, i.e. the data are MAR.
    """
    np.random.seed(2)
    dist = scipy.stats.lognorm
    Ndata = 100
    rho = 0.5
    R = np.array([[1., rho], [rho, 1.]])
    loc_true = np.array([0., 2.])
    scale_true = np.array([1., 3.])
    shape_true = np.array([0.5, 1.5])
    x = scipy.stats.multivariate_normal.rvs(cov=R, size=Ndata)
    # Map correlated normals onto lognormal marginals.
    y = dist.ppf(scipy.stats.norm.cdf(x), shape_true,
                 loc=loc_true, scale=scale_true)
    ey = np.zeros_like(y)
    ey[:, 0] = 0.2
    ey[:, 1] = 0.1
    y[:, 0] += ey[:, 0] * np.random.randn(Ndata)
    y[:, 1] += ey[:, 1] * np.random.randn(Ndata)

    def logistic(x):
        return np.exp(x) / (np.exp(x) + 1.)

    m1 = scipy.stats.bernoulli.rvs(logistic(y[:, 0] - 2.)).astype(
        bool)  # for col 1
    m0 = scipy.stats.bernoulli.rvs(logistic(y[:, 1] - 5.)).astype(
        bool)  # for col 0
    # np.float() was removed in NumPy 1.24; use np.nan directly.
    y[m1, 1] = np.nan
    y[m0, 0] = np.nan
    df = pd.DataFrame(np.array([y[:, 0], y[:, 1], ey[:, 0], ey[:, 1]]).T,
                      columns=['v0', 'v1', 'e_v0', 'e_v1'])
    return leopy.Observation(df, 'test', verbosity=0)
def test_5(self):
    """Lognormal p_true without explicit p_cond reproduces reference values."""
    data = {'v0': [1, 2], 'e_v0': [0.1, 0.2],
            'v1': [3, 4], 'e_v1': [0.1, 0.1]}
    observation = leopy.Observation(pd.DataFrame(data), 'testdata',
                                    verbosity=0)
    likelihood = leopy.Likelihood(observation, p_true='lognorm', verbosity=-1)
    result = likelihood.p([0.5, 0.7], [1, 2], shape_true=[[1.4], [2.]],
                          pool=self.pool)
    expected = np.array([[0.0436189], [0.01067159]])
    assert np.all(np.isclose(result, expected))
def obs_norm_no_error():
    """Correlated bivariate normal sample with zero measurement errors."""
    np.random.seed(10)
    dist = scipy.stats.norm
    n_data = 100
    rho = 0.5
    corr = np.array([[1., rho], [rho, 1.]])
    loc_true = [-1., 2.]
    scale_true = [1., 3.]
    gauss = scipy.stats.multivariate_normal.rvs(cov=corr, size=n_data)
    y = dist.ppf(scipy.stats.norm.cdf(gauss), loc=loc_true, scale=scale_true)
    # Rescale and shift so the sample mean/std match loc/scale exactly.
    y *= scale_true / np.std(y, axis=0)
    y += loc_true - np.mean(y, axis=0)
    errors = np.zeros_like(y)
    df = pd.DataFrame(
        np.array([y[:, 0], y[:, 1], errors[:, 0], errors[:, 1]]).T,
        columns=['v0', 'v1', 'e_v0', 'e_v1'])
    return leopy.Observation(df, 'test', verbosity=0)
def test_1(self):
    """Uncorrelated normal likelihood factorizes into per-variable pdfs."""
    data = {'v0': [1, 2], 'e_v0': [0.1, 0.2],
            'v1': [3, 4], 'e_v1': [0.1, 0.1]}
    df = pd.DataFrame(data)
    obs = leopy.Observation(df, 'testdata', verbosity=0)
    like = leopy.Likelihood(obs, p_true='norm', verbosity=-1)
    stddev = [1, 2]
    mean = [0.5, 0.7]
    p = like.p(mean, stddev, pool=self.pool)
    # Reference: product of the two independent normal densities.
    ref0 = scipy.stats.norm.pdf(df['v0'] - mean[0], scale=stddev[0])
    ref1 = scipy.stats.norm.pdf(df['v1'] - mean[1], scale=stddev[1])
    assert np.all(np.isclose(p.T[0], ref0 * ref1))
def pdf(x, lgy):
    """Model pdf on a (x, log10 y) grid.

    Relies on loc_true, scale_true, shape_true and R from the enclosing
    scope; returns an array with the same shape as `x`.
    """
    orig_shape = x.shape
    x_flat = x.flatten()
    lgy_flat = lgy.flatten()
    obs = leopy.Observation({'v0': x_flat, 'v1': 10**lgy_flat}, 'true_pdf',
                            verbosity=-1)
    like = leopy.Likelihood(obs, p_true='lognorm', p_cond=None, verbosity=-1)
    p = like.p(loc_true, scale_true, shape_true=shape_true, R_true=R)
    # Jacobian of the y -> log10(y) change of variable: y * ln(10).
    p = p * 10**lgy_flat[:, None] * np.log(10.)
    return p.reshape(orig_shape)
def obs_lognorm_no_error():
    """Correlated bivariate lognormal sample with zero measurement errors."""
    np.random.seed(14)
    dist = scipy.stats.lognorm
    n_data = 100
    rho = 0.5
    corr = np.array([[1., rho], [rho, 1.]])
    loc_true = np.array([0., 2.])
    scale_true = np.array([1., 3.])
    shape_true = np.array([0.5, 1.5])
    gauss = scipy.stats.multivariate_normal.rvs(cov=corr, size=n_data)
    # Map correlated normals onto lognormal marginals via cdf -> ppf.
    y = dist.ppf(scipy.stats.norm.cdf(gauss), shape_true,
                 loc=loc_true, scale=scale_true)
    errors = np.zeros_like(y)
    df = pd.DataFrame(
        np.array([y[:, 0], y[:, 1], errors[:, 0], errors[:, 1]]).T,
        columns=['v0', 'v1', 'e_v0', 'e_v1'])
    return leopy.Observation(df, 'test', verbosity=0)
def test_18(self):
    """Conditional-independence identity across three mixed distributions."""
    data = {'v0': [0.5, 2.0, 1.7, 1.1], 'e_v0': [0.1, 0.2, 0.3, 0.15],
            'v1': [3, 4, 5.2, 2.2], 'e_v1': [0.1, 0.1, 0.15, 0.12],
            'v2': [-2, 3, 1.7, 1.], 'e_v2': [0.2, 0.1, 0.05, 0.15]}
    obs = leopy.Observation(data, 'test', verbosity=0)
    like = leopy.Likelihood(obs, p_true=['lognorm', 'gamma', 'norm'],
                            p_cond='norm')
    loc = [-0.02, 1.95, 1]
    scale = [0.7, 1.9, 2.5]
    shape = [[0.5], [2.03], []]
    # Marginal over variable 0 and pairwise/joint likelihoods.
    p_0 = like.p(loc, scale, shape_true=shape, vars=[0], pool=self.pool)
    p_01 = like.p(loc, scale, shape_true=shape, vars=[0, 1], pool=self.pool)
    p_02 = like.p(loc, scale, shape_true=shape, vars=[0, 2], pool=self.pool)
    p_012 = like.p(loc, scale, shape_true=shape, pool=self.pool)
    # p(1|0) * p(2|0) should equal p(1,2|0).
    assert np.all(np.isclose(p_01 / p_0 * p_02 / p_0, p_012 / p_0))
def test_3(self):
    """Correlated normal likelihood matches scipy's multivariate normal pdf."""
    frame = pd.DataFrame({'v0': [1., 2., -4.], 'e_v0': [0.1, 0.2, 0.3],
                          'v1': [3., 4., 1.], 'e_v1': [0.1, 0.1, 0.1]})
    obs = leopy.Observation(frame, 'testdata', verbosity=0)
    like = leopy.Likelihood(obs, p_true='norm', verbosity=-1)
    corr = np.array([[1, -0.3], [-0.3, 1]])
    sigma = [1, 2]
    mu = [0.5, 0.7]
    # Covariance implied by the correlation matrix and the std deviations.
    covariance = np.diag(sigma).dot(corr.dot(np.diag(sigma)))
    p = like.p(mu, sigma, R_true=corr, pool=self.pool)
    expected = scipy.stats.multivariate_normal.pdf(frame[['v0', 'v1']],
                                                   mean=mu, cov=covariance)
    assert np.all(np.isclose(p.T[0], expected))
def pdf(x, lgy):
    """Maximum-likelihood-fit pdf on a (x, log10 y) grid.

    Uses ML_result and ML_R from the enclosing scope; returns an array
    with the same shape as `x`.
    """
    orig_shape = x.shape
    x_flat = x.flatten()
    lgy_flat = lgy.flatten()
    obs = leopy.Observation({'v0': x_flat, 'v1': 10**lgy_flat}, 'true_pdf',
                            verbosity=-1)
    like = leopy.Likelihood(obs, p_true='lognorm', p_cond=None, verbosity=-1)
    p = like.p([0, 0], 10**ML_result[0:2], shape_true=10**ML_result[2:4],
               R_true=ML_R)
    # Include the Jacobian of the y -> log10(y) transformation.
    return (p * 10**lgy_flat[:, None] * np.log(10.)).reshape(orig_shape)
def obs_norm_cen():
    """Correlated normal sample with noise and random lower/upper censoring.

    Censored entries are replaced by NaN, flagged in the 'c_v*' columns,
    and their limits recorded in the 'l_v*' / 'u_v*' columns.
    """
    np.random.seed(16)
    dist = scipy.stats.norm
    Ndata = 200
    rho = 0.5
    R = np.array([[1., rho], [rho, 1.]])
    loc_true = np.array([0.5, 1.5])
    scale_true = np.array([1., 2.5])
    x = scipy.stats.multivariate_normal.rvs(cov=R, size=Ndata)
    y = dist.ppf(scipy.stats.norm.cdf(x), loc=loc_true, scale=scale_true)
    ey = np.zeros_like(y)
    ey[:, 0] = 0.2
    ey[:, 1] = 0.1
    y[:, 0] += ey[:, 0] * np.random.randn(Ndata)
    y[:, 1] += ey[:, 1] * np.random.randn(Ndata)
    # Per-row censoring thresholds, clipped to stay positive.
    lower_limit = 0.3 * np.random.randn(Ndata) + 0.3
    lower_limit[lower_limit < 0.05] = 0.05
    upper_limit = 0.6 * np.random.randn(Ndata) + 2.5
    upper_limit[upper_limit < 0.2] = 0.2
    cy = np.zeros_like(y).astype(bool)
    # np.infty was removed in NumPy 2.0; np.inf is the portable spelling.
    ly = -np.inf * np.ones_like(y)
    uy = np.inf * np.ones_like(y)
    for i in range(2):
        sel = y[:, i] < lower_limit
        y[sel, i] = float('NaN')
        cy[sel, i] = True
        ly[sel, i] = lower_limit[sel]
        sel = y[:, i] > upper_limit
        y[sel, i] = float('NaN')
        cy[sel, i] = True
        uy[sel, i] = upper_limit[sel]
    df = pd.DataFrame(np.array([
        y[:, 0], y[:, 1], ey[:, 0], ey[:, 1], cy[:, 0], cy[:, 1],
        ly[:, 0], ly[:, 1], uy[:, 0], uy[:, 1]
    ]).T,
                      columns=[
                          'v0', 'v1', 'e_v0', 'e_v1', 'c_v0', 'c_v1',
                          'l_v0', 'l_v1', 'u_v0', 'u_v1'
                      ])
    return leopy.Observation(df, 'test', verbosity=0)
def test_7(self):
    """Lognormal p_true with normal p_cond reproduces reference values."""
    data = {'v0': [1, 2], 'e_v0': [0.1, 0.2],
            'v1': [3, 4], 'e_v1': [0.1, 0.1]}
    obs = leopy.Observation(pd.DataFrame(data), 'testdata', verbosity=0)
    like = leopy.Likelihood(obs, p_true='lognorm', p_cond='norm',
                            verbosity=-1)
    result = like.p([0.5, 0.7], [1, 2], shape_true=[[1.4], [2.]],
                    pool=self.pool)
    expected = np.array([[0.04415356], [0.01089342]])
    assert np.all(np.isclose(result, expected, rtol=1e-5, atol=1e-5))
def test_8(self):
    """Tiny observational errors recover the correlated lognormal density."""
    frame = pd.DataFrame({'v0': [1., 2., 0.8], 'e_v0': [1e-6, 1e-6, 1e-6],
                          'v1': [3., 4., 1.], 'e_v1': [1e-6, 1e-6, 1e-6]})
    obs = leopy.Observation(frame, 'testdata', verbosity=0)
    like = leopy.Likelihood(obs, p_true='lognorm', p_cond='norm',
                            verbosity=-1)
    corr = np.array([[1, -0.3], [-0.3, 1]])
    scale = [1, 2]
    loc = [0.5, 0.]
    shape = [[1], [1.5]]
    result = like.p(loc, scale, shape_true=shape, R_true=corr,
                    pool=self.pool)
    expected = np.array([[0.05819145], [0.01415945], [0.12375991]])
    assert np.all(np.isclose(result, expected, rtol=1e-5, atol=1e-5))
def __init__(self, obs, p_true='norm', p_cond=None, verbosity=0, **kwargs):
    r"""Initialize self.

    Parameters
    ----------
    obs : object or int
        Instance of class `Observation` containing the observed data or
        the number of observables per data point.
    p_true : object or str or None
        Instance of class `scipy.stats.rv_continuous` describing the
        probability distribution of the true values of a given
        observable. The member functions _pdf(), _cdf(), and _ppf() are
        used during the likelihood calculation. If `p_true` is a string,
        it assumes it is a function of the same name defined in
        scipy.stats. `p_true` can also be a list or tuple of instances
        of class `scipy.stats.rv_continuous`, one per given observable.
        If set to None, the probability density is assumed to be a delta
        function (the default is 'norm').
    p_cond : object or str or None
        Same as `p_true` but describing the conditional probability
        distribution of the observed values given the true values of a
        given observable. If set to None, the conditional probability
        density is assumed to be a delta function, i.e., p_obs = p_true
        (the default is None).
    verbosity : int
        Level of verbosity (default is 0, i.e., no additional output)
    **kwargs : type
        `**kwargs` is passed to an instance of `Convolution`.

    Returns
    -------
    Likelihood
        Instance of class `Likelihood`.

    Examples
    --------
    >>> import pandas as pd
    >>> from leopy import Observation, Likelihood
    >>> d = {'v0': [1, 2], 'e_v0': [0.1, 0.2],
    ...      'v1': [3, 4], 'e_v1': [0.1, 0.1]}
    >>> obs = Observation(pd.DataFrame(d), 'testdata')
    Reading dataset 'testdata' and extracting 2 variables (['v0', 'v1'])
    Errors of different observables are assumed to be uncorrelated
    >>> l = Likelihood(obs, p_true='lognorm', p_cond='norm')
    >>> l.p([0.5, 0.7], [1, 2], shape_true=[[1.4], [2.]])
    array([[0.04415447],
           [0.01089338]])
    """
    # `obs` may be an Observation or a bare number of observables; in the
    # latter case attach an empty placeholder Observation.
    try:
        num_var = obs.num_var
        self.obs = obs
    except AttributeError:
        num_var = obs
        self.obs = leopy.Observation(np.zeros((0, num_var)), 'empty',
                                     verbosity=0)
    self.verbosity = verbosity

    # set self.p_true & self.p_cond -- normalize both inputs to a list of
    # one distribution (or None) per observable. Distribution names are
    # looked up with getattr() rather than eval() (safer, same result).
    for ip, p in enumerate([p_true, p_cond]):
        if isinstance(p, (list, tuple)):
            assert len(p) == num_var
            sp = []
            for _p in p:
                if isinstance(_p, str):
                    sp.append(getattr(scipy.stats, _p))
                else:
                    sp.append(_p)
        else:
            if isinstance(p, str):
                sp = [getattr(scipy.stats, p)] * num_var
            else:
                sp = [p] * num_var
        if ip == 0:
            self.p_true = sp
        elif ip == 1:
            self.p_cond = sp

    # set self.p_obs -- the effective distribution of the observed values:
    # p_true alone, p_cond alone, or their convolution.
    self.p_obs = []
    for var in range(num_var):
        if self.p_cond[var] is None:
            if self.p_true[var] is None:
                # Both delta functions: substitute a near-delta normal so
                # the likelihood machinery has a proper density to work with.
                self.p_true[var] = scipy.stats.norm(scale=1e-30)
                self.p_true[var].numargs = 0
                self.p_true[var].name = 'norm'
                print(
                    'p_cond and p_true are both set to delta functions - '
                    'adjusting p_true to be normal with scale of 1e-30.')
            self.p_obs.append(self.p_true[var])
        else:
            if self.p_true[var] is None:
                self.p_obs.append(self.p_cond[var])
            else:
                self.p_obs.append(
                    leopy.Convolution(self.p_cond[var], self.p_true[var],
                                      verbosity=verbosity, **kwargs))
# Overplot the true (scatter-free) relation for reference.
h, = plt.plot([min_x, max_x], m * np.array([min_x, max_x]) + n, '--k',
              lw=2, alpha=1., zorder=2)
hs.append(h)
ls.append('true data w/o intrinsic scatter')

## -- linear regression (Maximum likelihood with leopy)
import leopy

# Pack values and uncertainties into the column layout leopy expects.
df = pd.DataFrame(np.array([x, y, uncert_x, uncert_y]).T,
                  columns=['v0', 'v1', 'e_v0', 'e_v1'])
obs = leopy.Observation(df, 'test', verbosity=0)

## -- set up Likelihood and find maximum likelihood parameters
# x values are taken as exact (p_cond None); y carries normal errors.
like = leopy.Likelihood(obs, p_true='norm', p_cond=[None, 'norm'],
                        verbosity=-1)


def f_lnlike(p, pool):
    # NOTE(review): body appears truncated in this chunk; only the
    # diagnostic print of the current parameter vector is visible here.
    print(p)
    # p are the three parameters of the fit
    # the slope (p[0])
    # the intercept (p[1])
    # and the intrinsic scatter (p[2])
labelsize=fontsize - 6)  # tail of a tick-styling call started above
plt.minorticks_on()
plt.tight_layout()
plt.legend(hs, ls, loc='upper right', markerfirst=False,
           handletextpad=-0.1)
if savefig:
    plt.savefig('joint_probability_{}.pdf'.format(irun))

if optimize:
    obs = leopy.Observation(df, 'joint probability')
    ## -- set up Likelihood and find maximum likelihood parameters
    like = leopy.Likelihood(obs, p_true='lognorm', p_cond='norm',
                            verbosity=0)
    if correlated:

        def f_mlnlike(x, *args):
            # Guard against NaN trial parameters from the optimizer.
            if np.any(np.isnan(x)):
                return 1000.
            # NOTE(review): body continues beyond this chunk.
            df = args[0]
def test_17(self):
    """Marginal likelihoods match numerical integration of the joint pdf."""
    v0 = [0.5, 2.0, 1.7]
    ev0 = [0.1, 0.2, 0.3]
    v1 = [3, 4, 5.2]
    ev1 = [0.1, 0.1, 0.15]
    rv0v1 = [0.2, 0.8, -0.8]
    d = {'v0': v0, 'e_v0': ev0, 'v1': v1, 'e_v1': ev1, 'r_v0_v1': rv0v1}
    obs = leopy.Observation(d, 'test', verbosity=0)
    like = leopy.Likelihood(obs, p_true='lognorm', p_cond='norm')
    loc_true = [-0.02, 1.95]
    scale_true = [0.7, 1.9]
    shape_true = np.array([0.5, 2.03]).reshape(2, 1)
    rho = 0.0
    R = np.array([[1., rho], [rho, 1.]])
    # Marginal likelihoods of each variable from leopy directly.
    p_x = like.p(loc_true, scale_true, shape_true=shape_true, R_true=R,
                 vars=[0], pool=self.pool)
    p_y = like.p(loc_true, scale_true, shape_true=shape_true, R_true=R,
                 vars=[1], pool=self.pool)
    import scipy.integrate
    N = 2000
    # Integration grids clustered logarithmically around the loc parameters.
    xx = np.concatenate([
        -np.logspace(1, -5, N // 5) + loc_true[0], [loc_true[0]],
        np.logspace(-5, 4, N - N // 5 - 1) + loc_true[0]
    ])
    yy = np.concatenate([
        -np.logspace(1, -5, N // 5) + loc_true[1], [loc_true[1]],
        np.logspace(-5, 4, N - N // 5 - 1) + loc_true[1]
    ])
    # Integrate the joint likelihood over v1 to recover the v0 marginal.
    d_x = {
        'v0': np.outer(v0, np.ones(N)).flatten(),
        'e_v0': np.outer(ev0, np.ones(N)).flatten(),
        'v1': np.outer(np.ones(3), yy).flatten(),
        'e_v1': np.outer(ev1, np.ones(N)).flatten(),
        'r_v0_v1': np.outer(rv0v1, np.ones(N)).flatten()
    }
    obs_x = leopy.Observation(d_x, 'test', verbosity=0)
    like_x = leopy.Likelihood(obs_x, p_true='lognorm', p_cond='norm')
    res = like_x.p(loc_true, scale_true, shape_true=shape_true, R_true=R,
                   pool=self.pool)
    res = res.reshape(3, N)
    # scipy.integrate.trapz was removed in SciPy 1.14; trapezoid is the
    # supported name (available since SciPy 1.6).
    p_x_2 = scipy.integrate.trapezoid(res, yy)
    assert np.all(np.isclose(p_x.reshape(3), p_x_2, atol=1e-4))
    # Integrate the joint likelihood over v0 to recover the v1 marginal.
    d_y = {
        'v0': np.outer(np.ones(3), xx).flatten(),
        'e_v0': np.outer(ev0, np.ones(N)).flatten(),
        'v1': np.outer(v1, np.ones(N)).flatten(),
        'e_v1': np.outer(ev1, np.ones(N)).flatten(),
        'r_v0_v1': np.outer(rv0v1, np.ones(N)).flatten()
    }
    obs_y = leopy.Observation(d_y, 'test', verbosity=0)
    like_y = leopy.Likelihood(obs_y, p_true='lognorm', p_cond='norm')
    res = like_y.p(loc_true, scale_true, shape_true=shape_true, R_true=R,
                   pool=self.pool).reshape(3, N)
    p_y_2 = scipy.integrate.trapezoid(res, xx)
    assert np.all(np.isclose(p_y.reshape(3), p_y_2, atol=1e-4))
else:
    # Count detected vs. missing rows (no censored entries in this branch).
    Ndet = np.sum(np.logical_not(np.isnan(df[ly])))
    Ncen = 0
    Nmis = np.sum(np.isnan(df[ly]))
print('Data set: Ntotal = {}, Ndet = {}, Ncen = {}, Nmiss = {}'.format(
    df.shape[0], Ndet, Ncen, Nmis))

# downsampling for test purposes
if 0:
    np.random.seed(2)
    df = df.sample(frac=0.1)
    print('Downsampling: Ntotal = {}, Ndet = {}, Ncen = {}, Nmiss = {}'.format(
        df.shape[0], Ndet, Ncen, Nmis))

# -- Step 2. Prepare LEO-Py
obs = leopy.Observation(df, 'xGASS', variables=[lx, ly])
df = obs.df
# x: normal; y: zero-inflated gamma/lognormal mixture with normal errors.
like = leopy.Likelihood(obs,
                        p_true=['norm', leopy.stats.zi_gamma_lognorm],
                        p_cond=[None, 'norm'])


# -- Step 3. Prepare Maximum Likelihood analysis
def f_mlnlike(x, pool):
    """Return minus log likelihood (rescaled)."""
    # Guard against NaN trial parameters from the optimizer.
    if np.any(np.isnan(x)):
        return 1000.
    Nobs = df.shape[0]
    # NOTE(review): body continues beyond this chunk.
    t = df['v0'].to_numpy().reshape(Nobs, 1)
Likelihood Estimation of Observational data with Python

Copyright 2019 University of Zurich, Robert Feldmann

LEO-Py is free software: you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation, either version 3 of the License, or (at your option)
any later version.

LEO-Py is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along
with LEO-Py. If not, see <https://www.gnu.org/licenses/>.
"""
import leopy
from schwimmbad import MultiPool

# Worker pool used to parallelize the likelihood evaluation.
pool = MultiPool()

# Minimal example data set: two observables with per-point uncertainties.
d = {'v0': [1, 2], 'e_v0': [0.1, 0.2], 'v1': [3, 4], 'e_v1': [0.1, 0.1]}
obs = leopy.Observation(d, 'testdata')

# Gamma-distributed true values observed with normal errors.
like = leopy.Likelihood(obs, p_true='gamma', p_cond='norm')
print(like.p([0.5, 0.7], [1, 2], shape_true=[1.4, 2], pool=pool))

pool.close()