def test_multinmoial_goodness_of_fit():
    """Sanity-check the multinomial goodness-of-fit test itself.

    For several dimensionalities, draw counts from a known multinomial and
    confirm `mgof` accepts them (p above `thresh`), then score counts drawn
    from a mismatched uniform distribution and confirm `mgof` rejects them.
    """
    thresh = 1e-3
    n = int(1e5)
    for d in (3, 10, 20):
        for _ in range(5):
            # Random probability vector on the d-simplex.
            probs = np.random.dirichlet([1] * d)
            matched_counts = np.random.multinomial(n, probs)
            assert_greater(mgof(probs, matched_counts, n), thresh)
            # Counts from the uniform distribution should not fit `probs`.
            uniform_counts = np.random.multinomial(n, [1. / d] * d)
            assert_less(mgof(probs, uniform_counts, n), thresh)
def check_dpm(impl, data_count, beta0):
    """Goodness-of-fit check for a Dirichlet-process-mixture component model.

    Builds sufficient statistics from random integer data, samples from the
    model's predictive, and runs a truncated multinomial goodness-of-fit test
    comparing sample frequencies against predictive probabilities.

    Parameters:
        impl: component-model implementation passed to `check_cm`/`ComponentModel`.
        data_count: number of random observations used to build the counts.
        beta0: hyperparameter mass reserved for unseen values; the remaining
            (1 - beta0) is split evenly across observed bins.
    """
    check_cm(impl)
    raw_counts = histogram(np.random.randint(50, size=data_count))
    data = {str(i): obs for i, obs in enumerate(raw_counts)}
    # Split the remaining (1 - beta0) probability mass evenly over the bins.
    betas = {key: (1 - beta0) / len(data) for key in data}
    hp = {'gamma': 1., 'alpha': 1., 'beta0': beta0, 'betas': betas}
    ss = {'counts': data}
    cm = ComponentModel(impl, ss=ss, hp=hp)
    samples = cm.sample_data(SAMPS)
    # A sample of -1 encodes "previously unseen value"; it is scored as its
    # own bin, separate from the observed values 0..max.
    counts = list(histogram([y for y in samples if y != -1]))
    probs = list(np.exp([cm.pred_prob(x) for x in range(max(samples) + 1)]))
    counts.append(len([y for y in samples if y == -1]))
    probs.append(np.exp(cm.pred_prob(-1)))
    # The enumerated bins should account for essentially all the mass.
    assert_less(1 - sum(probs), THRESH)
    # Keep only the TOPN most probable bins for the truncated test.
    probs, counts = zip(*sorted(zip(probs, counts), reverse=True)[:TOPN])
    p = mgof(probs, counts, SAMPS, truncated=True)
    assert_greater(p, THRESH)
def _check_discrete(cm):
    """Draw samples from `cm` and run a truncated multinomial g.o.f. test.

    Sample frequencies are compared against the model's predictive
    probabilities over the observed support, restricted to the TOPN most
    probable bins.
    """
    samples = cm.sample_data(SAMPS)
    counts = histogram(samples)
    probs = np.exp([cm.pred_prob(value) for value in range(max(samples) + 1)])
    # The enumerated support should capture essentially all the mass.
    assert_less(1 - sum(probs), THRESH)
    top_bins = sorted(zip(probs, counts), reverse=True)[:TOPN]
    probs, counts = zip(*top_bins)
    assert_greater(mgof(probs, counts, SAMPS, truncated=True), THRESH)
def check_nich(impl, data_count, mean, std):
    """Goodness-of-fit check for a normal (NICH) component model.

    Optionally conditions the model on sufficient statistics from random
    normal data, samples from the predictive, bins the samples, scores each
    bin by numerically integrating the predictive density, and runs a
    truncated multinomial goodness-of-fit test.

    Parameters:
        impl: component-model implementation passed to `check_cm`/`ComponentModel`.
        data_count: number of observations to condition on; 0/falsy means none.
        mean, std: parameters of the normal data-generating distribution.
    """
    check_cm(impl)
    ss = None
    if data_count:
        data = np.random.normal(mean, std, size=data_count)
        ss = {'count': data_count, 'mean': data.mean(), 'variance': data.var()}
    cm = ComponentModel(impl, ss=ss)
    samples = cm.sample_data(SAMPS)
    counts, bin_ranges = bin_samples(samples)

    # Use of quadrature is unfortunate but for now it's the easiest way to
    # score bins and seems to work.  (Plain `def` rather than a lambda
    # assignment, per PEP 8.)
    def pdf(x):
        return np.exp(cm.pred_prob(x))

    probs = [quad(pdf, m, M, epsabs=0., epsrel=1e-6)[0]
             for m, M in bin_ranges]
    # The bins should capture essentially all the predictive mass.
    assert_less(1 - sum(probs), THRESH)
    # Keep only the TOPN most probable bins for the truncated test.
    probs, counts = zip(*sorted(zip(probs, counts), reverse=True)[:TOPN])
    p = mgof(probs, counts, SAMPS, truncated=True)
    assert_greater(p, THRESH)
def check_dpm(impl, data_count, beta0):
    """Goodness-of-fit check for a Dirichlet-process-mixture component model.

    Builds sufficient statistics from random integer data, samples from the
    model's predictive, and runs a truncated multinomial goodness-of-fit test
    comparing sample frequencies against predictive probabilities.

    NOTE(review): this definition duplicates an earlier `check_dpm` in the
    same file and shadows it at import time — confirm and remove one copy.

    Parameters:
        impl: component-model implementation passed to `check_cm`/`ComponentModel`.
        data_count: number of random observations used to build the counts.
        beta0: hyperparameter mass reserved for unseen values; the remaining
            (1 - beta0) is split evenly across observed bins.
    """
    check_cm(impl)
    raw_counts = histogram(np.random.randint(50, size=data_count))
    data = {str(i): obs for i, obs in enumerate(raw_counts)}
    # Split the remaining (1 - beta0) probability mass evenly over the bins.
    betas = {key: (1 - beta0) / len(data) for key in data}
    hp = {'gamma': 1., 'alpha': 1., 'beta0': beta0, 'betas': betas}
    ss = {'counts': data}
    cm = ComponentModel(impl, ss=ss, hp=hp)
    samples = cm.sample_data(SAMPS)
    # A sample of -1 encodes "previously unseen value"; it is scored as its
    # own bin, separate from the observed values 0..max.
    counts = list(histogram([y for y in samples if y != -1]))
    probs = list(np.exp([cm.pred_prob(x) for x in range(max(samples) + 1)]))
    counts.append(len([y for y in samples if y == -1]))
    probs.append(np.exp(cm.pred_prob(-1)))
    # The enumerated bins should account for essentially all the mass.
    assert_less(1 - sum(probs), THRESH)
    # Keep only the TOPN most probable bins for the truncated test.
    probs, counts = zip(*sorted(zip(probs, counts), reverse=True)[:TOPN])
    p = mgof(probs, counts, SAMPS, truncated=True)
    assert_greater(p, THRESH)
def check_nich(impl, data_count, mean, std):
    """Goodness-of-fit check for a normal (NICH) component model.

    Optionally conditions the model on sufficient statistics from random
    normal data, samples from the predictive, bins the samples, scores each
    bin by numerically integrating the predictive density, and runs a
    truncated multinomial goodness-of-fit test.

    NOTE(review): this definition duplicates an earlier `check_nich` in the
    same file and shadows it at import time — confirm and remove one copy.

    Parameters:
        impl: component-model implementation passed to `check_cm`/`ComponentModel`.
        data_count: number of observations to condition on; 0/falsy means none.
        mean, std: parameters of the normal data-generating distribution.
    """
    check_cm(impl)
    ss = None
    if data_count:
        data = np.random.normal(mean, std, size=data_count)
        ss = {'count': data_count, 'mean': data.mean(), 'variance': data.var()}
    cm = ComponentModel(impl, ss=ss)
    samples = cm.sample_data(SAMPS)
    counts, bin_ranges = bin_samples(samples)

    # Use of quadrature is unfortunate but for now it's the easiest way to
    # score bins and seems to work.  (Plain `def` rather than a lambda
    # assignment, per PEP 8.)
    def pdf(x):
        return np.exp(cm.pred_prob(x))

    probs = [quad(pdf, m, M, epsabs=0., epsrel=1e-6)[0]
             for m, M in bin_ranges]
    # The bins should capture essentially all the predictive mass.
    assert_less(1 - sum(probs), THRESH)
    # Keep only the TOPN most probable bins for the truncated test.
    probs, counts = zip(*sorted(zip(probs, counts), reverse=True)[:TOPN])
    p = mgof(probs, counts, SAMPS, truncated=True)
    assert_greater(p, THRESH)