Example #1
# Imports implied by the examples in this file: functools, numpy, scipy,
# pytest and the project's own `kernels` module.
import functools as ft
import numpy as np
import pytest
from scipy import optimize, special

import kernels


def test_case_control():
    configuration = {
        'n': 20000,
        'bias': [-3, 0, 0],
        'positive_sampling_weight': 10,
        's': 200,
        'seed': 3,
        'link': special.expit,
        'linkln': kernels.expitln,
    }

    data = kernels.simulate_data(**configuration)
    log_likelihood = ft.partial(
        kernels.evaluate_case_control_log_likelihood,
        x=data['x_observed'],
        y=data['y_observed'],
        prevalence=data['marginal'],
        linkln=data['linkln'],
    )

    # Find the maximum likelihood estimate
    theta = data['theta']
    x0 = np.random.normal(size=theta.shape)
    # Use mean aggregation to keep the objective well scaled for the optimiser
    # and avoid https://stackoverflow.com/a/54446479/1150961
    result = optimize.minimize(lambda x: -log_likelihood(x, aggregate='mean'),
                               x0)
    assert result.success, 'optimization failed'

    # Draw samples and plot them
    cov = 2.4**2 * result.hess_inv / (len(data['x_observed']) * len(theta))
    xs, values = kernels.sample(log_likelihood, result.x, cov, 5000)
    _plot_inference(theta, xs, result, filename='tests/~case_control.png')
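The test relies on `kernels.sample(log_dist, x, cov, num)`, which returns the visited states and their log-densities; the later examples reuse the same interface. As a rough illustration only, a minimal random-walk Metropolis sketch with that calling convention might look as follows (`metropolis_sample` is a hypothetical stand-in, not the project's implementation, which may differ in validation and return details):

import numpy as np


def metropolis_sample(log_dist, x, cov, num):
    """Hypothetical random-walk Metropolis sketch mirroring kernels.sample."""
    x = np.atleast_1d(np.asarray(x, dtype=float))
    cov = np.atleast_2d(cov)
    value = log_dist(x)
    xs, values = [], []
    for _ in range(num):
        # Propose a move from a Gaussian centred at the current state.
        proposal = np.random.multivariate_normal(x, cov)
        proposal_value = log_dist(proposal)
        # Accept with probability min(1, exp(new log-density - old log-density)).
        if np.log(np.random.uniform()) < proposal_value - value:
            x, value = proposal, proposal_value
        xs.append(x)
        values.append(value)
    return np.asarray(xs), np.asarray(values)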
Example #2
def test_network_inference():
    # Sample data
    configuration = {
        'n': 2000,
        'bias': [-7, 0, 0],
        'link': special.expit,
        'linkln': kernels.expitln,
        's': 100,
        'seed': 0,
        'feature_map': kernels.l1_feature_map
    }
    data = kernels.simulate_network_data(**configuration)

    # Evaluate an estimate of the prevalence in the population
    k = len(data['pairs']) / data['s']
    prevalence = k / (data['n'] - 1)

    # Get the features for cases
    i1, j1 = data['pairs'].T
    z = data['z']
    x_cases = data['feature_map'](z[i1], z[j1])

    # Sample controls and get their features
    i0, j0 = kernels.sample_controls(data['egos'], 3 * len(data['pairs']))
    x_controls = data['feature_map'](z[i0], z[j0])

    # Concatenate features and construct indicator variables
    x_observed = np.concatenate([x_cases, x_controls])
    y_observed = np.concatenate(
        [np.ones(len(x_cases)),
         np.zeros(len(x_controls))])

    log_likelihood = ft.partial(
        kernels.evaluate_case_control_log_likelihood,
        x=x_observed,
        y=y_observed,
        prevalence=prevalence,
        linkln=data['linkln'],
    )

    # Find the maximum likelihood estimate
    theta = data['theta']
    x0 = np.random.normal(size=theta.shape)
    # Use mean aggregation to keep the objective well scaled for the optimiser
    # and avoid https://stackoverflow.com/a/54446479/1150961
    result = optimize.minimize(lambda x: -log_likelihood(x, aggregate='mean'),
                               x0)
    assert result.success, 'optimization failed'

    # Draw samples and plot them
    cov = 2.4**2 * result.hess_inv / (len(x_observed) * len(theta))
    xs, values = kernels.sample(log_likelihood, result.x, cov, 5000)
    _plot_inference(theta, xs, result, filename='tests/~network_inference.png')
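The prevalence fed into the case-control likelihood is just the estimated mean degree k divided by the n - 1 possible partners of an ego. A small worked example with made-up counts (the 250 reported pairs are illustrative, not taken from the simulation):

n, s, num_pairs = 2000, 100, 250
k = num_pairs / s            # estimated mean degree: 2.5 partners per ego
prevalence = k / (n - 1)     # fraction of the 1999 possible ties that are present
assert abs(prevalence - 2.5 / 1999) < 1e-12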
Example #3
def test_sample_invalid_arguments():
    def _log_dist(_):
        return 0
    x = 0
    cov = 1
    num = 10

    with pytest.raises(ValueError):
        kernels.sample(_log_dist, np.empty((1, 1)), cov, num)
    with pytest.raises(ValueError):
        kernels.sample(_log_dist, x, np.empty((1, 1, 1)), num)
    with pytest.raises(ValueError):
        kernels.sample(_log_dist, x, np.empty((1, 2)), num)
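These checks pin down the argument shapes `kernels.sample` accepts: the initial state must be a scalar or vector, and the proposal covariance must have at most two dimensions and be square when it is a matrix. One plausible validation that would satisfy the three `pytest.raises` assertions (a sketch only; the actual checks inside `kernels.sample` may differ):

import numpy as np


def _validate_sample_args(x, cov):
    # Hypothetical helper illustrating the shape checks implied by the test above.
    x = np.asarray(x)
    if x.ndim > 1:
        raise ValueError('initial state must be a scalar or vector')
    cov = np.asarray(cov)
    if cov.ndim > 2:
        raise ValueError('proposal covariance must have at most two dimensions')
    if cov.ndim == 2 and cov.shape[0] != cov.shape[1]:
        raise ValueError('proposal covariance matrix must be square')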
Example #4
def test_sample():
    xs, values = kernels.sample(lambda x: - x ** 2 / 2, 0, 1, 10000)
    assert abs(np.mean(xs)) < 0.1
    assert abs(np.std(xs) - 1) < 0.1
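Beyond the mean and standard deviation, the fraction of accepted proposals is a quick health check for the chain. A hedged usage sketch pairing the sampler with `kernels.evaluate_acceptance` (used the same way in the script excerpt below); the 2.4 ** 2 proposal variance follows the Gelman, Roberts and Gilks scaling cited there and should give an acceptance rate of very roughly 0.44 for this one-dimensional target:

import kernels

xs, values = kernels.sample(lambda x: -x ** 2 / 2, 0, 2.4 ** 2, 10000)
acceptance = kernels.evaluate_acceptance(values)
# Expect an acceptance rate in the vicinity of 0.44 for a well-tuned 1-d chain.
print(f'acceptance: {acceptance:.3f}')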
Example #5
                               x0)
    assert result.success, result.message
    # Evaluate the posterior covariance from a numdifftools Hessian
    # (more accurate than the optimiser's BFGS approximation)
    cov = -np.linalg.inv(ndt.Hessian(log_posterior)(result.x))
    logging.info('maximised posterior in %d function evaluations', result.nfev)
    logging.info('MAP estimate: %s', dict(zip(feature_names, result.x)))
    logging.info('approximate marginal std: %s',
                 dict(zip(feature_names, np.sqrt(np.diag(cov)))))

    # Draw samples from the log-posterior ----------------------------------------------------------
    # Use the inverse Hessian from the optimisation to construct an approximate "optimal" proposal
    # covariance following A. Gelman, G. O. Roberts, W. R. Gilks. "Efficient Metropolis jumping
    # rules". (1996)

    proposal_cov = 2.4**2 * cov / len(result.x)
    xs, values = kernels.sample(log_posterior, result.x, proposal_cov,
                                args.num_samples)
    acceptance = kernels.evaluate_acceptance(values)
    logging.info('obtained %d posterior samples with acceptance %.3f',
                 args.num_samples, acceptance)
    logging.info('posterior mean: %s',
                 dict(zip(feature_names, np.mean(xs, axis=0))))
    logging.info('posterior std: %s',
                 dict(zip(feature_names, np.std(xs, axis=0))))
    if 'theta' in data:
        logging.info('true values: %s', dict(zip(feature_names,
                                                 data['theta'])))
        residuals = np.mean(xs, axis=0) - data['theta']
        logging.info('z-scores: %s',
                     dict(zip(feature_names, residuals / np.std(xs, axis=0))))
        cov_ = np.cov(xs.T)
        chi2 = residuals.dot(np.linalg.inv(cov_)).dot(residuals)
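The excerpt stops at the chi-squared statistic. A natural continuation (hypothetical, not part of the original script) would compare it against a chi-squared distribution with one degree of freedom per parameter to flag a miscalibrated posterior:

from scipy import stats

# Under the Gaussian approximation, the squared Mahalanobis distance of the
# residuals is chi-squared distributed with len(residuals) degrees of freedom;
# a very small survival probability would indicate a miscalibrated posterior.
pval = stats.chi2(len(residuals)).sf(chi2)
logging.info('chi2 = %.3f, survival probability = %.3f', chi2, pval)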