def test_studentT_likelihood(df: float, loc: float, scale: float):

    dfs = torch.zeros((NUM_SAMPLES, )) + df
    locs = torch.zeros((NUM_SAMPLES, )) + loc
    scales = torch.zeros((NUM_SAMPLES, )) + scale

    distr = StudentT(df=dfs, loc=locs, scale=scales)
    samples = distr.sample()

    init_bias = [
        inv_softplus(df - 2),
        loc - START_TOL_MULTIPLE * TOL * loc,
        inv_softplus(scale - START_TOL_MULTIPLE * TOL * scale),
    ]

    df_hat, loc_hat, scale_hat = maximum_likelihood_estimate_sgd(
        StudentTOutput(),
        samples,
        init_biases=init_bias,
        num_epochs=15,
        learning_rate=1e-3,
    )

    assert (np.abs(df_hat - df) <
            TOL * df), f"df did not match: df = {df}, df_hat = {df_hat}"
    assert (np.abs(loc_hat - loc) <
            TOL * loc), f"loc did not match: loc = {loc}, loc_hat = {loc_hat}"
    assert (np.abs(scale_hat - scale) < TOL * scale
            ), f"scale did not match: scale = {scale}, scale_hat = {scale_hat}"
Example 2
def NIG_NLL(y: torch.Tensor,
            gamma: torch.Tensor,
            nu: torch.Tensor,
            alpha: torch.Tensor,
            beta: torch.Tensor,
            reduction='mean'):
    student_scale = (beta * (1. + nu) / (nu * alpha)).sqrt()
    dist = StudentT(loc=gamma, scale=student_scale, df=2 * alpha)
    nll = -1. * dist.log_prob(y)
    return reduce(nll, reduction=reduction)
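
A minimal usage sketch for `NIG_NLL`; the tensors below are illustrative placeholders rather than outputs of any particular evidential network, and `reduce` is assumed to be the same helper the function already relies on.

y = torch.randn(32, 1)                 # observed targets
gamma = torch.zeros(32, 1)             # predicted mean
nu = torch.full((32, 1), 2.0)          # virtual observations for the mean
alpha = torch.full((32, 1), 3.0)       # inverse-gamma shape
beta = torch.full((32, 1), 1.0)        # inverse-gamma rate

loss = NIG_NLL(y, gamma, nu, alpha, beta, reduction='mean')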
Example 3
    def _reweight(self, N=100000):
        # Expected value: \mathbb{E}_{x \sim X}[\mathrm{Ramp}(|x|)]
        if not hasattr(self, 'epv'):
            self.Hfunc = self.config.Hfunc
            # self.Hfunc = 'ramp'
            if self.real == 'Student':
                tdist = StudentT(df=self.config.r_df)
                x = tdist.sample((5000000, ))
            elif self.real == 'Gaussian':
                ndist = Normal(0, 1)
                x = ndist.sample((5000000, ))
            self.epv = self._HFunc(x, mode=self.Hfunc).mean().item()

        def sov_func(a, bs=1000):
            # find a suitable factor a to match expected value.
            r = AveMeter()
            for _ in range(N // bs):
                if self.config.use_ig:
                    ub1 = torch.randn(bs,
                                      self.netGXi.input_dim // 2).to(device)
                    ub2 = torch.randn(
                        bs, self.netGXi.input_dim -
                        self.netGXi.input_dim // 2).to(device)
                    ub2.data.div_(torch.abs(ub2.data) + self.config.delta)
                    ub = torch.cat([ub1, ub2], dim=1)
                else:
                    ub = torch.randn(bs, self.netGXi.input_dim).to(device)
                with torch.no_grad():
                    xib = self.netGXi(ub)
                zb = torch.randn(bs, self.dim).to(device)
                vu = (zb[:, 0].div_(zb.norm(2, dim=1)) +
                      self.config.delta).to(device)
                r.update(
                    self._HFunc(a * xib * vu, mode=self.Hfunc).mean().item(),
                    bs)
            return r.avg - self.epv

        # if sov_func(1) > 0: down,up= 0,3
        # elif sov_func(3) > 0: down,up = 0,5
        # elif sov_func(10) > 0: down,up = 1,12
        # elif sov_func(25) > 0: down,up = 8,27
        # elif sov_func(75) > 0: down,up = 23,77
        if sov_func(250) > 0:
            down, up = 0, 3000
        else:
            logger.info('Factor is larger than 250; returning the cap of 250.')
            return 250
        factor = bisect(sov_func, down, up)
        print(factor)
        return factor
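
`bisect` above is assumed to be a standard scalar root finder (e.g. `scipy.optimize.bisect`); a minimal sketch of that behaviour, assuming `sov_func` increases monotonically with the factor `a`:

def _bisect(func, down, up, tol=1e-3, max_iter=100):
    # Shrink [down, up] around the zero crossing of a monotonically
    # increasing function; func(down) <= 0 <= func(up) is assumed.
    lo, hi = down, up
    for _ in range(max_iter):
        mid = 0.5 * (lo + hi)
        if func(mid) > 0:
            hi = mid
        else:
            lo = mid
        if hi - lo < tol:
            break
    return 0.5 * (lo + hi)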
Example 4
    def sample(self,
            batch_size=16,
            num_ctx=None,
            max_num_points=50,
            x_range=(-2, 2),
            device='cpu'):

        batch = AttrDict()
        num_ctx = num_ctx or torch.randint(low=3, high=max_num_points-3, size=[1]).item()
        num_tar = torch.randint(low=3, high=max_num_points-num_ctx, size=[1]).item()

        num_points = num_ctx + num_tar
        batch.x = x_range[0] + (x_range[1] - x_range[0]) \
                * torch.rand([batch_size, num_points, 1], device=device)
        batch.xc = batch.x[:,:num_ctx]
        batch.xt = batch.x[:,num_ctx:]

        # batch_size * num_points * num_points
        cov = self.kernel(batch.x)
        mean = torch.zeros(batch_size, num_points, device=device)
        batch.y = MultivariateNormal(mean, cov).rsample().unsqueeze(-1)
        batch.yc = batch.y[:,:num_ctx]
        batch.yt = batch.y[:,num_ctx:]

        if self.t_noise is not None:
            batch.y += self.t_noise * StudentT(2.1).rsample(batch.y.shape).to(device)
        return batch
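
A usage sketch for the sampler above; `GPSampler` and `RBFKernel` are hypothetical names standing in for whatever class owns this method and its kernel.

sampler = GPSampler(kernel=RBFKernel(), t_noise=None)   # hypothetical constructor
batch = sampler.sample(batch_size=8, max_num_points=50)
# batch.xc / batch.yc hold the context points, batch.xt / batch.yt the targets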
Example 5
def img_to_task(img, num_ctx=None,
        max_num_points=None, target_all=False, t_noise=None, device=None):

    B, C, H, W = img.shape
    num_pixels = H*W
    img = img.view(B, C, -1)

    if t_noise is not None:
        if t_noise == -1:
            t_noise = 0.09 * torch.rand(img.shape)
        img += t_noise * StudentT(2.1).rsample(img.shape)

    device = img.device if device is None else device

    batch = AttrDict()
    max_num_points = max_num_points or num_pixels
    num_ctx = num_ctx or \
            torch.randint(low=3, high=max_num_points-3, size=[1]).item()
    num_tar = max_num_points - num_ctx if target_all else \
            torch.randint(low=3, high=max_num_points-num_ctx, size=[1]).item()
    num_points = num_ctx + num_tar
    idxs = torch.rand(B, num_pixels).argsort(-1)[...,:num_points].to(img.device)
    x1, x2 = idxs//W, idxs%W
    batch.x = torch.stack([
        2*x1.float()/(H-1) - 1,
        2*x2.float()/(W-1) - 1], -1).to(device)
    batch.y = (torch.gather(img, -1, idxs.unsqueeze(-2).repeat(1, C, 1))\
            .transpose(-2, -1) - 0.5).to(device)

    batch.xc = batch.x[:,:num_ctx]
    batch.xt = batch.x[:,num_ctx:]
    batch.yc = batch.y[:,:num_ctx]
    batch.yt = batch.y[:,num_ctx:]

    return batch
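
A usage sketch for `img_to_task`, turning a batch of images into a pixel-regression task; the image tensor below is illustrative.

imgs = torch.rand(16, 1, 28, 28)                 # B x C x H x W, values in [0, 1]
task = img_to_task(imgs, max_num_points=200)
# task.xc: (16, num_ctx, 2) pixel coordinates scaled to [-1, 1]
# task.yc: (16, num_ctx, 1) pixel intensities shifted to [-0.5, 0.5]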
Example 6
class EpsiSampler:
    def __init__(self, x, epsi_nu):
        self.x = x
        self.len = self.x.shape[0]
        self.epsi_nu = epsi_nu
        self.tdistribution = StudentT(self.epsi_nu)

    def epsisamp(self, epsi, tau, mu):
        # assumes no covariance between epsilons; does not sample as a single block

        # Newton-Raphson iterations to find proposal density
        mu_f, hf, hf_inv = self.epsi_nr(epsi, mu, tau)

        # now propose with multivariate t centered at epsiMLE with covariance matrix from Hessian
        # note that since Hessian is diagonal, we can just simulate from n univariate t's.

        epsi_p = mu_f + hf_inv.neg().sqrt() * self.tdistribution.sample(
            torch.Size([self.len, 1]))
        # epsi_p = torch.randn(mu_f, -hf_inv)
        arat = self.pratepsi(epsi, epsi_p, tau, mu) + \
               tqrat(epsi, epsi_p, mu_f, mu_f, hf_inv.neg().sqrt(), hf_inv.neg().sqrt(), self.epsi_nu)

        ridx = torch.rand(self.len, 1).log() >= arat.clamp(max=0)
        ridx_float = ridx.type(torch.float32)

        epsi[~ridx] = epsi_p[~ridx]
        mrej = (1 - ridx_float).mean()
        return epsi, mrej

    # TODO: find out if .exp() legal here
    def pratepsi(self, epsi, epsi_p, tau, mu):
        pr = epsi_p * self.x / tau.sqrt() - (mu + epsi_p / tau.sqrt()).exp() - epsi_p ** 2 / 2 - \
             (epsi * self.x / tau.sqrt() - (mu + epsi / tau.sqrt()).exp() - epsi ** 2 / 2)
        return pr

    def epsi_nr(self, epsi, mu, tau):
        h, h_inv = 0, 0

        for i in range(1, 100):
            h, h_inv = self.hessepsi(epsi, tau, mu)

            # N - R update
            grad = self.gradepsi(epsi, tau, mu)
            epsi = epsi - h_inv * grad

            # we've reached a local maximum
            if grad.norm() < 1e-6:
                break
        return epsi, h, h_inv

    @staticmethod
    def hessepsi(epsi, tau, mu):
        h = -(mu + epsi / tau.sqrt()).exp() / tau - 1
        h_inv = 1 / h
        return h, h_inv

    def gradepsi(self, epsi, tau, mu):
        gr = self.x / tau.sqrt() - (
            mu + epsi / torch.sqrt(tau)).exp() / tau.sqrt() - epsi
        return gr
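
The `tqrat` helper called in `epsisamp` is not shown anywhere in this listing; the sketch below is an assumption, reading it as the log proposal-density ratio for the Student-t proposals (log q of the current state minus log q of the proposed state).

def tqrat(epsi, epsi_p, mu1, mu2, sd1, sd2, nu):
    # log q(epsi | mu1, sd1) - log q(epsi_p | mu2, sd2) under Student-t proposals
    q_curr = StudentT(nu, loc=mu1, scale=sd1).log_prob(epsi)
    q_prop = StudentT(nu, loc=mu2, scale=sd2).log_prob(epsi_p)
    return q_curr - q_prop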
Example 7
    def forward(self):
        self.precision_coeff = (self.belief +
                                1) / (self.belief *
                                      (self.df - self.dimensionality + 1))
        return StudentT(
            (self.df - self.dimensionality + 1).unsqueeze(-1),
            loc=self.loc,
            scale=(self.precision_coeff.unsqueeze(-1) /
                   self.precision_diag).pow(0.5),
        )
Example 8
    def forward(self):
        """Returns predictive posterior distribution"""
        self.precision_coeff = (self.belief +
                                1) / (self.belief *
                                      (self.df - self.dimensionality + 1))
        return StudentT(
            (self.df - self.dimensionality + 1).unsqueeze(-1),
            loc=self.loc,
            scale=(self.precision_coeff.unsqueeze(-1) /
                   self.precision_diag).pow(0.5),
        )
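
For reference, this construction matches the standard posterior-predictive result for a normal model with a Normal-Inverse-Wishart (here, diagonal) posterior: with belief \kappa, degrees of freedom \nu, mean \mu and scale matrix S in d dimensions,

\[
x \mid \mathcal{D} \sim t_{\nu - d + 1}\!\Big(\mu,\ \tfrac{\kappa + 1}{\kappa(\nu - d + 1)}\, S\Big),
\]

which is why the code divides the precision coefficient by the stored precision diagonal and takes the square root to obtain a per-dimension scale.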
Example 9
    def distribution(self,
                     distr_args,
                     scale: Optional[torch.Tensor] = None) -> Distribution:
        mix_logits, df, loc, dist_scale = distr_args

        distr = MixtureSameFamily(Categorical(logits=mix_logits),
                                  StudentT(df, loc, dist_scale))
        if scale is None:
            return distr
        else:
            return TransformedDistribution(
                distr, [AffineTransform(loc=0, scale=scale)])
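
A usage sketch with illustrative tensors: a three-component Student-t mixture over a batch of eight series, as `distribution` builds it when no rescaling is applied.

mix_logits = torch.zeros(8, 3)             # uniform mixture weights
df = torch.full((8, 3), 5.0)
loc = torch.randn(8, 3)
dist_scale = torch.rand(8, 3) + 0.1

mixture = MixtureSameFamily(Categorical(logits=mix_logits),
                            StudentT(df, loc, dist_scale))
samples = mixture.sample()                 # shape: (8,)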
Example 10
    def distribution(self,
                     distr_args,
                     scale: Optional[torch.Tensor] = None) -> Distribution:
        mix_logits, df, loc, dist_scale = distr_args

        comp_distr = StudentT(df, loc, dist_scale)
        if scale is None:
            return MixtureSameFamily(Categorical(logits=mix_logits),
                                     comp_distr)
        else:
            scaled_comp_distr = TransformedDistribution(
                comp_distr, [AffineTransform(loc=0, scale=scale)])
            return MixtureSameFamily(Categorical(logits=mix_logits),
                                     scaled_comp_distr)
Example 11
    def sample(self,
            bx,
            device='cuda:0'):
        # bx: 1 * num_points * 1

        # 1 * num_points * num_points
        cov = self.kernel(bx)
        mean = torch.zeros(1, bx.shape[1], device=device)
        mean = mean.cuda()

        by = MultivariateNormal(mean, cov).rsample().unsqueeze(-1)

        if self.t_noise is not None:
            by += self.t_noise * StudentT(2.1).rsample(by.shape).to(device)

        return by
Example 12
    def test_smoothing(self):
        # we should be able to run the Kalman smoother over pretty much any
        # parameters without it blowing up
        fll = FilteredLocalLevelModel(input_length=50)
        true_params = dict(γ=0., η=2., ρ=0.95, σ=1.5)
        algo_seed, data_seed = 123, 123
        torch.manual_seed(data_seed)
        y, z = fll.simulate(**true_params)
        for i in range(10):
            ζ = StudentT(df=4, loc=0, scale=10).sample((fll.d, ))
            sm = fll.kalman_smoother(y, ζ)
            for k in [
                    "z_upd", "Σz_upd", "z_smooth", "Σz_smooth", "y_pred",
                    "Σy_pred"
            ]:
                self.assertIsInstance(sm[k], torch.Tensor)
                self.assertFalse(torch.isnan(sm[k]).any())
Example 13
def test_log_prob(batch_shape, dim):
    loc = torch.randn(batch_shape + (dim, ))
    A = torch.randn(batch_shape + (dim, dim + dim))
    scale_tril = A.matmul(A.transpose(-2, -1)).cholesky()
    x = torch.randn(batch_shape + (dim, ))
    df = torch.randn(batch_shape).exp() + 2
    actual_log_prob = MultivariateStudentT(df, loc, scale_tril).log_prob(x)

    if dim == 1:
        expected_log_prob = StudentT(df.unsqueeze(-1), loc,
                                     scale_tril[..., 0]).log_prob(x).sum(-1)
        assert_equal(actual_log_prob, expected_log_prob)

    # test the fact MVT(df, loc, scale)(x) = int MVN(loc, scale / m)(x) Gamma(df/2,df/2)(m) dm
    num_samples = 100000
    gamma_samples = Gamma(df / 2, df / 2).sample(sample_shape=(num_samples, ))
    mvn_scale_tril = scale_tril / gamma_samples.sqrt().unsqueeze(-1).unsqueeze(
        -1)
    mvn = MultivariateNormal(loc, scale_tril=mvn_scale_tril)
    expected_log_prob = mvn.log_prob(x).logsumexp(0) - math.log(num_samples)
    assert_equal(actual_log_prob, expected_log_prob, prec=0.01)
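
The identity checked by the Monte Carlo part of this test (see the comment above) is the Gamma scale-mixture representation of the multivariate Student-t:

\[
t_\nu(x \mid \mu, \Sigma) \;=\; \int_0^\infty \mathcal{N}\!\left(x \mid \mu, \tfrac{\Sigma}{m}\right)\, \mathrm{Gamma}\!\left(m \mid \tfrac{\nu}{2}, \tfrac{\nu}{2}\right)\, dm ,
\]

so the test draws m_i from the Gamma, scales the Cholesky factor by 1/sqrt(m_i), and averages the resulting normal densities with a logsumexp.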
Example 14
def student_parse_params(params, min_sigma=0, min_nu=3.0, multiple=False):
    """
    Take a Tensor (e. g. neural network output) and return
    torch.distributions.Normal distribution.
    This Normal distribution is component-wise independent,
    and its dimensionality depends on the input shape.
    First half of channels is mean of the distribution,
    the softplus of the second half is std (sigma), so there is
    no restrictions on the input tensor.

    min_sigma is the minimal value of sigma. I. e. if the above
    softplus is less than min_sigma, then sigma is clipped
    from below with value min_sigma. This regularization
    is required for the numerical stability and may be considered
    as a neural network architecture choice without any change
    to the probabilistic model.
    """
    if multiple:
        batch_size = params.shape[0]
        n = params.shape[1]
        d = params.shape[2]
        mu = params[:, :, :d // 3]
        sigma_params = params[:, :, d // 3:2 * d // 3]
        nu_params = params[:, :, 2 * d // 3:]

    else:
        n = params.shape[0]
        d = params.shape[1]
        mu = params[:, :d // 3]
        sigma_params = params[:, d // 3:2 * d // 3]
        nu_params = params[:, 2 * d // 3:]

    sigma = softplus(sigma_params).clamp(min=min_sigma)
    nu = softplus(nu_params).clamp(min=min_nu)
    distr = StudentT(nu, loc=mu, scale=sigma)
    return distr
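
A usage sketch for `student_parse_params` in the single-sample (`multiple=False`) case; the shapes are illustrative, and the last dimension must be divisible by three.

params = torch.randn(128, 3 * 10)             # n = 128 points, d = 10 output dims
distr = student_parse_params(params, min_sigma=1e-3, min_nu=3.0)
log_p = distr.log_prob(torch.randn(128, 10))  # component-wise log-likelihoods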
Example 15
    def entropy(self):
        simple_tst = StudentT_torch(self.df)
        H = self.coeff * torch.logdet(self.S) + self.d * simple_tst.entropy()
        return H
Example 16
    def __init__(self, x, epsi_nu):
        self.x = x
        self.len = self.x.shape[0]
        self.epsi_nu = epsi_nu
        self.tdistribution = StudentT(self.epsi_nu)