Example #1
    def _density(self, x):
        cat_len = len(self._categoricals)
        num_len = len(self._numericals)
        cat = tuple(x[:cat_len])  # need it as a tuple for indexing below
        num = np.array(x[cat_len:])  # need as np array for dot product

        p = self._p.loc[cat].values

        if num_len == 0:
            return p

        # This works because the Gaussian variables are - by design of this class - ordered after the
        # categoricals. Hence the only unspecified dimension is the last one, i.e. the one that holds the mean!
        mu = self._mu.loc[cat].values
        detS = self._detS.loc[cat].values  # note: _detS apparently holds det(S)**-0.5 precomputed, since it enters the density as a direct factor below
        invS = self._SInv.loc[cat].values
        xmu = num - mu
        gauss = (2 * pi)**(-num_len / 2) * detS * exp(-.5 * np.dot(xmu, np.dot(invS, xmu)))
        assert no_nan(gauss), "Density computation failed."

        if cat_len == 0:
            return gauss
        else:
            return p * gauss
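For reference, what this method evaluates is the density of a conditional Gaussian (CG) model: p(c, x) = p(c) * N(x; mu_c, Sigma_c), with one Gaussian per combination c of categorical values. Below is a minimal, self-contained sketch of the same computation using scipy instead of the precomputed determinant/inverse tables; the toy parameters (weights, means, covs) are invented for illustration and are not the class internals.

import numpy as np
from scipy.stats import multivariate_normal

weights = {('A',): 0.3, ('B',): 0.7}                 # p(c), one entry per categorical combination
means = {('A',): np.array([0.0, 0.0]),
         ('B',): np.array([1.0, -1.0])}              # mu_c
covs = {('A',): np.eye(2), ('B',): 2 * np.eye(2)}    # Sigma_c

def density(x):
    cat = tuple(x[:1])     # categorical part (here: a single field)
    num = np.array(x[1:])  # numerical part
    return weights[cat] * multivariate_normal.pdf(num, mean=means[cat], cov=covs[cat])

print(density(['B', 1.0, -1.0]))  # density at the mean of component ('B',)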
Example #2
    def _fit(self):
        assert (self.mode != 'none')
        self._p, self._mu, self._S = fit_full(self.data, self.fields,
                                              self._categoricals,
                                              self._numericals)
        for o in [self._p, self._mu, self._S]:
            assert (no_nan(o))
        return self._unbound_updater,  # note: the trailing comma makes this return a 1-tuple
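fit_full itself is not shown in these examples. Conceptually it should compute, for every combination of categorical values, the cell's relative frequency p, mean vector mu, and covariance matrix S. The following is only a rough, hypothetical reconstruction with pandas (the column handling and the exact return layout are assumptions, not the original implementation):

import numpy as np

def fit_full_sketch(data, categoricals, numericals):
    # hypothetical per-cell maximum-likelihood estimates
    groups = data.groupby(categoricals)
    p = groups.size() / len(data)       # p(c): relative frequency of each cell
    mu = groups[numericals].mean()      # mu_c: per-cell mean vector
    S = {c: np.cov(g[numericals].values, rowvar=False, bias=True)
         for c, g in groups}            # Sigma_c: per-cell (biased/MLE) covariance
    return p, mu, S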
Example #3
    def _conditionout_continuous_internal_fast(self, p_, mu_, detS_, S_,
                                               cond_values, i_names, j_names,
                                               all_num_removing):
        if all_num_removing:
            # if all numerical dimensions are conditioned out, the Schur complement is empty;
            # its determinant is 1 by convention
            detS_cond = 1

        # get numerical index for mu, sigma
        num_map = self._name_idx_map(mode='num')
        i = [num_map[v] for v in i_names]
        j = [num_map[v] for v in j_names]

        n = p_.size  # number of single gaussians in the cg
        m = len(self._numericals)  # gaussian dimension

        # extract numpy arrays and reshape them to a suitable form. This allows iterating over the
        # individual parameter sets of S, mu and p with standard Python iterators.
        p_np = p_.values.reshape(n)
        mu_np = mu_.values.reshape(n, m)
        detS_np = detS_.values.reshape(n)
        S_np = S_.values.reshape(n, m, m)

        for idx, (p, mu, detS, S) in enumerate(zip(p_np, mu_np, detS_np, S_np)):
            diff_y_mu_J = cond_values - mu[j]
            Sjj_inv = inv(S[ix_(j, j)])
            assert no_nan(Sjj_inv), "Inversion of Covariance Matrix failed."

            if not all_num_removing:
                # update Sigma and mu
                sigma_expr = np.dot(S[ix_(i, j)], Sjj_inv)  # reused multiple times below
                assert no_nan(sigma_expr), "sigma_expr contains nan"
                S_np[idx][ix_(i, i)] -= dot(sigma_expr, S[ix_(j, i)])  # upper Schur complement
                mu_np[idx][i] += dot(sigma_expr, diff_y_mu_J)
                # needed for the p update below; otherwise it is constant and computed before the stacking loop
                detS_cond = abs(det(S_np[idx][ix_(i, i)]))

            # update p
            detQuotient = (detS_cond**0.5) * detS
            assert no_nan(detQuotient)
            p_np[idx] *= detQuotient * exp(-0.5 * dot(diff_y_mu_J, dot(Sjj_inv, diff_y_mu_J)))
            assert no_nan(p_np[idx])
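The per-component update in this loop is the standard Gaussian conditioning formula: splitting the joint N(mu, S) into kept dimensions i and conditioned dimensions j, the conditional distribution has mean mu_i + S_ij S_jj^-1 (y - mu_j) and covariance S_ii - S_ij S_jj^-1 S_ji (the upper Schur complement), while the component weight picks up a factor proportional to the marginal density N(y; mu_j, S_jj). A standalone sketch of one such update on a toy component (all numbers invented):

import numpy as np
from numpy import dot, exp, ix_
from numpy.linalg import inv, det

mu = np.array([0.0, 1.0, 2.0])
S = np.array([[2.0, 0.5, 0.2],
              [0.5, 1.0, 0.3],
              [0.2, 0.3, 1.5]])
i, j = [0, 1], [2]      # keep dimensions 0 and 1, condition on dimension 2
y = np.array([1.0])     # observed value for dimension 2

Sjj_inv = inv(S[ix_(j, j)])
sigma_expr = dot(S[ix_(i, j)], Sjj_inv)
mu_cond = mu[i] + dot(sigma_expr, y - mu[j])             # conditional mean
S_cond = S[ix_(i, i)] - dot(sigma_expr, S[ix_(j, i)])    # upper Schur complement
# weight factor, proportional to N(y; mu_j, S_jj) (constant (2*pi)**(-len(j)/2) omitted)
w_factor = det(S[ix_(j, j)])**-0.5 * exp(-0.5 * dot(y - mu[j], dot(Sjj_inv, y - mu[j])))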
Example #4
    def _conditionout_categorical(self, cat_remove):
        if len(cat_remove) == 0:
            return

        pairs = dict(self._condition_values(names=cat_remove, pairflag=True))

        # _p changes like in the categoricals.py case
        # trim the probability look-up table to the appropriate subrange and normalize it
        p = self._p.loc[pairs]
        self._p = p / p.sum()
        assert no_nan(self._p), "Renormalization of p failed."

        # _mu and _S are trimmed: keep the slice that we condition on, i.e. reuse the 'pairs' access structure.
        # note: this also works if we condition on all categoricals: only the single 'selected' mu remains...
        if len(self._numericals) != 0:
            self._mu = self._mu.loc[pairs]
            self._S = self._S.loc[pairs]

        # update internals
        self._categoricals = [
            name for name in self._categoricals if name not in cat_remove
        ]
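Conditioning on categorical values thus amounts to slicing the weight table and renormalizing the remainder. A minimal xarray sketch of that step (dimension names and coordinates are made up):

import numpy as np
import xarray as xr

p = xr.DataArray(np.array([[0.1, 0.2], [0.3, 0.4]]),
                 coords={'sex': ['F', 'M'], 'smoker': ['no', 'yes']},
                 dims=['sex', 'smoker'])

pairs = {'sex': 'F'}            # condition on sex == 'F'
p_cond = p.loc[pairs]           # trim the look-up table to the matching slice
p_cond = p_cond / p_cond.sum()  # renormalize: p('no') = 1/3, p('yes') = 2/3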
Example #5
    def _assert_invariants(self):
        for o in [self._detS, self._SInv, self._S, self._mu, self._p]:
            assert (no_nan(o))
Example #6
    def _conditionout_continuous(self, num_remove):
        if len(num_remove) == 0:
            return

        # collect singular values to condition out
        cond_values = self._condition_values(num_remove)

        # TODO: this is ugly. We should incorporate _numericals and _categoricals into the base model already; then we could put the normalization there as well.

        # check for opts because cond_gaussians_wm does not have it; only mixable CGs and MCG use this function of the CG-WM model
        if hasattr(self, 'opts') and self.opts['normalized']:
            cond_values = self._normalizer.norm(cond_values,
                                                mode="by name",
                                                names=num_remove)
            self._normalizer.update(num_remove=num_remove)

        # calculate updated mu and sigma for conditional distribution, according to GM script
        j = num_remove  # dimensions to remove
        i = [name for name in self._numericals if name not in num_remove]  # dimensions to keep

        cat_keep = self._mu.dims[:-1]
        all_num_removing = len(num_remove) == len(self._numericals)
        all_cat_removed = len(cat_keep) == 0

        if all_cat_removed:
            # special case: no categorical fields are left. Hence we cannot stack over them; there is
            # only a single mu left and we only need to update that one.
            sigma_expr = np.dot(self._S.loc[i, j], inv(self._S.loc[j, j]))  # reused below
            assert no_nan(sigma_expr), "sigma_expr contains nan"
            self._S = self._S.loc[i, i] - dot(sigma_expr, self._S.loc[j, i])  # upper Schur complement
            self._mu = self._mu.loc[i] + dot(sigma_expr, cond_values - self._mu.loc[j])
            # update p: there is nothing to update, p is empty
        else:
            # this is the actual difficult case
            #self._conditionout_continuous_internal_slow(cond_values, i, j, cat_keep, all_num_removing)
            self._conditionout_continuous_internal_fast(
                self._p, self._mu, self._detS, self._S, cond_values, i, j,
                all_num_removing)

            # rescale to one
            # TODO: is this wrong? why do we not automatically get a normalized model?
            psum = self._p.sum()
            if psum != 0:
                self._p /= psum
            else:
                logger.warning(
                    "creating a conditional model with extremely low probability and similarly low "
                    "predictive power")
                self._p.values = np.full_like(self._p.values, 1 / self._p.size)

            # in _conditionout_continuous_internal_* we only updated the relevant part of mu and Sigma;
            # the remaining part is now removed, i.e. sliced out
            if all_num_removing:
                self._mu = xr.DataArray([])
                self._S = xr.DataArray([])
            else:
                self._mu = self._mu.loc[dict(mean=i)]
                self._S = self._S.loc[dict(S1=i, S2=i)]

        self._numericals = [
            name for name in self._numericals if name not in num_remove
        ]
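A note on the renormalization above: given that _detS stores det(S)**-0.5, the fast path multiplies each component weight by det(S_jj)**-0.5 * exp(-0.5 * (y - mu_j)' S_jj^-1 (y - mu_j)), i.e. by the marginal density of the conditioned values up to the constant factor (2*pi)**(-len(j)/2). That factor is identical for all components, so it cancels in the explicit rescaling, which is presumably why the model does not come out normalized automatically (cf. the TODO above). A toy sketch of this weight update for a two-component model (all numbers invented):

import numpy as np
from scipy.stats import multivariate_normal

# two components over dimensions (x0, x1); condition on x1 == y
weights = np.array([0.4, 0.6])
mus = [np.array([0.0, 0.0]), np.array([2.0, 1.0])]
covs = [np.eye(2), np.array([[1.0, 0.3], [0.3, 2.0]])]
y = 0.5

# p(c | x1 = y) is proportional to p(c) * N(y; mu_{c,1}, S_{c,11})
new_w = np.array([w * multivariate_normal.pdf(y, mean=mu[1], cov=cov[1, 1])
                  for w, mu, cov in zip(weights, mus, covs)])
new_w /= new_w.sum()  # rescale to one, as in the code above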