def _density(self, x):
    """Evaluate the model density at the single point ``x``.

    ``x`` is split by position: its first ``len(self._categoricals)`` entries
    are used as the categorical labels, the remaining entries as the
    numerical coordinates.

    Returns:
        * the ``_p`` entry selected by the categorical labels, if the model
          has no numerical fields;
        * the Gaussian term alone, if the model has no categorical fields;
        * the product of both otherwise.

    Raises:
        AssertionError: if the Gaussian term fails the ``no_nan`` check.
    """
    n_cat = len(self._categoricals)
    n_num = len(self._numericals)

    # ``.loc`` needs the labels as a tuple; the dot products need an ndarray.
    cat_key = tuple(x[:n_cat])
    num_point = np.array(x[n_cat:])

    weight = self._p.loc[cat_key].values
    if n_num == 0:
        return weight

    # By design of this class the Gaussian dimensions come after the
    # categorical ones, so indexing with the categorical labels only leaves
    # exactly the trailing (mean / covariance) dimensions unspecified.
    mean = self._mu.loc[cat_key].values
    # NOTE(review): the formula below multiplies by ``_detS``, so it
    # presumably stores |det(S)|**-0.5 -- confirm where ``_detS`` is set.
    det_factor = self._detS.loc[cat_key].values
    precision = self._SInv.loc[cat_key].values

    centered = num_point - mean
    mahalanobis = np.dot(centered, np.dot(precision, centered))
    gauss = (2 * pi)**(-n_num / 2) * det_factor * exp(-.5 * mahalanobis)
    assert no_nan(gauss), "Density computation failed."

    return gauss if n_cat == 0 else weight * gauss
def _fit(self):
    """Fit the model parameters to ``self.data``.

    Delegates to ``fit_full`` and stores its three results as ``_p``,
    ``_mu`` and ``_S``; each of them must pass the ``no_nan`` check.

    Returns:
        A one-element tuple containing ``self._unbound_updater``.

    Raises:
        AssertionError: if ``self.mode`` is ``'none'`` or a fitted parameter
            fails the ``no_nan`` check.
    """
    assert self.mode != 'none'

    fitted = fit_full(self.data, self.fields, self._categoricals,
                      self._numericals)
    self._p, self._mu, self._S = fitted

    for parameter in (self._p, self._mu, self._S):
        assert no_nan(parameter)

    # Deliberately a tuple (the original used a trailing comma).
    return (self._unbound_updater,)
def _conditionout_continuous_internal_fast(self, p_, mu_, detS_, S_,
                                           cond_values, i_names, j_names,
                                           all_num_removing):
    """Condition every single Gaussian on fixed values, updating in place.

    The ``.values`` arrays of ``p_``, ``mu_``, ``detS_`` and ``S_`` are
    reshaped so that their first axis enumerates the single Gaussians; the
    update then walks over that axis and writes into the reshaped arrays.
    NOTE(review): the caller only sees these writes if ``reshape`` returns a
    view of the underlying data (i.e. it is contiguous) -- confirm.

    Args:
        p_, mu_, detS_, S_: parameter containers exposing ``.values``
            (``p_`` also ``.size``).
        cond_values: values of the fields in ``j_names`` to condition on.
        i_names: names of the numerical fields that are kept.
        j_names: names of the numerical fields that are conditioned out.
        all_num_removing: True if no numerical field is kept; then only the
            weights are updated.

    Raises:
        AssertionError: if an intermediate result fails the ``no_nan`` check.
    """
    # Translate field names into positions inside mu / Sigma.
    position_of = self._name_idx_map(mode='num')
    keep = [position_of[name] for name in i_names]
    drop = [position_of[name] for name in j_names]

    n_components = p_.size            # number of single Gaussians
    dim = len(self._numericals)       # dimension of each Gaussian

    p_flat = p_.values.reshape(n_components)
    mu_flat = mu_.values.reshape(n_components, dim)
    detS_flat = detS_.values.reshape(n_components)
    S_flat = S_.values.reshape(n_components, dim, dim)

    # ``mu_k`` and ``S_k`` are row views into ``mu_flat`` / ``S_flat``, so
    # writing through them updates the reshaped arrays.
    for idx, (mu_k, detS_k, S_k) in enumerate(zip(mu_flat, detS_flat, S_flat)):
        residual = cond_values - mu_k[drop]
        S_drop_inv = inv(S_k[ix_(drop, drop)])
        assert no_nan(S_drop_inv), "Inversion of Covariance Matrix failed."

        if all_num_removing:
            # Nothing is kept, so there is no conditional covariance.
            det_cond = 1
        else:
            # S_ij * S_jj^-1, shared by the Sigma and the mu update.
            gain = np.dot(S_k[ix_(keep, drop)], S_drop_inv)
            assert no_nan(gain), "Sigma_expr contains nan"
            # Upper Schur complement; the right-hand side is evaluated
            # before the in-place subtraction is written back.
            S_k[ix_(keep, keep)] -= dot(gain, S_k[ix_(drop, keep)])
            mu_k[keep] += dot(gain, residual)
            # Determinant of the already updated (conditional) block.
            det_cond = abs(det(S_k[ix_(keep, keep)]))

        # Re-weight this component.
        det_quotient = (det_cond**0.5) * detS_k
        assert no_nan(det_quotient)
        mahalanobis = dot(residual, dot(S_drop_inv, residual))
        p_flat[idx] *= det_quotient * exp(-0.5 * mahalanobis)
        assert no_nan(p_flat[idx])
def _conditionout_categorical(self, cat_remove):
    """Condition out the categorical fields named in ``cat_remove``.

    Does nothing if ``cat_remove`` is empty. Otherwise ``_p`` is restricted
    to the conditioned values and renormalized by its sum, ``_mu`` and
    ``_S`` are restricted the same way (only if numerical fields exist), and
    the removed names are dropped from ``_categoricals``.

    Raises:
        AssertionError: if the renormalized ``_p`` fails the ``no_nan`` check.
    """
    if len(cat_remove) == 0:
        return

    # name -> condition value, used as a ``.loc`` selector for p, mu and S.
    selection = dict(self._condition_values(names=cat_remove, pairflag=True))

    # Trim the probability look-up table to the selected sub-range and
    # normalize it.
    selected_p = self._p.loc[selection]
    self._p = selected_p / selected_p.sum()
    assert no_nan(self._p), "Renormalization of p failed."

    # mu and S keep just the slice that is conditioned on. This also covers
    # conditioning on all categoricals: the single selected mu / S remains.
    has_numericals = len(self._numericals) != 0
    if has_numericals:
        self._mu = self._mu.loc[selection]
        self._S = self._S.loc[selection]

    # Update the bookkeeping of categorical field names.
    self._categoricals = [
        name for name in self._categoricals if name not in cat_remove
    ]
def _assert_invariants(self):
    """Assert that every model parameter passes the ``no_nan`` check.

    The parameters are checked in the order ``_detS``, ``_SInv``, ``_S``,
    ``_mu``, ``_p``.

    Raises:
        AssertionError: on the first parameter that fails the check.
    """
    parameters = (self._detS, self._SInv, self._S, self._mu, self._p)
    for parameter in parameters:
        assert no_nan(parameter)
def _conditionout_continuous(self, num_remove):
    """Condition out the numerical fields named in ``num_remove``.

    Does nothing if ``num_remove`` is empty. Otherwise ``_mu`` and ``_S``
    (and, if categorical dimensions remain, ``_p``) are replaced by the
    parameters of the conditional distribution, and the removed names are
    dropped from ``_numericals``.

    Raises:
        AssertionError: if an intermediate result fails the ``no_nan`` check.
    """
    if len(num_remove) == 0:
        return

    # Collect the singular values to condition on.
    cond_values = self._condition_values(num_remove)

    # TODO: this is ugly. We should incorporate _numericals, _categoricals
    # in the base model already, then the normalization could live there too.
    # ``opts`` is looked up defensively because not every class that uses
    # this method defines it.
    if hasattr(self, 'opts') and self.opts['normalized']:
        cond_values = self._normalizer.norm(cond_values, mode="by name",
                                            names=num_remove)
        self._normalizer.update(num_remove=num_remove)

    # Field names to condition out (drop) and to keep.
    drop = num_remove
    keep = [name for name in self._numericals if name not in num_remove]

    # All dimensions of mu except the last one are categorical dimensions.
    cat_dims = self._mu.dims[:-1]
    removing_all_numericals = len(num_remove) == len(self._numericals)

    if len(cat_dims) == 0:
        # Special case: no categorical fields left, so there is nothing to
        # stack over; only the single mu / Sigma needs updating.
        cov = self._S
        mean = self._mu
        gain = np.dot(cov.loc[keep, drop], inv(cov.loc[drop, drop]))
        assert no_nan(gain), "Sigma_expr contains nan"
        # Upper Schur complement.
        self._S = cov.loc[keep, keep] - dot(gain, cov.loc[drop, keep])
        self._mu = mean.loc[keep] + dot(gain, cond_values - mean.loc[drop])
        # p: nothing to update, it is empty.
    else:
        # The actually difficult case: update every single Gaussian.
        # (A ``_conditionout_continuous_internal_slow`` variant was used
        # here previously.)
        self._conditionout_continuous_internal_fast(
            self._p, self._mu, self._detS, self._S, cond_values, keep, drop,
            removing_all_numericals)

        # Rescale p to sum to one.
        # TODO: is this wrong? Why do we not automatically get a normalized
        # model?
        total = self._p.sum()
        if total != 0:
            self._p /= total
        else:
            logger.warning(
                "creating a conditional model with extremely low probability and alike low predictive "
                "power")
            # Fall back to uniform weights.
            self._p.values = np.full_like(self._p.values, 1 / self._p.size)

        # The internal update only touched the relevant part of mu and
        # Sigma; the remaining part is sliced out now.
        if removing_all_numericals:
            self._mu = xr.DataArray([])
            self._S = xr.DataArray([])
        else:
            self._mu = self._mu.loc[dict(mean=keep)]
            self._S = self._S.loc[dict(S1=keep, S2=keep)]

    self._numericals = [
        name for name in self._numericals if name not in num_remove
    ]