CONFIG = \
"""
model:
"""
output_dir = '/tmp/vae_pp'
# network configuration
batch_size = 32
max_iter = 50000
encoder = vi.NetConf([256, 256, 256], flatten_inputs=True, name='Encoder')
decoder = vi.NetConf([256, 256, 256], flatten_inputs=True, name='Decoder')
encoded_size = 16
posteriors_info = [
    ('gaussian', 'mvndiag', 'mvntril'),
    (
        D.Sample(D.Normal(loc=0., scale=1.),
                 sample_shape=encoded_size,
                 name='independent'),
        D.MultivariateNormalDiag(loc=tf.zeros(encoded_size),
                                 scale_diag=tf.ones(encoded_size),
                                 name='mvndiag'),
        D.MultivariateNormalTriL(loc=tf.zeros(encoded_size),
                                 scale_tril=bj.FillScaleTriL()(tf.ones(
                                     encoded_size * (encoded_size + 1) // 2)),
                                 name='mvntril'),
        D.MixtureSameFamily(
            components_distribution=D.MultivariateNormalDiag(
                loc=tf.zeros([10, encoded_size]),
                scale_diag=tf.ones([10, encoded_size])),
            mixture_distribution=D.Categorical(logits=tf.fill([10], 1.0 / 10)),
            name='gmm10'),
    ),
]
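# A minimal standalone sketch of the priors registered above, assuming only
# TensorFlow / TensorFlow Probability; the `D` and `bj` aliases used in the
# config are assumed to be the ones spelled out here:
import tensorflow as tf
import tensorflow_probability as tfp

D, bj = tfp.distributions, tfp.bijectors

encoded_size = 16
# diagonal-covariance Gaussian prior over the 16-dim latent code
mvndiag = D.MultivariateNormalDiag(loc=tf.zeros(encoded_size),
                                   scale_diag=tf.ones(encoded_size))
z = mvndiag.sample(4)            # shape [4, 16]
log_pz = mvndiag.log_prob(z)     # shape [4]
# full-covariance prior: FillScaleTriL turns a flat vector of
# encoded_size * (encoded_size + 1) // 2 values into a lower-triangular scale
scale_tril = bj.FillScaleTriL()(tf.ones(encoded_size * (encoded_size + 1) // 2))
mvntril = D.MultivariateNormalTriL(loc=tf.zeros(encoded_size),
                                   scale_tril=scale_tril)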
def _default_prior(event_shape, posterior, prior, posterior_kwargs):
  if not isinstance(event_shape, (Sequence, MutableSequence, tf.TensorShape)):
    raise ValueError("event_shape must be list of integer but given: "
                     f"{event_shape} type: {type(event_shape)}")
  if isinstance(prior, (Distribution, DistributionLambda, Callable)):
    return prior
  elif not isinstance(prior, (string_types, type(None))):
    raise ValueError("prior must be string or instance of "
                     f"Distribution or DistributionLambda, but given: {prior}")
  # no prior given
  layer, dist = parse_distribution(posterior)
  if isinstance(prior, dict):
    kw = dict(prior)
    prior = None
  else:
    kw = {}
  event_size = int(np.prod(event_shape))

  ## helper function
  def _kwargs(**args):
    for k, v in args.items():
      if k not in kw:
        kw[k] = v
    return kw

  ## Normal
  if layer == obl.GaussianLayer:
    prior = obd.Independent(
        obd.Normal(**_kwargs(loc=tf.zeros(shape=event_shape),
                             scale=tf.ones(shape=event_shape))),
        reinterpreted_batch_ndims=1,
    )
  ## Multivariate Normal
  elif issubclass(layer, obl.MultivariateNormalLayer):
    cov = layer._partial_kwargs['covariance']
    if cov == 'diag':  # diagonal covariance
      loc = tf.zeros(shape=event_shape)
      if tf.rank(loc) == 0:
        loc = tf.expand_dims(loc, axis=-1)
      prior = obd.MultivariateNormalDiag(
          **_kwargs(loc=loc, scale_identity_multiplier=1.))
    else:  # low-triangle covariance
      bijector = tfp.bijectors.FillScaleTriL(
          diag_bijector=tfp.bijectors.Identity(), diag_shift=1e-5)
      size = tf.reduce_prod(event_shape)
      loc = tf.zeros(shape=[size])
      scale_tril = bijector.forward(tf.ones(shape=[size * (size + 1) // 2]))
      prior = obd.MultivariateNormalTriL(
          **_kwargs(loc=loc, scale_tril=scale_tril))
  ## Log Normal
  elif layer == obl.LogNormalLayer:
    prior = obd.Independent(
        obd.LogNormal(**_kwargs(loc=tf.zeros(shape=event_shape),
                                scale=tf.ones(shape=event_shape))),
        reinterpreted_batch_ndims=1,
    )
  ## mixture
  elif issubclass(layer, obl.MixtureGaussianLayer):
    if hasattr(layer, '_partial_kwargs'):
      cov = layer._partial_kwargs['covariance']
    else:
      cov = 'none'
    n_components = int(posterior_kwargs.get('n_components', 2))
    if cov == 'diag':
      scale_shape = [n_components, event_size]
      fn = lambda l, s: obd.MultivariateNormalDiag(
          loc=l, scale_diag=tf.nn.softplus(s))
    elif cov == 'none':
      scale_shape = [n_components, event_size]
      fn = lambda l, s: obd.Independent(
          obd.Normal(loc=l, scale=tf.math.softplus(s)),
          reinterpreted_batch_ndims=1,
      )
    elif cov in ('full', 'tril'):
      scale_shape = [n_components, event_size * (event_size + 1) // 2]
      fn = lambda l, s: obd.MultivariateNormalTriL(
          loc=l,
          scale_tril=tfp.bijectors.FillScaleTriL(diag_shift=1e-5)
          (tf.math.softplus(s)))
    loc = tf.cast(tf.fill([n_components, event_size], 0.), dtype=tf.float32)
    log_scale = tf.cast(tf.fill(scale_shape, np.log(np.expm1(1.))),
                        dtype=tf.float32)
    p = 1. / n_components
    mixture_logits = tf.cast(tf.fill([n_components], np.log(p / (1 - p))),
                             dtype=tf.float32)
    prior = obd.MixtureSameFamily(
        components_distribution=fn(loc, log_scale),
        mixture_distribution=obd.Categorical(logits=mixture_logits))
  ## discrete
  elif dist in (obd.OneHotCategorical, obd.Categorical) or \
      layer == obl.RelaxedOneHotCategoricalLayer:
    p = 1. / event_size
    prior = dist(**_kwargs(logits=[np.log(p / (1 - p))] * event_size),
                 dtype=tf.float32)
  elif dist == obd.Dirichlet:
    prior = dist(**_kwargs(concentration=[1.] * event_size))
  elif dist == obd.Bernoulli:
    prior = obd.Independent(
        obd.Bernoulli(**_kwargs(logits=np.zeros(event_shape)),
                      dtype=tf.float32),
        reinterpreted_batch_ndims=len(event_shape),
    )
  ## other
  return prior
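# Standalone sketch of the mixture branch above (cov == 'diag'), assuming only
# TensorFlow / TensorFlow Probability; `tfd` here plays the role of `obd`:
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

tfd = tfp.distributions

n_components, event_size = 2, 16
loc = tf.zeros([n_components, event_size])
# softplus(log(expm1(1))) == 1, so every component starts with unit scale
log_scale = tf.fill([n_components, event_size], float(np.log(np.expm1(1.))))
# equal logits log(p / (1 - p)) with p = 1 / n_components give uniform weights
p = 1. / n_components
mixture_logits = tf.fill([n_components], float(np.log(p / (1 - p))))

gmm_prior = tfd.MixtureSameFamily(
    components_distribution=tfd.MultivariateNormalDiag(
        loc=loc, scale_diag=tf.nn.softplus(log_scale)),
    mixture_distribution=tfd.Categorical(logits=mixture_logits))
assert gmm_prior.event_shape == [16]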
def _default_prior(event_shape, posterior, prior, posterior_kwargs):
  if isinstance(prior, obd.Distribution):
    return prior
  layer, dist = parse_distribution(posterior)
  if isinstance(prior, dict):
    kw = dict(prior)
    prior = None
  else:
    kw = {}
  event_size = int(np.prod(event_shape))

  ## helper function
  def _kwargs(**args):
    for k, v in args.items():
      if k not in kw:
        kw[k] = v
    return kw

  ## Normal
  if layer == obl.GaussianLayer:
    prior = obd.Independent(
        obd.Normal(**_kwargs(loc=tf.zeros(shape=event_shape),
                             scale=tf.ones(shape=event_shape))), 1)
  ## Multivariate Normal
  elif issubclass(layer, obl.MultivariateNormalLayer):
    cov = layer._partial_kwargs['covariance']
    if cov == 'diag':  # diagonal covariance
      loc = tf.zeros(shape=event_shape)
      if tf.rank(loc) == 0:
        loc = tf.expand_dims(loc, axis=-1)
      prior = obd.MultivariateNormalDiag(
          **_kwargs(loc=loc, scale_identity_multiplier=1.))
    else:  # low-triangle covariance
      bijector = tfp.bijectors.FillScaleTriL(
          diag_bijector=tfp.bijectors.Identity(), diag_shift=1e-5)
      size = tf.reduce_prod(event_shape)
      loc = tf.zeros(shape=[size])
      scale_tril = bijector.forward(
          tf.ones(shape=[size * (size + 1) // 2]))
      prior = obd.MultivariateNormalTriL(
          **_kwargs(loc=loc, scale_tril=scale_tril))
  ## Log Normal
  elif layer == obl.LogNormalLayer:
    prior = obd.Independent(
        obd.LogNormal(**_kwargs(loc=tf.zeros(shape=event_shape),
                                scale=tf.ones(shape=event_shape))), 1)
  ## mixture
  elif issubclass(layer, obl.MixtureGaussianLayer):
    if hasattr(layer, '_partial_kwargs'):
      cov = layer._partial_kwargs['covariance']
    else:
      cov = 'none'
    n_components = int(posterior_kwargs.get('n_components', 2))
    if cov == 'diag':
      scale_shape = [n_components, event_size]
      fn = lambda l, s: obd.MultivariateNormalDiag(
          loc=l, scale_diag=tf.nn.softplus(s))
    elif cov == 'none':
      scale_shape = [n_components, event_size]
      fn = lambda l, s: obd.Independent(
          obd.Normal(loc=l, scale=tf.math.softplus(s)), 1)
    elif cov in ('full', 'tril'):
      scale_shape = [n_components, event_size * (event_size + 1) // 2]
      fn = lambda l, s: obd.MultivariateNormalTriL(
          loc=l,
          scale_tril=tfp.bijectors.FillScaleTriL(diag_shift=1e-5)
          (tf.math.softplus(s)))
    loc = tf.cast(tf.fill([n_components, event_size], 0.), dtype=tf.float32)
    log_scale = tf.cast(tf.fill(scale_shape, np.log(np.expm1(1.))),
                        dtype=tf.float32)
    mixture_logits = tf.cast(tf.fill([n_components], 1.), dtype=tf.float32)
    prior = obd.MixtureSameFamily(
        components_distribution=fn(loc, log_scale),
        mixture_distribution=obd.Categorical(logits=mixture_logits))
  ## discrete
  elif dist in (obd.OneHotCategorical, obd.Categorical) or \
      layer == obl.RelaxedOneHotCategoricalLayer:
    prior = dist(**_kwargs(logits=np.log([1. / event_size] * event_size),
                           dtype=tf.float32))
  elif dist == obd.Dirichlet:
    prior = dist(**_kwargs(concentration=[1.] * event_size))
  elif dist == obd.Bernoulli:
    prior = obd.Independent(
        obd.Bernoulli(**_kwargs(logits=np.full(event_shape, np.log(0.5)),
                                dtype=tf.float32)),
        len(event_shape))
  ## other
  return prior
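# Standalone sketch of the discrete branch above: a uniform categorical prior.
# Any constant logit vector yields the uniform distribution, so both the
# log(p / (1 - p)) form and the log(1 / event_size) form end up uniform.
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

tfd = tfp.distributions

event_size = 10
uniform_prior = tfd.OneHotCategorical(
    logits=np.log([1. / event_size] * event_size), dtype=tf.float32)
print(uniform_prior.probs_parameter())  # ~[0.1, 0.1, ..., 0.1]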
def create_posterior(self,
                     input_shape: Optional[List[int]] = None,
                     name: Optional[str] = None) -> obl.DenseDistribution:
  r""" Initiate a Distribution for the random variable """
  # use Gaussian noise as prior distribution for deterministic case
  if self.is_deterministic:
    prior = obd.Independent(
        obd.Normal(loc=tf.zeros(shape=self.event_shape),
                   scale=tf.ones(shape=self.event_shape)),
        reinterpreted_batch_ndims=1,
    )
  else:
    prior = _default_prior(self.event_shape, self.posterior, self.prior,
                           self.kwargs)
  event_shape = self.event_shape
  posterior = self.posterior
  posterior_kwargs = dict(self.kwargs)
  name = self.name if name is None else str(name)
  # ====== deterministic distribution with loss function from tensorflow ====== #
  if posterior in dir(tf.losses) or posterior in dir(keras.activations):
    distribution_layer = obl.VectorDeterministicLayer
    if posterior in dir(tf.losses):
      activation = 'linear'
      fn = tf.losses.get(str(posterior))
    else:  # just activation function, loss default MSE
      activation = keras.activations.get(self.posterior)
      fn = tf.losses.get(posterior_kwargs.pop('loss', 'mse'))
    posterior_kwargs['log_prob'] = \
        lambda self, y_true: -fn(y_true, self.mean())
  # ====== probabilistic loss ====== #
  else:
    distribution_layer = parse_distribution(self.posterior)[0]
    activation = self.preactivation
  # ====== create distribution layers ====== #
  kw = dict(projection=self.projection)
  if input_shape is not None:
    kw['input_shape'] = input_shape
  ### create the layer
  ## mixture distributions
  if posterior in ('mdn', 'mixdiag', 'mixfull', 'mixtril'):
    posterior_kwargs.pop('covariance', None)
    posterior_kwargs.update(kw)
    # dense network for projection
    layer = obl.MixtureDensityNetwork(event_shape,
                                      loc_activation=activation,
                                      scale_activation='softplus1',
                                      covariance=dict(
                                          mdn='none',
                                          mdndiag='diag',
                                          mdnfull='tril',
                                          mdntril='tril')[posterior],
                                      name=name,
                                      prior=prior,
                                      dropout=self.dropout,
                                      **posterior_kwargs)
  ## non-mixture distribution
  else:
    layer = obl.DenseDistribution(event_shape,
                                  posterior=distribution_layer,
                                  prior=prior,
                                  activation=activation,
                                  posterior_kwargs=posterior_kwargs,
                                  dropout=self.dropout,
                                  name=name,
                                  **kw)
  ### set attributes
  if not hasattr(layer, 'event_shape'):
    layer.event_shape = event_shape
  # build the layer in advance
  if input_shape is not None and layer.projection:
    inputs = keras.Input(shape=input_shape, batch_size=None)
    layer(inputs)
  return layer
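# The layer returned by `create_posterior` is odin-ai specific
# (`obl.DenseDistribution`); the sketch below shows the equivalent pattern with
# plain Keras + tfp.layers only, and is an assumption, not the library's API:
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow import keras

tfd, tfpl = tfp.distributions, tfp.layers

event_size = 16
prior = tfd.Independent(tfd.Normal(loc=tf.zeros(event_size), scale=1.),
                        reinterpreted_batch_ndims=1)
# dense projection into the parameters of an independent Gaussian posterior,
# with KL(q || prior) added to the model losses via an activity regularizer
posterior_layer = keras.Sequential([
    keras.layers.Dense(tfpl.IndependentNormal.params_size(event_size)),
    tfpl.IndependentNormal(
        event_size,
        activity_regularizer=tfpl.KLDivergenceRegularizer(prior, weight=1.0)),
])
q_z = posterior_layer(tf.zeros([8, 256]))  # a batch of tfd.Independent(Normal)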
def create_divergence_matrix(self,
                             n_samples=1000,
                             lognorm=True,
                             n_components=2,
                             normalize_per_code=True,
                             decode=False):
  r""" Use a GMM fitted on the factors to estimate the divergence to each
  latent code, i.e. calculate the divergence `DKL(q(z|x)||p(y))`, where:

  - `q(z|x)` is the Gaussian distribution of a latent code
  - `p(y)` is a Gaussian mixture model with `n_components` fitted on a factor

  The calculation is repeated for each pair of (code, factor). This method is
  recommended for factors with continuous values.

  Return:
    a matrix of shape `[n_codes, n_factors]`
  """
  n_samples = int(n_samples)
  n_codes = self.n_codes
  n_factors = self.n_factors
  matrices = []
  for qZ, y in zip(self.representations, self.original_factors):
    ### normalizing the factors
    if lognorm:
      y = np.log1p(y)
    # standardizing for each factor
    y = (y - np.mean(y, axis=0, keepdims=True)) / (
        np.std(y, axis=0, keepdims=True) + 1e-10)
    ### train the Gaussian mixture on the factors
    f_gmm = []
    for fidx, (f, fname) in enumerate(zip(y.T, self.factor_names)):
      gmm = tfd.GaussianMixture.init(f[:, np.newaxis],
                                     n_components=n_components,
                                     covariance_type='diag',
                                     batch_shape=None,
                                     dtype=tf.float64,
                                     name=fname)
      f_gmm.append(gmm)
    ### the code Gaussian
    z_gau = []
    for mean, stddev, code_name in zip(tf.transpose(qZ.mean()),
                                       tf.transpose(qZ.stddev()),
                                       self.code_names):
      mean = tf.cast(mean, tf.float64)
      stddev = tf.cast(stddev, tf.float64)
      z_gau.append(
          tfd.Independent(tfd.Normal(loc=mean, scale=stddev, name=code_name),
                          reinterpreted_batch_ndims=1))
    ### calculate the KL divergence
    density_matrix = np.empty(shape=(n_codes, n_factors), dtype=np.float64)
    for zidx, gau in enumerate(z_gau):
      for fidx, gmm in enumerate(f_gmm):
        # non-analytic KL(q=gau||p=gmm)
        samples = gau.sample(n_samples)
        with tf.device("/CPU:0"):
          qllk = gau.log_prob(samples)
          pllk = tf.reduce_sum(tf.reshape(
              gmm.log_prob(tf.reshape(samples, (-1, 1))), (n_samples, -1)),
                               axis=1)
          kl = tf.reduce_mean(qllk - pllk)
        density_matrix[zidx, fidx] = kl.numpy()
    if bool(normalize_per_code):
      density_matrix = density_matrix / np.sum(
          density_matrix, axis=1, keepdims=True)
    matrices.append(density_matrix)
  ### decoding and return
  train, test = matrices
  if decode:
    ids = search.diagonal_linear_assignment(train.T)
    train = train[ids]
    test = test[ids]
    return train, test, ids
  return train, test
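# The inner loop above is a Monte Carlo estimate of KL(q || p), where q is a
# diagonal Gaussian code and p is a GMM fitted on a factor; a minimal sketch
# of that estimate with plain TensorFlow Probability (toy parameters assumed):
import tensorflow as tf
import tensorflow_probability as tfp

tfd = tfp.distributions

# q(z|x): a 1-D diagonal Gaussian latent code
q = tfd.Independent(tfd.Normal(loc=[0.5], scale=[1.0]),
                    reinterpreted_batch_ndims=1)
# p(y): a 2-component Gaussian mixture standing in for the fitted factor GMM
p = tfd.MixtureSameFamily(
    mixture_distribution=tfd.Categorical(probs=[0.3, 0.7]),
    components_distribution=tfd.Normal(loc=[-1.0, 2.0], scale=[0.5, 1.0]))

# KL(q || p) ~= E_q[log q(z) - log p(z)] averaged over the drawn samples
z = q.sample(1000)  # shape [1000, 1]
kl = tf.reduce_mean(q.log_prob(z) - p.log_prob(tf.squeeze(z, -1)))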