def stochastic_layer(layer_in, n, samples, nonlin=None):
    # Gaussian reparameterisation: two dense heads predict the mean and the
    # log-variance, and SampleLayer draws `samples` Monte Carlo samples.
    mu = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin)
    logvar = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin)
    return SampleLayer(mu, logvar, eq_samples=samples, iw_samples=1), mu, logvar


def stochastic_layer(layer_in, n, samples, nonlin=None):
    # Keyword-argument variant of the helper above, as used by the recurrent models.
    mu = DenseLayer(layer_in, n, W=init.Normal(init_w, mean=.0),
                    b=init.Normal(init_w), nonlinearity=nonlin)
    logvar = DenseLayer(layer_in, n, W=init.Normal(init_w, mean=.0),
                        b=init.Normal(init_w), nonlinearity=nonlin)
    # logvar = ConstrainLayer(logvar, scale=1, max=T.log(-0.999 * self.sym_warmup + 1.0999))
    return SampleLayer(mu, logvar, eq_samples=samples, iw_samples=1), mu, logvar
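# The SampleLayer used above is assumed to implement the Gaussian
# reparameterisation trick: z = mu + eps * exp(0.5 * logvar), eps ~ N(0, 1).
# A minimal, self-contained sketch of that sampling (NumPy, illustrative
# shapes only; this is not the library implementation):
import numpy as np

def sample_gaussian(mu, logvar, eq_samples=1, rng=np.random):
    """Draw `eq_samples` reparameterised samples per row of mu/logvar."""
    batch, n = mu.shape
    eps = rng.standard_normal((batch, eq_samples, n))
    # Broadcast mu and logvar over the sample axis.
    return mu[:, None, :] + eps * np.exp(0.5 * logvar[:, None, :])

# Example: 4 latent samples for a batch of 2 with 3 latent units.
z = sample_gaussian(np.zeros((2, 3)), np.zeros((2, 3)), eq_samples=4)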
def dense_layer(layer_in, n, dist_w=init.GlorotNormal, dist_b=init.Normal):
    # Dense layer with optional batch normalisation inserted before the nonlinearity.
    # `bn` is the batch-normalisation helper imported elsewhere in this module.
    dense = DenseLayer(layer_in, n, dist_w(hid_w), dist_b(init_w), None)
    if batchnorm:
        dense = bn(dense)
    return NonlinearityLayer(dense, self.transf)


def dense_layer(layer_in, n, dist_w=init.GlorotNormal, dist_b=init.Normal):
    # Keyword-argument variant using BatchNormLayer directly.
    dense = DenseLayer(layer_in, num_units=n, W=dist_w(hid_w), b=dist_b(init_w),
                       nonlinearity=None)
    if batchnorm:
        dense = BatchNormLayer(dense)
    return NonlinearityLayer(dense, self.transf)
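# Usage sketch for the dense_layer helper, assuming the enclosing constructor
# has bound batchnorm, hid_w ('relu' for rectify/softplus), init_w (1e-3) and
# self.transf as in the models below; the names here are illustrative only.
from lasagne.layers import InputLayer, DenseLayer, BatchNormLayer, NonlinearityLayer
from lasagne.nonlinearities import rectify
from lasagne import init

batchnorm, hid_w, init_w, transf = True, 'relu', 1e-3, rectify

def dense_layer_demo(layer_in, n):
    dense = DenseLayer(layer_in, num_units=n, W=init.GlorotNormal(hid_w),
                       b=init.Normal(init_w), nonlinearity=None)
    if batchnorm:
        dense = BatchNormLayer(dense)
    return NonlinearityLayer(dense, transf)

l_in = InputLayer((None, 64))
l_hid = dense_layer_demo(dense_layer_demo(l_in, 128), 128)  # two hidden layers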
def __init__(self, n_x, n_z, qz_hid, px_hid, filters, seq_length=50, nonlinearity=rectify,
             px_nonlinearity=None, x_dist='linear', batchnorm=False, seed=1234):
    """
    Weights are initialized using the Bengio and Glorot (2010) initialization scheme.

    :param n_x: Number of inputs.
    :param n_z: Number of latent.
    :param qz_hid: List of number of deterministic hidden q(z|x).
    :param px_hid: List of number of deterministic hidden p(x|z).
    :param filters: List of (num_filters, stride, pool) tuples for the convolutional encoder.
    :param seq_length: Length of the input sequences.
    :param nonlinearity: The transfer function used in the deterministic layers.
    :param x_dist: The x distribution, 'bernoulli', 'multinomial', 'gaussian', or 'linear'.
    :param batchnorm: Boolean value for batch normalization.
    :param seed: The random seed.
    """
    super(CVAE, self).__init__(n_x, qz_hid + px_hid, n_z, nonlinearity)
    self.x_dist = x_dist
    self.n_x = n_x
    self.seq_length = seq_length
    self.n_z = n_z
    self.batchnorm = batchnorm
    self._srng = RandomStreams(seed)

    # Pool layer cache
    pool_layers = []

    # Decide Glorot initialization of weights.
    init_w = 1e-3
    hid_w = ""
    if nonlinearity == rectify or nonlinearity == softplus:
        hid_w = "relu"

    # Define symbolic variables for theano functions.
    self.sym_x = T.tensor3('x')  # inputs
    self.sym_z = T.matrix('z')
    self.sym_samples = T.iscalar('samples')  # MC samples

    # Assist methods for collecting the layers
    def dense_layer(layer_in, n, dist_w=init.GlorotNormal, dist_b=init.Normal):
        dense = DenseLayer(layer_in, n, dist_w(hid_w), dist_b(init_w), None)
        if batchnorm:
            dense = bn(dense)
        return NonlinearityLayer(dense, self.transf)

    def stochastic_layer(layer_in, n, samples, nonlin=None):
        mu = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin)
        logvar = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin)
        return SampleLayer(mu, logvar, eq_samples=samples, iw_samples=1), mu, logvar

    def conv_layer(layer_in, filter, stride=(1, 1), pool=1, name='conv'):
        l_conv = Conv2DLayer(layer_in, num_filters=filter, filter_size=(3, 1),
                             stride=stride, pad='full', name=name)
        if pool > 1:
            l_conv = MaxPool2DLayer(l_conv, pool_size=(pool, 1))
        pool_layers.append(l_conv)
        return l_conv

    # Reshape input
    l_x_in = InputLayer((None, seq_length, n_x), name='Input')
    l_x_in_reshp = ReshapeLayer(l_x_in, (-1, 1, seq_length, n_x))
    print("l_x_in_reshp", l_x_in_reshp.output_shape)

    # CNN encoder implementation
    l_conv_enc = l_x_in_reshp
    for filter, stride, pool in filters:
        l_conv_enc = conv_layer(l_conv_enc, filter, stride, pool)
        print("l_conv_enc", l_conv_enc.output_shape)

    # Pool along last 2 axes
    l_global_pool_enc = GlobalPoolLayer(l_conv_enc)
    l_enc = dense_layer(l_global_pool_enc, n_z)
    print("l_enc", l_enc.output_shape)

    # Recognition q(z|x)
    l_qz = l_enc
    for hid in qz_hid:
        l_qz = dense_layer(l_qz, hid)
    l_qz, l_qz_mu, l_qz_logvar = stochastic_layer(l_qz, n_z, self.sym_samples)
    print("l_qz", l_qz.output_shape)

    # Inverse pooling
    l_global_depool = InverseLayer(l_qz, l_global_pool_enc)
    print("l_global_depool", l_global_depool.output_shape)

    # Reverse pool layer order
    pool_layers = pool_layers[::-1]

    # Decode
    l_deconv = l_global_depool
    for idx, filter in enumerate(filters[::-1]):
        filter, stride, pool = filter
        if pool > 1:
            l_deconv = InverseLayer(l_deconv, pool_layers[idx])
        l_deconv = Conv2DLayer(l_deconv, num_filters=filter, filter_size=(3, 1),
                               stride=(stride, 1), W=init.GlorotNormal('relu'))
        print("l_deconv", l_deconv.output_shape)

    # The last l_conv layer should give us the input shape
    l_dec = Conv2DLayer(l_deconv, num_filters=1, filter_size=(3, 1), pad='same', nonlinearity=None)
    print("l_dec", l_dec.output_shape)

    # Flatten first two dimensions
    l_dec = ReshapeLayer(l_dec, (-1, n_x))

    l_px = l_dec
    if x_dist == 'bernoulli':
        l_px = DenseLayer(l_px, n_x, init.GlorotNormal(), init.Normal(init_w), sigmoid)
    elif x_dist == 'multinomial':
        l_px = DenseLayer(l_px, n_x, init.GlorotNormal(), init.Normal(init_w), softmax)
    elif x_dist == 'gaussian':
        l_px, l_px_mu, l_px_logvar = stochastic_layer(l_px, n_x, self.sym_samples, px_nonlinearity)
    elif x_dist == 'linear':
        l_px = DenseLayer(l_px, n_x, nonlinearity=None)

    # Reshape all the model layers to have the same size
    self.l_x_in = l_x_in
    self.l_qz = ReshapeLayer(l_qz, (-1, self.sym_samples, 1, n_z))
    self.l_qz_mu = DimshuffleLayer(l_qz_mu, (0, 'x', 'x', 1))
    self.l_qz_logvar = DimshuffleLayer(l_qz_logvar, (0, 'x', 'x', 1))
    self.l_px = DimshuffleLayer(
        ReshapeLayer(l_px, (-1, seq_length, self.sym_samples, 1, n_x)), (0, 2, 3, 1, 4))
    self.l_px_mu = DimshuffleLayer(ReshapeLayer(l_px_mu, (-1, seq_length, self.sym_samples, 1, n_x)), (0, 2, 3, 1, 4)) \
        if x_dist == "gaussian" else None
    self.l_px_logvar = DimshuffleLayer(ReshapeLayer(l_px_logvar, (-1, seq_length, self.sym_samples, 1, n_x)), (0, 2, 3, 1, 4)) \
        if x_dist == "gaussian" else None

    # Predefined functions
    inputs = {self.l_x_in: self.sym_x}
    outputs = get_output(l_qz, inputs, deterministic=True)
    self.f_qz = theano.function([self.sym_x, self.sym_samples], outputs)

    inputs = {l_qz: self.sym_z}
    outputs = get_output(self.l_px, inputs, deterministic=True).mean(axis=(1, 2))
    self.f_px = theano.function([self.sym_z, self.sym_samples], outputs)

    if x_dist == "gaussian":
        # Mean and variance functions are only defined for a Gaussian output distribution.
        outputs = get_output(self.l_px_mu, inputs, deterministic=True).mean(axis=(1, 2))
        self.f_mu = theano.function([self.sym_z, self.sym_samples], outputs)

        outputs = get_output(self.l_px_logvar, inputs, deterministic=True).mean(axis=(1, 2))
        self.f_var = theano.function([self.sym_z, self.sym_samples], outputs)

    # Define model parameters
    self.model_params = get_all_params([self.l_px])
    self.trainable_model_params = get_all_params([self.l_px], trainable=True)
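# Illustrative instantiation of the CVAE above and a round trip through the
# compiled f_qz/f_px functions.  All sizes are assumptions, and the filters
# argument follows the (num_filters, stride, pool) format consumed by the
# encoder loop; the exact numbers may need adjusting so that the unpooled
# decoder shapes line up with the input.
import numpy as np

model = CVAE(n_x=3, n_z=32, qz_hid=[128], px_hid=[128],
             filters=[(32, 1, 2), (64, 1, 2)],
             seq_length=50, x_dist='linear')

x = np.random.randn(8, 50, 3).astype(np.float32)   # (batch, time, features)
z = model.f_qz(x, 10)                               # encode with 10 MC samples
x_rec = model.f_px(z, 10)                           # decode back to input space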
def __init__(self, n_c, px_hid, enc_rnn=256, dec_rnn=256, n_l=50,
             nonlinearity=rectify, batchnorm=False, seed=1234):
    """
    Weights are initialized using the Bengio and Glorot (2010) initialization scheme.

    :param n_c: Number of inputs.
    :param px_hid: List of number of deterministic hidden units in the generative mapping p(x_hat|x).
    :param enc_rnn: Number of units in the encoder LSTMs.
    :param dec_rnn: Number of units in the decoder LSTMs.
    :param n_l: Maximum sequence length.
    :param nonlinearity: The transfer function used in the deterministic layers.
    :param batchnorm: Boolean value for batch normalization.
    :param seed: The random seed.
    """
    super(RAE, self).__init__(n_c, px_hid, enc_rnn, nonlinearity)
    self.n_x = n_c
    self.max_seq_length = n_l
    self.batchnorm = batchnorm
    self._srng = RandomStreams(seed)

    # Decide Glorot initialization of weights.
    init_w = 1e-3
    hid_w = ""
    if nonlinearity == rectify or nonlinearity == softplus:
        hid_w = "relu"

    # Define symbolic variables for theano functions.
    self.sym_x = T.tensor3('x')  # inputs

    # Assist methods for collecting the layers
    def dense_layer(layer_in, n, dist_w=init.GlorotNormal, dist_b=init.Normal):
        dense = DenseLayer(layer_in, n, dist_w(hid_w), dist_b(init_w), nonlinearity=None)
        if batchnorm:
            dense = BatchNormLayer(dense)
        return NonlinearityLayer(dense, self.transf)

    def lstm_layer(input, nunits, return_final, backwards=False, name='LSTM'):
        ingate = Gate(W_in=init.Uniform(0.01), W_hid=init.Uniform(0.01), b=init.Constant(0.0))
        forgetgate = Gate(W_in=init.Uniform(0.01), W_hid=init.Uniform(0.01), b=init.Constant(5.0))
        cell = Gate(W_cell=None, nonlinearity=T.tanh,
                    W_in=init.Uniform(0.01), W_hid=init.Uniform(0.01))
        outgate = Gate(W_in=init.Uniform(0.01), W_hid=init.Uniform(0.01), b=init.Constant(0.0))
        lstm = LSTMLayer(input, num_units=nunits, backwards=backwards, peepholes=False,
                         ingate=ingate, forgetgate=forgetgate, cell=cell, outgate=outgate,
                         name=name, only_return_final=return_final)
        # Plain recurrent alternative; constructed but unused, the LSTM is returned.
        rec = RecurrentLayer(input, num_units=nunits,
                             W_in_to_hid=init.GlorotNormal('relu'),
                             W_hid_to_hid=init.GlorotNormal('relu'),
                             backwards=backwards, nonlinearity=rectify,
                             only_return_final=return_final, name=name)
        return lstm

    # RNN encoder implementation
    l_x_in = InputLayer((None, None, n_c))
    l_enc_forward = lstm_layer(l_x_in, enc_rnn, return_final=True, backwards=False, name='enc_forward')
    l_enc_backward = lstm_layer(l_x_in, enc_rnn, return_final=True, backwards=True, name='enc_backward')
    l_enc_concat = ConcatLayer([l_enc_forward, l_enc_backward], axis=-1)
    l_enc = dense_layer(l_enc_concat, enc_rnn)

    # RNN decoder implementation
    l_dec_repeat = RepeatLayer(l_enc, n=n_l)
    l_dec_forward = lstm_layer(l_dec_repeat, dec_rnn, return_final=False, backwards=False, name='dec_forward')
    l_dec_backward = lstm_layer(l_dec_repeat, dec_rnn, return_final=False, backwards=True, name='dec_backward')
    l_dec_concat = ConcatLayer([l_dec_forward, l_dec_backward], axis=-1)
    l_dec = ReshapeLayer(l_dec_concat, (-1, 2 * dec_rnn))
    l_dec = dense_layer(l_dec, dec_rnn)

    # Generative p(x_hat|x)
    l_px = l_dec
    for hid in px_hid:
        l_px = dense_layer(l_px, hid)

    # Output
    self.l_enc = l_enc
    l_px = DenseLayer(l_px, n_c, nonlinearity=None)
    self.l_px = ReshapeLayer(l_px, (-1, n_l, n_c))
    self.l_x_in = l_x_in

    inputs = {l_x_in: self.sym_x}
    outputs = get_output(self.l_px, inputs, deterministic=True)
    self.f_px = theano.function([self.sym_x], outputs, on_unused_input='warn')

    # Define model parameters
    self.encoder_params = get_all_param_values(self.l_enc)
    self.model_params = get_all_params(self.l_px)
    self.trainable_model_params = get_all_params(self.l_px, trainable=True)
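# RepeatLayer is imported from elsewhere in the repository.  It is assumed to
# tile a (batch, features) encoding n times along a new time axis so it can be
# fed to the decoder LSTMs as a (batch, n, features) sequence.  A minimal
# sketch of such a layer:
import theano.tensor as T
from lasagne.layers import Layer

class RepeatLayerSketch(Layer):
    """Repeat a 2D input n times along a new second axis."""
    def __init__(self, incoming, n, **kwargs):
        super(RepeatLayerSketch, self).__init__(incoming, **kwargs)
        self.n = n

    def get_output_shape_for(self, input_shape):
        return (input_shape[0], self.n, input_shape[1])

    def get_output_for(self, input, **kwargs):
        # Insert a broadcastable time axis and tile it n times.
        return T.tile(input.dimshuffle(0, 'x', 1), (1, self.n, 1))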
def __init__(self, n_c, n_l, n_a, n_z, n_y, qa_hid, qz_hid, qy_hid, px_hid, pa_hid, filters, nonlinearity=rectify, px_nonlinearity=None, x_dist='bernoulli', batchnorm=False, seed=1234): """ Initialize an skip deep generative model consisting of discriminative classifier q(y|a,x), generative model P p(a|z,y) and p(x|a,z,y), inference model Q q(a|x) and q(z|a,x,y). Weights are initialized using the Bengio and Glorot (2010) initialization scheme. :param n_c: Number of input channels. :param n_l: Number of lengths. :param n_a: Number of auxiliary. :param n_z: Number of latent. :param n_y: Number of classes. :param qa_hid: List of number of deterministic hidden q(a|x). :param qz_hid: List of number of deterministic hidden q(z|a,x,y). :param qy_hid: List of number of deterministic hidden q(y|a,x). :param px_hid: List of number of deterministic hidden p(a|z,y) & p(x|z,y). :param nonlinearity: The transfer function used in the deterministic layers. :param x_dist: The x distribution, 'bernoulli', 'multinomial', or 'gaussian'. :param batchnorm: Boolean value for batch normalization. :param seed: The random seed. """ super(CSDGM, self).__init__(n_c, qz_hid + px_hid, n_a + n_z, nonlinearity) self.x_dist = x_dist self.n_y = n_y self.n_c = n_c self.n_l = n_l self.n_a = n_a self.n_z = n_z self.batchnorm = batchnorm self._srng = RandomStreams(seed) # Decide Glorot initializaiton of weights. init_w = 1e-3 hid_w = "" if nonlinearity == rectify or nonlinearity == softplus: hid_w = "relu" pool_layers = [] # Define symbolic variables for theano functions. self.sym_beta = T.scalar('beta') # scaling constant beta self.sym_x_l = T.tensor3('x') # labeled inputs self.sym_t_l = T.matrix('t') # labeled targets self.sym_x_u = T.tensor3('x') # unlabeled inputs self.sym_bs_l = T.iscalar('bs_l') # number of labeled data self.sym_samples = T.iscalar('samples') # MC samples self.sym_z = T.matrix('z') # latent variable z self.sym_a = T.matrix('a') # auxiliary variable a self.sym_warmup = T.fscalar('warmup') # warmup to scale KL term # Assist methods for collecting the layers def dense_layer(layer_in, n, dist_w=init.GlorotNormal, dist_b=init.Normal): dense = DenseLayer(layer_in, n, dist_w(hid_w), dist_b(init_w), None) if batchnorm: dense = BatchNormLayer(dense) return NonlinearityLayer(dense, self.transf) def stochastic_layer(layer_in, n, samples, nonlin=None): mu = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin) logvar = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin) return SampleLayer(mu, logvar, eq_samples=samples, iw_samples=1), mu, logvar def conv_layer(layer_in, filter, stride=(1, 1), pool=1, name='conv', dist_w=init.GlorotNormal, dist_b=init.Normal): l_conv = Conv2DLayer(layer_in, num_filters=filter, filter_size=(3, 1), stride=stride, pad='full', W=dist_w(hid_w), b=dist_b(init_w), name=name) if pool > 1: l_conv = MaxPool2DLayer(l_conv, pool_size=(pool, 1)) pool_layers.append(l_conv) return l_conv # Input layers l_y_in = InputLayer((None, n_y)) l_x_in = InputLayer((None, n_l, n_c), name='Input') # Reshape input l_x_in_reshp = ReshapeLayer(l_x_in, (-1, 1, n_l, n_c)) print("l_x_in_reshp", l_x_in_reshp.output_shape) # CNN encoder implementation l_conv_enc = l_x_in_reshp for filter, stride, pool in filters: l_conv_enc = conv_layer(l_conv_enc, filter, stride, pool) print("l_conv_enc", l_conv_enc.output_shape) # Pool along last 2 axes l_global_pool_enc = GlobalPoolLayer(l_conv_enc, pool_function=T.mean) l_enc = dense_layer(l_global_pool_enc, n_z) print("l_enc", 
l_enc.output_shape) # Auxiliary q(a|x) l_qa_x = l_enc for hid in qa_hid: l_qa_x = dense_layer(l_qa_x, hid) l_qa_x, l_qa_x_mu, l_qa_x_logvar = stochastic_layer( l_qa_x, n_a, self.sym_samples) # Classifier q(y|a,x) l_qa_to_qy = DenseLayer(l_qa_x, qy_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_qa_to_qy = ReshapeLayer(l_qa_to_qy, (-1, self.sym_samples, 1, qy_hid[0])) l_x_to_qy = DenseLayer(l_enc, qy_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_x_to_qy = DimshuffleLayer(l_x_to_qy, (0, 'x', 'x', 1)) l_qy_xa = ReshapeLayer(ElemwiseSumLayer([l_qa_to_qy, l_x_to_qy]), (-1, qy_hid[0])) if batchnorm: l_qy_xa = BatchNormLayer(l_qy_xa) l_qy_xa = NonlinearityLayer(l_qy_xa, self.transf) if len(qy_hid) > 1: for hid in qy_hid[1:]: l_qy_xa = dense_layer(l_qy_xa, hid) l_qy_xa = DenseLayer(l_qy_xa, n_y, init.GlorotNormal(), init.Normal(init_w), softmax) # Recognition q(z|x,a,y) l_qa_to_qz = DenseLayer(l_qa_x, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_qa_to_qz = ReshapeLayer(l_qa_to_qz, (-1, self.sym_samples, 1, qz_hid[0])) l_x_to_qz = DenseLayer(l_enc, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_x_to_qz = DimshuffleLayer(l_x_to_qz, (0, 'x', 'x', 1)) l_y_to_qz = DenseLayer(l_y_in, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_y_to_qz = DimshuffleLayer(l_y_to_qz, (0, 'x', 'x', 1)) l_qz_axy = ReshapeLayer( ElemwiseSumLayer([l_qa_to_qz, l_x_to_qz, l_y_to_qz]), (-1, qz_hid[0])) if batchnorm: l_qz_axy = BatchNormLayer(l_qz_axy) l_qz_axy = NonlinearityLayer(l_qz_axy, self.transf) if len(qz_hid) > 1: for hid in qz_hid[1:]: l_qz_axy = dense_layer(l_qz_axy, hid) l_qz_axy, l_qz_axy_mu, l_qz_axy_logvar = stochastic_layer( l_qz_axy, n_z, 1) # Generative p(a|z,y) l_y_to_pa = DenseLayer(l_y_in, pa_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_y_to_pa = DimshuffleLayer(l_y_to_pa, (0, 'x', 'x', 1)) l_qz_to_pa = DenseLayer(l_qz_axy, pa_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_qz_to_pa = ReshapeLayer(l_qz_to_pa, (-1, self.sym_samples, 1, pa_hid[0])) l_pa_zy = ReshapeLayer(ElemwiseSumLayer([l_qz_to_pa, l_y_to_pa]), [-1, pa_hid[0]]) if batchnorm: l_pa_zy = BatchNormLayer(l_pa_zy) l_pa_zy = NonlinearityLayer(l_pa_zy, self.transf) if len(pa_hid) > 1: for hid in pa_hid[1:]: l_pa_zy = dense_layer(l_pa_zy, hid) l_pa_zy, l_pa_zy_mu, l_pa_zy_logvar = stochastic_layer(l_pa_zy, n_a, 1) # Generative p(x|a,z,y) l_qa_to_px = DenseLayer(l_qa_x, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_qa_to_px = ReshapeLayer(l_qa_to_px, (-1, self.sym_samples, 1, px_hid[0])) l_y_to_px = DenseLayer(l_y_in, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_y_to_px = DimshuffleLayer(l_y_to_px, (0, 'x', 'x', 1)) l_qz_to_px = DenseLayer(l_qz_axy, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_qz_to_px = ReshapeLayer(l_qz_to_px, (-1, self.sym_samples, 1, px_hid[0])) l_px_azy = ReshapeLayer( ElemwiseSumLayer([l_qa_to_px, l_qz_to_px, l_y_to_px]), [-1, px_hid[0]]) if batchnorm: l_px_azy = BatchNormLayer(l_px_azy) l_px_azy = NonlinearityLayer(l_px_azy, self.transf) # Note that px_hid[0] has to be equal to the number filters in the first convolution. Otherwise add a # dense layers here. 
# Inverse pooling l_global_depool = InverseLayer(l_px_azy, l_global_pool_enc) print("l_global_depool", l_global_depool.output_shape) # Reverse pool layer order pool_layers = pool_layers[::-1] # Decode l_deconv = l_global_depool for idx, filter in enumerate(filters[::-1]): filter, stride, pool = filter if pool > 1: l_deconv = InverseLayer(l_deconv, pool_layers[idx]) l_deconv = Conv2DLayer(l_deconv, num_filters=filter, filter_size=(3, 1), stride=(stride, 1), W=init.GlorotNormal('relu')) print("l_deconv", l_deconv.output_shape) # The last l_conv layer should give us the input shape l_px_azy = Conv2DLayer(l_deconv, num_filters=1, filter_size=(3, 1), pad='same', nonlinearity=None) print("l_dec", l_px_azy.output_shape) # Flatten first two dimensions l_px_azy = ReshapeLayer(l_px_azy, (-1, n_c)) if x_dist == 'bernoulli': l_px_azy = DenseLayer(l_px_azy, n_c, init.GlorotNormal(), init.Normal(init_w), sigmoid) elif x_dist == 'multinomial': l_px_azy = DenseLayer(l_px_azy, n_c, init.GlorotNormal(), init.Normal(init_w), softmax) elif x_dist == 'gaussian': l_px_azy, l_px_zy_mu, l_px_zy_logvar = stochastic_layer( l_px_azy, n_c, self.sym_samples, px_nonlinearity) elif x_dist == 'linear': l_px_azy = DenseLayer(l_px_azy, n_c, nonlinearity=None) # Reshape all the model layers to have the same size self.l_x_in = l_x_in self.l_y_in = l_y_in self.l_a_in = l_qa_x self.l_qa = ReshapeLayer(l_qa_x, (-1, self.sym_samples, 1, n_a)) self.l_qa_mu = DimshuffleLayer(l_qa_x_mu, (0, 'x', 'x', 1)) self.l_qa_logvar = DimshuffleLayer(l_qa_x_logvar, (0, 'x', 'x', 1)) self.l_qz = ReshapeLayer(l_qz_axy, (-1, self.sym_samples, 1, n_z)) self.l_qz_mu = ReshapeLayer(l_qz_axy_mu, (-1, self.sym_samples, 1, n_z)) self.l_qz_logvar = ReshapeLayer(l_qz_axy_logvar, (-1, self.sym_samples, 1, n_z)) self.l_qy = ReshapeLayer(l_qy_xa, (-1, self.sym_samples, 1, n_y)) self.l_pa = ReshapeLayer(l_pa_zy, (-1, self.sym_samples, 1, n_a)) self.l_pa_mu = ReshapeLayer(l_pa_zy_mu, (-1, self.sym_samples, 1, n_a)) self.l_pa_logvar = ReshapeLayer(l_pa_zy_logvar, (-1, self.sym_samples, 1, n_a)) # Here we assume that we pass (batch size * segment length, number of features) to the sample layer from # which we then get (batch size * segment length, samples, IW samples, features) self.l_px = ReshapeLayer(l_px_azy, (-1, n_l, self.sym_samples, 1, n_c)) self.l_px_mu = ReshapeLayer(l_px_zy_mu, (-1, n_l, self.sym_samples, 1, n_c)) \ if x_dist == "gaussian" else None self.l_px_logvar = ReshapeLayer(l_px_zy_logvar, (-1, n_l, self.sym_samples, 1, n_c)) \ if x_dist == "gaussian" else None # Predefined functions inputs = {l_x_in: self.sym_x_l} outputs = get_output(self.l_qy, inputs, deterministic=True).mean(axis=(1, 2)) self.f_qy = theano.function([self.sym_x_l, self.sym_samples], outputs) outputs = get_output(l_qa_x, inputs, deterministic=True) self.f_qa = theano.function([self.sym_x_l, self.sym_samples], outputs) inputs = {l_x_in: self.sym_x_l, l_y_in: self.sym_t_l} outputs = get_output(l_qz_axy, inputs, deterministic=True) self.f_qz = theano.function( [self.sym_x_l, self.sym_t_l, self.sym_samples], outputs) inputs = {l_qz_axy: self.sym_z, l_y_in: self.sym_t_l} outputs = get_output(self.l_pa, inputs, deterministic=True).mean(axis=(1, 2)) self.f_pa = theano.function( [self.sym_z, self.sym_t_l, self.sym_samples], outputs) inputs = { l_x_in: self.sym_x_l, l_qa_x: self.sym_a, l_qz_axy: self.sym_z, l_y_in: self.sym_t_l } outputs = get_output(self.l_px, inputs, deterministic=True).mean(axis=(2, 3)) self.f_px = theano.function([ self.sym_x_l, self.sym_a, self.sym_z, 
self.sym_t_l, self.sym_samples ], outputs) outputs = get_output(self.l_px_mu, inputs, deterministic=True).mean(axis=(2, 3)) self.f_mu = theano.function([ self.sym_x_l, self.sym_a, self.sym_z, self.sym_t_l, self.sym_samples ], outputs) outputs = get_output(self.l_px_logvar, inputs, deterministic=True).mean(axis=(2, 3)) self.f_var = theano.function([ self.sym_x_l, self.sym_a, self.sym_z, self.sym_t_l, self.sym_samples ], outputs) # Define model parameters self.model_params = get_all_params([self.l_qy, self.l_pa, self.l_px]) self.trainable_model_params = get_all_params( [self.l_qy, self.l_pa, self.l_px], trainable=True)
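# The sym_warmup scalar declared in the CSDGM constructor above is fed in from
# the training loop to scale the KL term ("warm-up" / KL annealing).  A typical
# linear schedule, shown here as an assumption since the training code is not
# part of this listing:
import numpy as np

def warmup_coefficient(epoch, n_warmup_epochs=100):
    """Ramp the KL weight linearly from 0 to 1 over n_warmup_epochs."""
    return np.float32(min(1.0, epoch / float(n_warmup_epochs)))

# e.g. passed as the value bound to sym_warmup when calling the training
# function for epoch t:  f_train(..., warmup_coefficient(t))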
def __init__(self, n_x, n_z, z_hidden, xhat_hidden, trans_func=rectify,
             init_w=1e-3, x_dist='gaussian', batchnorm=False):
    super(VAE, self).__init__(n_x, z_hidden + xhat_hidden, n_z, trans_func)
    self.n_x = n_x
    self.n_z = n_z
    self.x_dist = x_dist
    self.batchnorm = batchnorm
    self.sym_x = T.matrix('x')  # symbolic inputs
    self.sym_z = T.matrix('z')
    self.sym_samples = T.iscalar('samples')
    self._srng = RandomStreams()

    def stochastic_layer(layer_in, n, samples, nonlin=None):
        mu = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin)
        logvar = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin)
        return SampleLayer(mu, logvar, eq_samples=samples, iw_samples=1), mu, logvar

    # Input
    l_x_in = InputLayer((None, n_x))

    # Inference q(z|x)
    l_z_x = l_x_in
    for hid in z_hidden:
        l_z_x = DenseLayer(l_z_x, hid, init.Normal(std=init_w), init.Normal(std=init_w), self.transf)
    l_z_x, l_z_x_mu, l_z_x_logvar = stochastic_layer(l_z_x, n_z, self.sym_samples)

    # Reshape for density layers
    l_z_x_reshaped = ReshapeLayer(l_z_x, (-1, self.sym_samples, n_z))
    l_z_x_mu_reshaped = DimshuffleLayer(l_z_x_mu, (0, 'x', 1))
    l_z_x_logvar_reshaped = DimshuffleLayer(l_z_x_logvar, (0, 'x', 1))

    # Generative p(xhat|z)
    l_xhat_z = l_z_x
    for hid in xhat_hidden:
        l_xhat_z = DenseLayer(l_xhat_z, hid, init.Normal(std=init_w), init.Normal(std=init_w), self.transf)
    if x_dist == 'bernoulli':
        l_xhat_z_mu_reshaped = None
        l_xhat_z_logvar_reshaped = None
        l_xhat_z = DenseLayer(l_xhat_z, n_x, init.Normal(std=init_w), init.Normal(std=init_w), sigmoid)
    elif x_dist == 'gaussian':
        l_xhat_z, l_xhat_z_mu, l_xhat_z_logvar = stochastic_layer(l_xhat_z, n_x, self.sym_samples)
        l_xhat_z_mu_reshaped = ReshapeLayer(l_xhat_z_mu, (-1, self.sym_samples, 1, n_x))
        l_xhat_z_logvar_reshaped = ReshapeLayer(l_xhat_z_logvar, (-1, self.sym_samples, 1, n_x))
    l_xhat_z_reshaped = ReshapeLayer(l_xhat_z, (-1, self.sym_samples, 1, n_x))

    # Init class variables
    self.l_x_in = l_x_in
    self.l_xhat_mu = l_xhat_z_mu_reshaped
    self.l_xhat_logvar = l_xhat_z_logvar_reshaped
    self.l_xhat = l_xhat_z_reshaped
    self.l_z = l_z_x_reshaped
    self.l_z_mu = l_z_x_mu_reshaped
    self.l_z_logvar = l_z_x_logvar_reshaped
    self.model_params = get_all_params(self.l_xhat)

    inputs = [self.sym_x, self.sym_samples]
    outputs = get_output(self.l_z, self.sym_x, deterministic=True).mean(axis=1)
    self.f_qz = theano.function(inputs, outputs)

    inputs = {l_z_x: self.sym_z}
    outputs = get_output(self.l_xhat, inputs, deterministic=True).mean(axis=(1, 2))
    inputs = [self.sym_z, self.sym_samples]
    self.f_px = theano.function(inputs, outputs)
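# The mu/logvar layers built by stochastic_layer above parameterise q(z|x);
# the variational lower bound (computed elsewhere in the repository) uses the
# analytic KL between N(mu, exp(logvar)) and the standard-normal prior.
# Reference sketch in Theano:
import theano.tensor as T

def kl_normal2_stdnormal(mu, logvar):
    """KL( N(mu, exp(logvar)) || N(0, 1) ), summed over the latent units."""
    return -0.5 * T.sum(1 + logvar - mu ** 2 - T.exp(logvar), axis=-1)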
def __init__(self, n_x, n_a, n_z, n_y, qa_hid, qz_hid, qy_hid, px_hid, pa_hid, nonlinearity=rectify, px_nonlinearity=None, x_dist='bernoulli', batchnorm=False, seed=1234): """ Initialize an skip deep generative model consisting of discriminative classifier q(y|a,x), generative model P p(a|z,y) and p(x|a,z,y), inference model Q q(a|x) and q(z|a,x,y). Weights are initialized using the Bengio and Glorot (2010) initialization scheme. :param n_x: Number of inputs. :param n_a: Number of auxiliary. :param n_z: Number of latent. :param n_y: Number of classes. :param qa_hid: List of number of deterministic hidden q(a|x). :param qz_hid: List of number of deterministic hidden q(z|a,x,y). :param qy_hid: List of number of deterministic hidden q(y|a,x). :param px_hid: List of number of deterministic hidden p(a|z,y) & p(x|z,y). :param nonlinearity: The transfer function used in the deterministic layers. :param x_dist: The x distribution, 'bernoulli', 'multinomial', or 'gaussian'. :param batchnorm: Boolean value for batch normalization. :param seed: The random seed. """ super(SDGMSSL, self).__init__(n_x, qz_hid + px_hid, n_a + n_z, nonlinearity) self.x_dist = x_dist self.n_y = n_y self.n_x = n_x self.n_a = n_a self.n_z = n_z self.batchnorm = batchnorm self._srng = RandomStreams(seed) # Decide Glorot initializaiton of weights. init_w = 1e-3 hid_w = "" if nonlinearity == rectify or nonlinearity == softplus: hid_w = "relu" # Define symbolic variables for theano functions. self.sym_beta = T.scalar('beta') # scaling constant beta self.sym_x_l = T.matrix('x') # labeled inputs self.sym_t_l = T.matrix('t') # labeled targets self.sym_x_u = T.matrix('x') # unlabeled inputs self.sym_bs_l = T.iscalar('bs_l') # number of labeled data self.sym_samples = T.iscalar('samples') # MC samples self.sym_z = T.matrix('z') # latent variable z self.sym_a = T.matrix('a') # auxiliary variable a # Assist methods for collecting the layers def dense_layer(layer_in, n, dist_w=init.GlorotNormal, dist_b=init.Normal): dense = DenseLayer(layer_in, n, dist_w(hid_w), dist_b(init_w), None) if batchnorm: dense = BatchNormLayer(dense) return NonlinearityLayer(dense, self.transf) def stochastic_layer(layer_in, n, samples, nonlin=None): mu = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin) logvar = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin) return SampleLayer(mu, logvar, eq_samples=samples, iw_samples=1), mu, logvar # Input layers l_x_in = InputLayer((None, n_x)) l_y_in = InputLayer((None, n_y)) # Auxiliary q(a|x) l_qa_x = l_x_in for hid in qa_hid: l_qa_x = dense_layer(l_qa_x, hid) l_qa_x, l_qa_x_mu, l_qa_x_logvar = stochastic_layer(l_qa_x, n_a, self.sym_samples) # Classifier q(y|a,x) l_qa_to_qy = DenseLayer(l_qa_x, qy_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_qa_to_qy = ReshapeLayer(l_qa_to_qy, (-1, self.sym_samples, 1, qy_hid[0])) l_x_to_qy = DenseLayer(l_x_in, qy_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_x_to_qy = DimshuffleLayer(l_x_to_qy, (0, 'x', 'x', 1)) l_qy_xa = ReshapeLayer(ElemwiseSumLayer([l_qa_to_qy, l_x_to_qy]), (-1, qy_hid[0])) if batchnorm: l_qy_xa = BatchNormLayer(l_qy_xa) l_qy_xa = NonlinearityLayer(l_qy_xa, self.transf) if len(qy_hid) > 1: for hid in qy_hid[1:]: l_qy_xa = dense_layer(l_qy_xa, hid) l_qy_xa = DenseLayer(l_qy_xa, n_y, init.GlorotNormal(), init.Normal(init_w), softmax) # Recognition q(z|x,a,y) l_qa_to_qz = DenseLayer(l_qa_x, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_qa_to_qz = 
ReshapeLayer(l_qa_to_qz, (-1, self.sym_samples, 1, qz_hid[0])) l_x_to_qz = DenseLayer(l_x_in, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_x_to_qz = DimshuffleLayer(l_x_to_qz, (0, 'x', 'x', 1)) l_y_to_qz = DenseLayer(l_y_in, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_y_to_qz = DimshuffleLayer(l_y_to_qz, (0, 'x', 'x', 1)) l_qz_axy = ReshapeLayer(ElemwiseSumLayer([l_qa_to_qz, l_x_to_qz, l_y_to_qz]), (-1, qz_hid[0])) if batchnorm: l_qz_axy = BatchNormLayer(l_qz_axy) l_qz_axy = NonlinearityLayer(l_qz_axy, self.transf) if len(qz_hid) > 1: for hid in qz_hid[1:]: l_qz_axy = dense_layer(l_qz_axy, hid) l_qz_axy, l_qz_axy_mu, l_qz_axy_logvar = stochastic_layer(l_qz_axy, n_z, 1) # Generative p(a|z,y) l_y_to_pa = DenseLayer(l_y_in, pa_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_y_to_pa = DimshuffleLayer(l_y_to_pa, (0, 'x', 'x', 1)) l_qz_to_pa = DenseLayer(l_qz_axy, pa_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_qz_to_pa = ReshapeLayer(l_qz_to_pa, (-1, self.sym_samples, 1, pa_hid[0])) l_pa_zy = ReshapeLayer(ElemwiseSumLayer([l_qz_to_pa, l_y_to_pa]), [-1, pa_hid[0]]) if batchnorm: l_pa_zy = BatchNormLayer(l_pa_zy) l_pa_zy = NonlinearityLayer(l_pa_zy, self.transf) if len(pa_hid) > 1: for hid in pa_hid[1:]: l_pa_zy = dense_layer(l_pa_zy, hid) l_pa_zy, l_pa_zy_mu, l_pa_zy_logvar = stochastic_layer(l_pa_zy, n_a, 1) # Generative p(x|a,z,y) l_qa_to_px = DenseLayer(l_qa_x, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_qa_to_px = ReshapeLayer(l_qa_to_px, (-1, self.sym_samples, 1, px_hid[0])) l_y_to_px = DenseLayer(l_y_in, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_y_to_px = DimshuffleLayer(l_y_to_px, (0, 'x', 'x', 1)) l_qz_to_px = DenseLayer(l_qz_axy, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_qz_to_px = ReshapeLayer(l_qz_to_px, (-1, self.sym_samples, 1, px_hid[0])) l_px_azy = ReshapeLayer(ElemwiseSumLayer([l_qa_to_px, l_qz_to_px, l_y_to_px]), [-1, px_hid[0]]) if batchnorm: l_px_azy = BatchNormLayer(l_px_azy) l_px_azy = NonlinearityLayer(l_px_azy, self.transf) if len(px_hid) > 1: for hid in px_hid[1:]: l_px_azy = dense_layer(l_px_azy, hid) if x_dist == 'bernoulli': l_px_azy = DenseLayer(l_px_azy, n_x, init.GlorotNormal(), init.Normal(init_w), sigmoid) elif x_dist == 'multinomial': l_px_azy = DenseLayer(l_px_azy, n_x, init.GlorotNormal(), init.Normal(init_w), softmax) elif x_dist == 'gaussian': l_px_azy, l_px_zy_mu, l_px_zy_logvar = stochastic_layer(l_px_azy, n_x, 1, px_nonlinearity) # Reshape all the model layers to have the same size self.l_x_in = l_x_in self.l_y_in = l_y_in self.l_a_in = l_qa_x self.l_qa = ReshapeLayer(l_qa_x, (-1, self.sym_samples, 1, n_a)) self.l_qa_mu = DimshuffleLayer(l_qa_x_mu, (0, 'x', 'x', 1)) self.l_qa_logvar = DimshuffleLayer(l_qa_x_logvar, (0, 'x', 'x', 1)) self.l_qz = ReshapeLayer(l_qz_axy, (-1, self.sym_samples, 1, n_z)) self.l_qz_mu = ReshapeLayer(l_qz_axy_mu, (-1, self.sym_samples, 1, n_z)) self.l_qz_logvar = ReshapeLayer(l_qz_axy_logvar, (-1, self.sym_samples, 1, n_z)) self.l_qy = ReshapeLayer(l_qy_xa, (-1, self.sym_samples, 1, n_y)) self.l_pa = ReshapeLayer(l_pa_zy, (-1, self.sym_samples, 1, n_a)) self.l_pa_mu = ReshapeLayer(l_pa_zy_mu, (-1, self.sym_samples, 1, n_a)) self.l_pa_logvar = ReshapeLayer(l_pa_zy_logvar, (-1, self.sym_samples, 1, n_a)) self.l_px = ReshapeLayer(l_px_azy, (-1, self.sym_samples, 1, n_x)) self.l_px_mu = ReshapeLayer(l_px_zy_mu, (-1, self.sym_samples, 1, n_x)) if x_dist == "gaussian" else None 
self.l_px_logvar = ReshapeLayer(l_px_zy_logvar, (-1, self.sym_samples, 1, n_x)) if x_dist == "gaussian" else None # Predefined functions inputs = [self.sym_x_l, self.sym_samples] outputs = get_output(self.l_qy, self.sym_x_l, deterministic=True).mean(axis=(1, 2)) self.f_qy = theano.function(inputs, outputs) inputs = [self.sym_x_l, self.sym_samples] outputs = get_output(self.l_qa, self.sym_x_l, deterministic=True).mean(axis=(1, 2)) self.f_qa = theano.function(inputs, outputs) inputs = {l_qz_axy: self.sym_z, l_y_in: self.sym_t_l} outputs = get_output(self.l_pa, inputs, deterministic=True) self.f_pa = theano.function([self.sym_z, self.sym_t_l, self.sym_samples], outputs) inputs = {l_qa_x: self.sym_a, l_qz_axy: self.sym_z, l_y_in: self.sym_t_l} outputs = get_output(self.l_px, inputs, deterministic=True) self.f_px = theano.function([self.sym_a, self.sym_z, self.sym_t_l, self.sym_samples], outputs) # Define model parameters self.model_params = get_all_params([self.l_qy, self.l_pa, self.l_px]) self.trainable_model_params = get_all_params([self.l_qy, self.l_pa, self.l_px], trainable=True)
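# Illustrative use of the SDGMSSL functions compiled above (all layer sizes are
# assumptions; x must have n_x features and the second argument is the number
# of MC samples bound to sym_samples):
import numpy as np

model = SDGMSSL(n_x=784, n_a=100, n_z=100, n_y=10,
                qa_hid=[500], qz_hid=[500], qy_hid=[500],
                px_hid=[500], pa_hid=[500])

x = np.random.rand(16, 784).astype(np.float32)
y_prob = model.f_qy(x, 10)        # q(y|a,x), averaged over 10 MC samples
y_pred = y_prob.argmax(axis=-1)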
def __init__(self, n_x, n_a, n_z, n_y, a_hidden, z_hidden, xhat_hidden, y_hidden, trans_func=rectify, x_dist='bernoulli'): """ Initialize an auxiliary deep generative model consisting of discriminative classifier q(y|a,x), generative model P p(xhat|z,y), inference model Q q(a|x) and q(z|x,y). All weights are initialized using the Bengio and Glorot (2010) initialization scheme. :param n_x: Number of inputs. :param n_a: Number of auxiliary. :param n_z: Number of latent. :param n_y: Number of classes. :param a_hidden: List of number of deterministic hidden q(a|x). :param z_hidden: List of number of deterministic hidden q(z|x,y). :param xhat_hidden: List of number of deterministic hidden p(xhat|z,y). :param y_hidden: List of number of deterministic hidden q(y|a,x). :param trans_func: The transfer function used in the deterministic layers. :param x_dist: The x distribution, 'bernoulli' or 'gaussian'. """ super(ADGMSSL, self).__init__(n_x, a_hidden + z_hidden + xhat_hidden, n_a + n_z, trans_func) self.y_hidden = y_hidden self.x_dist = x_dist self.n_y = n_y self.n_x = n_x self.n_a = n_a self.n_z = n_z self._srng = RandomStreams() self.sym_beta = T.scalar( 'beta') # symbolic upscaling of the discriminative term. self.sym_x_l = T.matrix('x') # symbolic labeled inputs self.sym_t_l = T.matrix('t') # symbolic labeled targets self.sym_x_u = T.matrix('x') # symbolic unlabeled inputs self.sym_bs_l = T.iscalar( 'bs_l' ) # symbolic number of labeled data_preparation points in batch self.sym_samples = T.iscalar( 'samples') # symbolic number of Monte Carlo samples self.sym_y = T.matrix('y') self.sym_z = T.matrix('z') ### Input layers ### l_x_in = InputLayer((None, n_x)) l_y_in = InputLayer((None, n_y)) ### Auxiliary q(a|x) ### l_a_x = l_x_in for hid in a_hidden: l_a_x = DenseLayer(l_a_x, hid, init.GlorotNormal('relu'), init.Normal(1e-3), self.transf) l_a_x_mu = DenseLayer(l_a_x, n_a, init.GlorotNormal(), init.Normal(1e-3), None) l_a_x_logvar = DenseLayer(l_a_x, n_a, init.GlorotNormal(), init.Normal(1e-3), None) l_a_x = SampleLayer(l_a_x_mu, l_a_x_logvar, eq_samples=self.sym_samples) # Reshape all layers to align them for multiple samples in the lower bound calculation. l_a_x_reshaped = ReshapeLayer(l_a_x, (-1, self.sym_samples, 1, n_a)) l_a_x_mu_reshaped = DimshuffleLayer(l_a_x_mu, (0, 'x', 'x', 1)) l_a_x_logvar_reshaped = DimshuffleLayer(l_a_x_logvar, (0, 'x', 'x', 1)) ### Classifier q(y|a,x) ### # Concatenate the input x and the output of the auxiliary MLP. l_a_to_y = DenseLayer(l_a_x, y_hidden[0], init.GlorotNormal('relu'), init.Normal(1e-3), None) l_a_to_y = ReshapeLayer(l_a_to_y, (-1, self.sym_samples, 1, y_hidden[0])) l_x_to_y = DenseLayer(l_x_in, y_hidden[0], init.GlorotNormal('relu'), init.Normal(1e-3), None) l_x_to_y = DimshuffleLayer(l_x_to_y, (0, 'x', 'x', 1)) l_y_xa = ReshapeLayer(ElemwiseSumLayer([l_a_to_y, l_x_to_y]), (-1, y_hidden[0])) l_y_xa = NonlinearityLayer(l_y_xa, self.transf) if len(y_hidden) > 1: for hid in y_hidden[1:]: l_y_xa = DenseLayer(l_y_xa, hid, init.GlorotUniform('relu'), init.Normal(1e-3), self.transf) l_y_xa = DenseLayer(l_y_xa, n_y, init.GlorotUniform(), init.Normal(1e-3), softmax) l_y_xa_reshaped = ReshapeLayer(l_y_xa, (-1, self.sym_samples, 1, n_y)) ### Recognition q(z|x,y) ### # Concatenate the input x and y. 
l_x_to_z = DenseLayer(l_x_in, z_hidden[0], init.GlorotNormal('relu'), init.Normal(1e-3), None) l_x_to_z = DimshuffleLayer(l_x_to_z, (0, 'x', 'x', 1)) l_y_to_z = DenseLayer(l_y_in, z_hidden[0], init.GlorotNormal('relu'), init.Normal(1e-3), None) l_y_to_z = DimshuffleLayer(l_y_to_z, (0, 'x', 'x', 1)) l_z_xy = ReshapeLayer(ElemwiseSumLayer([l_x_to_z, l_y_to_z]), [-1, z_hidden[0]]) l_z_xy = NonlinearityLayer(l_z_xy, self.transf) if len(z_hidden) > 1: for hid in z_hidden[1:]: l_z_xy = DenseLayer(l_z_xy, hid, init.GlorotNormal('relu'), init.Normal(1e-3), self.transf) l_z_axy_mu = DenseLayer(l_z_xy, n_z, init.GlorotNormal(), init.Normal(1e-3), None) l_z_axy_logvar = DenseLayer(l_z_xy, n_z, init.GlorotNormal(), init.Normal(1e-3), None) l_z_xy = SampleLayer(l_z_axy_mu, l_z_axy_logvar, eq_samples=self.sym_samples) # Reshape all layers to align them for multiple samples in the lower bound calculation. l_z_axy_mu_reshaped = DimshuffleLayer(l_z_axy_mu, (0, 'x', 'x', 1)) l_z_axy_logvar_reshaped = DimshuffleLayer(l_z_axy_logvar, (0, 'x', 'x', 1)) l_z_axy_reshaped = ReshapeLayer(l_z_xy, (-1, self.sym_samples, 1, n_z)) ### Generative p(xhat|z,y) ### # Concatenate the input x and y. l_y_to_xhat = DenseLayer(l_y_in, xhat_hidden[0], init.GlorotNormal('relu'), init.Normal(1e-3), None) l_y_to_xhat = DimshuffleLayer(l_y_to_xhat, (0, 'x', 'x', 1)) l_z_to_xhat = DenseLayer(l_z_xy, xhat_hidden[0], init.GlorotNormal('relu'), init.Normal(1e-3), None) l_z_to_xhat = ReshapeLayer(l_z_to_xhat, (-1, self.sym_samples, 1, xhat_hidden[0])) l_xhat_zy = ReshapeLayer(ElemwiseSumLayer([l_z_to_xhat, l_y_to_xhat]), [-1, xhat_hidden[0]]) l_xhat_zy = NonlinearityLayer(l_xhat_zy, self.transf) if len(xhat_hidden) > 1: for hid in xhat_hidden[1:]: l_xhat_zy = DenseLayer(l_xhat_zy, hid, init.GlorotNormal('relu'), init.Normal(1e-3), self.transf) if x_dist == 'bernoulli': l_xhat_zy_mu_reshaped = None l_xhat_zy_logvar_reshaped = None l_xhat_zy = DenseLayer(l_xhat_zy, n_x, init.GlorotNormal(), init.Normal(1e-3), sigmoid) elif x_dist == 'multinomial': l_xhat_zy_mu_reshaped = None l_xhat_zy_logvar_reshaped = None l_xhat_zy = DenseLayer(l_xhat_zy, n_x, init.GlorotNormal(), init.Normal(1e-3), softmax) elif x_dist == 'gaussian': l_xhat_zy_mu = DenseLayer(l_xhat_zy, n_x, init.GlorotNormal(), init.Normal(1e-3), None) l_xhat_zy_logvar = DenseLayer(l_xhat_zy, n_x, init.GlorotNormal(), init.Normal(1e-3), None) l_xhat_zy = SampleLayer(l_xhat_zy_mu, l_xhat_zy_logvar, eq_samples=1) l_xhat_zy_mu_reshaped = ReshapeLayer( l_xhat_zy_mu, (-1, self.sym_samples, 1, n_x)) l_xhat_zy_logvar_reshaped = ReshapeLayer( l_xhat_zy_logvar, (-1, self.sym_samples, 1, n_x)) l_xhat_zy_reshaped = ReshapeLayer(l_xhat_zy, (-1, self.sym_samples, 1, n_x)) ### Various class variables ### self.l_x_in = l_x_in self.l_y_in = l_y_in self.l_a_mu = l_a_x_mu_reshaped self.l_a_logvar = l_a_x_logvar_reshaped self.l_a = l_a_x_reshaped self.l_z_mu = l_z_axy_mu_reshaped self.l_z_logvar = l_z_axy_logvar_reshaped self.l_z = l_z_axy_reshaped self.l_y = l_y_xa_reshaped self.l_xhat_mu = l_xhat_zy_mu_reshaped self.l_xhat_logvar = l_xhat_zy_logvar_reshaped self.l_xhat = l_xhat_zy_reshaped self.model_params = get_all_params([self.l_xhat, self.l_y]) ### Calculate networks shapes for documentation ### self.qa_shapes = self.get_model_shape(get_all_params(l_a_x)) self.qy_shapes = self.get_model_shape( get_all_params(l_y_xa))[len(self.qa_shapes) - 1:] self.qz_shapes = self.get_model_shape(get_all_params(l_z_xy)) self.px_shapes = self.get_model_shape( get_all_params(l_xhat_zy))[(len(self.qz_shapes) - 1):] 
### Predefined functions for generating xhat and y ### inputs = {l_z_xy: self.sym_z, self.l_y_in: self.sym_y} outputs = get_output(self.l_xhat, inputs, deterministic=True).mean(axis=(1, 2)) inputs = [self.sym_z, self.sym_y, self.sym_samples] self.f_xhat = theano.function(inputs, outputs) inputs = [self.sym_x_l, self.sym_samples] outputs = get_output(self.l_y, self.sym_x_l, deterministic=True).mean(axis=(1, 2)) self.f_y = theano.function(inputs, outputs) self.y_params = get_all_params( self.l_y, trainable=True)[(len(a_hidden) + 2) * 2::] self.xhat_params = get_all_params(self.l_xhat, trainable=True)
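# Illustrative conditional generation with the ADGMSSL functions compiled
# above (layer sizes and the one-hot labels are assumptions for the sketch):
import numpy as np

model = ADGMSSL(n_x=784, n_a=100, n_z=100, n_y=10,
                a_hidden=[500], z_hidden=[500],
                xhat_hidden=[500], y_hidden=[500])

z = np.random.randn(10, 100).astype(np.float32)   # one latent draw per class
y = np.eye(10, dtype=np.float32)                   # one-hot class labels
xhat = model.f_xhat(z, y, 1)                       # p(xhat|z,y), 1 MC sample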
def __init__(self, n_c, n_z, qz_hid, px_hid, enc_rnn=256, dec_rnn=256, n_l=28, nonlinearity=rectify, px_nonlinearity=None, x_dist='bernoulli', batchnorm=False, seed=1234): """ Weights are initialized using the Bengio and Glorot (2010) initialization scheme. :param n_c: Number of inputs. :param n_z: Number of latent. :param qz_hid: List of number of deterministic hidden q(z|a,x,y). :param px_hid: List of number of deterministic hidden p(a|z,y) & p(x|z,y). :param nonlinearity: The transfer function used in the deterministic layers. :param x_dist: The x distribution, 'bernoulli', 'multinomial', or 'gaussian'. :param batchnorm: Boolean value for batch normalization. :param seed: The random seed. """ super(RVAE, self).__init__(n_c, qz_hid + px_hid, n_z, nonlinearity) self.x_dist = x_dist self.n_x = n_c self.seq_length = n_l self.n_z = n_z self.batchnorm = batchnorm self._srng = RandomStreams(seed) # Decide Glorot initializaiton of weights. init_w = 1e-3 hid_w = "" if nonlinearity == rectify or nonlinearity == softplus: hid_w = "relu" # Define symbolic variables for theano functions. self.sym_x = T.tensor3('x') # inputs self.sym_z = T.matrix('z') self.sym_samples = T.iscalar('samples') # MC samples self.sym_warmup = T.fscalar('warmup') # Assist methods for collecting the layers def dense_layer(layer_in, n, dist_w=init.GlorotNormal, dist_b=init.Normal): dense = DenseLayer(layer_in, num_units=n, W=dist_w(hid_w), b=dist_b(init_w), nonlinearity=None) if batchnorm: dense = BatchNormLayer(dense) return NonlinearityLayer(dense, self.transf) def stochastic_layer(layer_in, n, samples, nonlin=None): mu = DenseLayer(layer_in, n, W=init.Normal(init_w, mean=.0), b=init.Normal(init_w), nonlinearity=nonlin) logvar = DenseLayer(layer_in, n, W=init.Normal(init_w, mean=.0), b=init.Normal(init_w), nonlinearity=nonlin) # logvar = ConstrainLayer(logvar, scale=1, max=T.log(-0.999 * self.sym_warmup + 1.0999)) return SampleLayer(mu, logvar, eq_samples=samples, iw_samples=1), mu, logvar def lstm_layer(input, nunits, return_final, backwards=False, name='LSTM'): ingate = Gate(W_in=init.Uniform(0.01), W_hid=init.Uniform(0.01), b=init.Constant(0.0)) forgetgate = Gate(W_in=init.Uniform(0.01), W_hid=init.Uniform(0.01), b=init.Constant(5.0)) cell = Gate( W_cell=None, nonlinearity=T.tanh, W_in=init.Uniform(0.01), W_hid=init.Uniform(0.01), ) outgate = Gate(W_in=init.Uniform(0.01), W_hid=init.Uniform(0.01), b=init.Constant(0.0)) lstm = LSTMLayer(input, num_units=nunits, backwards=backwards, peepholes=False, ingate=ingate, forgetgate=forgetgate, cell=cell, outgate=outgate, name=name, only_return_final=return_final) return lstm # RNN encoder implementation l_x_in = InputLayer((None, n_l, n_c)) l_enc_forward = lstm_layer(l_x_in, enc_rnn, return_final=True, backwards=False, name='enc_forward') l_enc_backward = lstm_layer(l_x_in, enc_rnn, return_final=True, backwards=True, name='enc_backward') l_enc_concat = ConcatLayer([l_enc_forward, l_enc_backward], axis=-1) l_enc = dense_layer(l_enc_concat, enc_rnn) # # Overwrite encoder # l_enc = dense_layer(l_x_in, enc_rnn) # Recognition q(z|x) l_qz = l_enc for hid in qz_hid: l_qz = dense_layer(l_qz, hid) # Reparameterisation and sample l_qz_mu = DenseLayer(l_qz, n_z, W=init.Normal(init_w, mean=1.0), b=init.Normal(init_w), nonlinearity=None) l_qz_logvar = DenseLayer(l_qz, n_z, init.Normal(init_w), init.Normal(init_w), nonlinearity=None) l_qz = SampleLayer(l_qz_mu, l_qz_logvar, eq_samples=self.sym_samples, iw_samples=1) # Generative p(x|z) l_qz_repeat = RepeatLayer(l_qz, n=n_l) # Skip connection 
to encoder until warmup threshold is reached if T.ge(self.sym_warmup, 0.4): l_skip_enc_repeat = RepeatLayer(l_enc, n=n_l) l_qz_repeat = ConcatLayer([l_qz_repeat, l_skip_enc_repeat], axis=-1) l_dec_forward = lstm_layer(l_qz_repeat, dec_rnn, return_final=False, backwards=False, name='dec_forward') l_dec_backward = lstm_layer(l_qz_repeat, dec_rnn, return_final=False, backwards=True, name='dec_backward') l_dec_concat = ConcatLayer([l_dec_forward, l_dec_backward], axis=-1) l_dec = ReshapeLayer(l_dec_concat, (-1, 2 * dec_rnn)) l_dec = dense_layer(l_dec, dec_rnn) # # Overwrite decoder # l_dec = dense_layer(l_qz, n_l) # Add additional dense layers l_px = l_dec for hid in px_hid: l_px = dense_layer(l_px, hid) # Reshape the last dimension and perhaps model with a distribution if x_dist == 'bernoulli': l_px = DenseLayer(l_px, n_c, init.GlorotNormal(), init.Normal(init_w), sigmoid) elif x_dist == 'multinomial': l_px = DenseLayer(l_px, n_c, init.GlorotNormal(), init.Normal(init_w), softmax) elif x_dist == 'gaussian': l_px, l_px_mu, l_px_logvar = stochastic_layer( l_px, n_c, self.sym_samples, nonlin=px_nonlinearity) elif x_dist == 'linear': l_px = DenseLayer(l_px, n_c, nonlinearity=None) # Reshape all the model layers to have the same size self.l_x_in = l_x_in self.l_qz = ReshapeLayer(l_qz, (-1, self.sym_samples, 1, n_z)) self.l_qz_mu = DimshuffleLayer(l_qz_mu, (0, 'x', 'x', 1)) self.l_qz_logvar = DimshuffleLayer(l_qz_logvar, (0, 'x', 'x', 1)) self.l_px = DimshuffleLayer( ReshapeLayer(l_px, (-1, n_l, self.sym_samples, 1, n_c)), (0, 2, 3, 1, 4)) self.l_px_mu = DimshuffleLayer(ReshapeLayer(l_px_mu, (-1, n_l, self.sym_samples, 1, n_c)), (0, 2, 3, 1, 4)) \ if x_dist == "gaussian" else None self.l_px_logvar = DimshuffleLayer(ReshapeLayer(l_px_logvar, (-1, n_l, self.sym_samples, 1, n_c)), (0, 2, 3, 1, 4)) \ if x_dist == "gaussian" else None # Predefined functions inputs = {self.l_x_in: self.sym_x} outputs = get_output(l_qz, inputs, deterministic=True) self.f_qz = theano.function([self.sym_x, self.sym_samples], outputs, on_unused_input='warn') inputs = {l_qz: self.sym_z, self.l_x_in: self.sym_x} outputs = get_output(self.l_px, inputs, deterministic=True).mean(axis=(1, 2)) self.f_px = theano.function([self.sym_x, self.sym_z, self.sym_samples], outputs, on_unused_input='warn') if x_dist == "gaussian": outputs = get_output(self.l_px_mu, inputs, deterministic=True).mean(axis=(1, 2)) self.f_mu = theano.function( [self.sym_x, self.sym_z, self.sym_samples], outputs, on_unused_input='ignore') outputs = get_output(self.l_px_logvar, inputs, deterministic=True).mean(axis=(1, 2)) self.f_var = theano.function( [self.sym_x, self.sym_z, self.sym_samples], outputs, on_unused_input='ignore') # Define model parameters self.model_params = get_all_params([self.l_px]) self.trainable_model_params = get_all_params([self.l_px], trainable=True)
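# Note on the skip connection in the RVAE decoder above: T.ge(self.sym_warmup, 0.4)
# builds a symbolic expression, and a Python `if` on a symbolic variable is always
# truthy, so the concatenated skip path is added to the graph regardless of the
# warm-up value.  Making the switch happen at run time would require the condition
# inside the graph, e.g. by gating the skip input; a sketch, not the model's code:
import theano.tensor as T

def gated_skip(skip, warmup, threshold=0.4):
    """Zero out the skip input until warmup reaches the threshold."""
    gate = T.ge(warmup, threshold)          # symbolic 0/1 scalar
    return skip * T.cast(gate, skip.dtype)  # broadcast the gate over the skip tensor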
def __init__(self, n_l, n_c, n_a, n_z, n_y, qa_hid, qz_hid, qy_hid, px_hid, pa_hid, enc_rnn=256, dec_rnn=256, nonlinearity=rectify, px_nonlinearity=None, x_dist='bernoulli', batchnorm=False, seed=1234): """ Initialize an skip deep generative model consisting of discriminative classifier q(y|a,x), generative model P p(a|z,y) and p(x|a,z,y), inference model Q q(a|x) and q(z|a,x,y). Weights are initialized using the Bengio and Glorot (2010) initialization scheme. :param n_c: Number of inputs. :param n_a: Number of auxiliary. :param n_z: Number of latent. :param n_y: Number of classes. :param qa_hid: List of number of deterministic hidden q(a|x). :param qz_hid: List of number of deterministic hidden q(z|a,x,y). :param qy_hid: List of number of deterministic hidden q(y|a,x). :param px_hid: List of number of deterministic hidden p(a|z,y) & p(x|z,y). :param nonlinearity: The transfer function used in the deterministic layers. :param x_dist: The x distribution, 'bernoulli', 'multinomial', or 'gaussian'. :param batchnorm: Boolean value for batch normalization. :param seed: The random seed. """ super(RSDGM, self).__init__(n_c, qz_hid + px_hid, n_a + n_z, nonlinearity) self.x_dist = x_dist self.n_y = n_y self.n_c = n_c self.n_a = n_a self.n_z = n_z self.n_l = n_l self.batchnorm = batchnorm self._srng = RandomStreams(seed) # Decide Glorot initializaiton of weights. init_w = 1e-3 hid_w = "" if nonlinearity == rectify or nonlinearity == softplus: hid_w = "relu" # Define symbolic variables for theano functions. self.sym_beta = T.scalar('beta') # scaling constant beta self.sym_x_l = T.tensor3('x_l') # labeled inputs self.sym_t_l = T.matrix('t') # labeled targets self.sym_x_u = T.tensor3('x_u') # unlabeled inputs self.sym_bs_l = T.iscalar('bs_l') # number of labeled data self.sym_samples = T.iscalar('samples') # MC samples self.sym_z = T.matrix('z') # latent variable z self.sym_a = T.matrix('a') # auxiliary variable a self.sym_warmup = T.fscalar('warmup') # warmup to dampen KL term # Assist methods for collecting the layers def dense_layer(layer_in, n, dist_w=init.GlorotNormal, dist_b=init.Normal): dense = DenseLayer(layer_in, n, dist_w(hid_w), dist_b(init_w), None) if batchnorm: dense = BatchNormLayer(dense) return NonlinearityLayer(dense, self.transf) def stochastic_layer(layer_in, n, samples, nonlin=None): mu = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin) logvar = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin) return SampleLayer(mu, logvar, eq_samples=samples, iw_samples=1), mu, logvar def lstm_layer(input, nunits, return_final, backwards=False, name='LSTM'): ingate = Gate(W_in=init.Uniform(0.01), W_hid=init.Uniform(0.01), b=init.Constant(0.0)) forgetgate = Gate(W_in=init.Uniform(0.01), W_hid=init.Uniform(0.01), b=init.Constant(5.0)) cell = Gate( W_cell=None, nonlinearity=T.tanh, W_in=init.Uniform(0.01), W_hid=init.Uniform(0.01), ) outgate = Gate(W_in=init.Uniform(0.01), W_hid=init.Uniform(0.01), b=init.Constant(0.0)) lstm = LSTMLayer(input, num_units=nunits, backwards=backwards, peepholes=False, ingate=ingate, forgetgate=forgetgate, cell=cell, outgate=outgate, name=name, only_return_final=return_final) rec = RecurrentLayer(input, nunits, W_in_to_hid=init.GlorotNormal('relu'), W_hid_to_hid=init.GlorotNormal('relu'), backwards=backwards, nonlinearity=rectify, only_return_final=return_final, name=name) return lstm # Input layers l_y_in = InputLayer((None, n_y)) l_x_in = InputLayer((None, n_l, n_c)) # RNN encoder implementation l_enc_forward = 
lstm_layer(l_x_in, enc_rnn, return_final=True, backwards=False, name='enc_forward') l_enc_backward = lstm_layer(l_x_in, enc_rnn, return_final=True, backwards=True, name='enc_backward') l_enc_concat = ConcatLayer([l_enc_forward, l_enc_backward]) l_enc = dense_layer(l_enc_concat, enc_rnn) # Auxiliary q(a|x) l_qa_x = l_enc for hid in qa_hid: l_qa_x = dense_layer(l_qa_x, hid) l_qa_x, l_qa_x_mu, l_qa_x_logvar = stochastic_layer( l_qa_x, n_a, self.sym_samples) # Classifier q(y|a,x) l_qa_to_qy = DenseLayer(l_qa_x, qy_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_qa_to_qy = ReshapeLayer(l_qa_to_qy, (-1, self.sym_samples, 1, qy_hid[0])) l_x_to_qy = DenseLayer(l_enc, qy_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_x_to_qy = DimshuffleLayer(l_x_to_qy, (0, 'x', 'x', 1)) l_qy_xa = ReshapeLayer(ElemwiseSumLayer([l_qa_to_qy, l_x_to_qy]), (-1, qy_hid[0])) if batchnorm: l_qy_xa = BatchNormLayer(l_qy_xa) l_qy_xa = NonlinearityLayer(l_qy_xa, self.transf) if len(qy_hid) > 1: for hid in qy_hid[1:]: l_qy_xa = dense_layer(l_qy_xa, hid) l_qy_xa = DenseLayer(l_qy_xa, n_y, init.GlorotNormal(), init.Normal(init_w), softmax) # Recognition q(z|x,a,y) l_qa_to_qz = DenseLayer(l_qa_x, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_qa_to_qz = ReshapeLayer(l_qa_to_qz, (-1, self.sym_samples, 1, qz_hid[0])) l_x_to_qz = DenseLayer(l_enc, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_x_to_qz = DimshuffleLayer(l_x_to_qz, (0, 'x', 'x', 1)) l_y_to_qz = DenseLayer(l_y_in, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_y_to_qz = DimshuffleLayer(l_y_to_qz, (0, 'x', 'x', 1)) l_qz_axy = ReshapeLayer( ElemwiseSumLayer([l_qa_to_qz, l_x_to_qz, l_y_to_qz]), (-1, qz_hid[0])) if batchnorm: l_qz_axy = BatchNormLayer(l_qz_axy) l_qz_axy = NonlinearityLayer(l_qz_axy, self.transf) if len(qz_hid) > 1: for hid in qz_hid[1:]: l_qz_axy = dense_layer(l_qz_axy, hid) l_qz_axy, l_qz_axy_mu, l_qz_axy_logvar = stochastic_layer( l_qz_axy, n_z, 1) # Generative p(a|z,y) l_y_to_pa = DenseLayer(l_y_in, pa_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_y_to_pa = DimshuffleLayer(l_y_to_pa, (0, 'x', 'x', 1)) l_qz_to_pa = DenseLayer(l_qz_axy, pa_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_qz_to_pa = ReshapeLayer(l_qz_to_pa, (-1, self.sym_samples, 1, pa_hid[0])) l_pa_zy = ReshapeLayer(ElemwiseSumLayer([l_qz_to_pa, l_y_to_pa]), [-1, pa_hid[0]]) if batchnorm: l_pa_zy = BatchNormLayer(l_pa_zy) l_pa_zy = NonlinearityLayer(l_pa_zy, self.transf) if len(pa_hid) > 1: for hid in pa_hid[1:]: l_pa_zy = dense_layer(l_pa_zy, hid) l_pa_zy, l_pa_zy_mu, l_pa_zy_logvar = stochastic_layer(l_pa_zy, n_a, 1) # Generative p(x|a,z,y) l_qa_to_px = DenseLayer(l_qa_x, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_qa_to_px = ReshapeLayer(l_qa_to_px, (-1, self.sym_samples, 1, px_hid[0])) l_y_to_px = DenseLayer(l_y_in, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_y_to_px = DimshuffleLayer(l_y_to_px, (0, 'x', 'x', 1)) l_qz_to_px = DenseLayer(l_qz_axy, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_qz_to_px = ReshapeLayer(l_qz_to_px, (-1, self.sym_samples, 1, px_hid[0])) l_px_azy = ReshapeLayer( ElemwiseSumLayer([l_qa_to_px, l_qz_to_px, l_y_to_px]), [-1, px_hid[0]]) if batchnorm: l_px_azy = BatchNormLayer(l_px_azy) l_px_azy = NonlinearityLayer(l_px_azy, self.transf) # RNN decoder implementation l_px_azy_repeat = RepeatLayer(l_px_azy, n=n_l) l_dec_forward = lstm_layer(l_px_azy_repeat, dec_rnn, 
return_final=False, backwards=False, name='dec_forward') l_dec_backward = lstm_layer(l_px_azy_repeat, dec_rnn, return_final=False, backwards=True, name='dec_backward') l_dec_concat = ConcatLayer([l_dec_forward, l_dec_backward], axis=-1) l_dec = ReshapeLayer(l_dec_concat, (-1, 2 * dec_rnn)) l_dec = dense_layer(l_dec, dec_rnn) l_px_azy = l_dec if len(px_hid) > 1: for hid in px_hid[1:]: l_px_azy = dense_layer(l_px_azy, hid) if x_dist == 'bernoulli': l_px_azy = DenseLayer(l_px_azy, n_c, init.GlorotNormal(), init.Normal(init_w), sigmoid) elif x_dist == 'multinomial': l_px_azy = DenseLayer(l_px_azy, n_c, init.GlorotNormal(), init.Normal(init_w), softmax) elif x_dist == 'gaussian': l_px_azy, l_px_zy_mu, l_px_zy_logvar = stochastic_layer( l_px_azy, n_c, self.sym_samples, px_nonlinearity) # Reshape all the model layers to have the same size self.l_x_in = l_x_in self.l_y_in = l_y_in self.l_a_in = l_qa_x self.l_qa = ReshapeLayer(l_qa_x, (-1, self.sym_samples, 1, n_a)) self.l_qa_mu = DimshuffleLayer(l_qa_x_mu, (0, 'x', 'x', 1)) self.l_qa_logvar = DimshuffleLayer(l_qa_x_logvar, (0, 'x', 'x', 1)) self.l_qz = ReshapeLayer(l_qz_axy, (-1, self.sym_samples, 1, n_z)) self.l_qz_mu = ReshapeLayer(l_qz_axy_mu, (-1, self.sym_samples, 1, n_z)) self.l_qz_logvar = ReshapeLayer(l_qz_axy_logvar, (-1, self.sym_samples, 1, n_z)) self.l_qy = ReshapeLayer(l_qy_xa, (-1, self.sym_samples, 1, n_y)) self.l_pa = ReshapeLayer(l_pa_zy, (-1, self.sym_samples, 1, n_a)) self.l_pa_mu = ReshapeLayer(l_pa_zy_mu, (-1, self.sym_samples, 1, n_a)) self.l_pa_logvar = ReshapeLayer(l_pa_zy_logvar, (-1, self.sym_samples, 1, n_a)) self.l_px = ReshapeLayer(l_px_azy, (-1, n_l, self.sym_samples, 1, n_c)) self.l_px_mu = ReshapeLayer(l_px_zy_mu, (-1, n_l, self.sym_samples, 1, n_c)) \ if x_dist == "gaussian" else None self.l_px_logvar = ReshapeLayer(l_px_zy_logvar, (-1, n_l, self.sym_samples, 1, n_c)) \ if x_dist == "gaussian" else None # Predefined functions inputs = [self.sym_x_l, self.sym_samples] outputs = get_output(self.l_qy, self.sym_x_l, deterministic=True).mean(axis=(1, 2)) self.f_qy = theano.function(inputs, outputs) inputs = [self.sym_x_l, self.sym_samples] outputs = get_output(self.l_qa, self.sym_x_l, deterministic=True).mean(axis=(1, 2)) self.f_qa = theano.function(inputs, outputs) inputs = {l_x_in: self.sym_x_l, l_y_in: self.sym_t_l} outputs = get_output(l_qz_axy, inputs, deterministic=True) self.f_qz = theano.function( [self.sym_x_l, self.sym_t_l, self.sym_samples], outputs) inputs = {l_qz_axy: self.sym_z, l_y_in: self.sym_t_l} outputs = get_output(self.l_pa, inputs, deterministic=True) self.f_pa = theano.function( [self.sym_z, self.sym_t_l, self.sym_samples], outputs) inputs = { l_qa_x: self.sym_a, l_qz_axy: self.sym_z, l_y_in: self.sym_t_l } outputs = get_output(self.l_px, inputs, deterministic=True).mean(axis=(2, 3)) self.f_px = theano.function( [self.sym_a, self.sym_z, self.sym_t_l, self.sym_samples], outputs) outputs = get_output(self.l_px_mu, inputs, deterministic=True).mean(axis=(2, 3)) self.f_mu = theano.function( [self.sym_a, self.sym_z, self.sym_t_l, self.sym_samples], outputs) outputs = get_output(self.l_px_logvar, inputs, deterministic=True).mean(axis=(2, 3)) self.f_var = theano.function( [self.sym_a, self.sym_z, self.sym_t_l, self.sym_samples], outputs) # Define model parameters self.model_params = get_all_params([self.l_qy, self.l_pa, self.l_px]) self.trainable_model_params = get_all_params( [self.l_qy, self.l_pa, self.l_px], trainable=True)
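# The labeled targets sym_t_l used throughout the semi-supervised models above
# are expected as a (batch, n_y) matrix, i.e. one-hot encoded class labels.
# A small helper for that encoding (an assumption about the data pipeline,
# which is not part of this listing):
import numpy as np

def one_hot(labels, n_y):
    """Convert integer class labels to a float32 one-hot matrix."""
    return np.eye(n_y, dtype=np.float32)[np.asarray(labels, dtype=np.int64)]

# e.g. t_l = one_hot([0, 3, 1], n_y=5) gives a (3, 5) matrix.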