def get_kl_divergence(shape, mu, sigma, prior, sample):
    """
    Single-sample estimate of the KL term between the variational posterior
    and a two-component, zero-mean Gaussian mixture prior.

    The returned scalar is

        sum(log q(theta | mu, sigma)) - log p(theta)

    with p(theta) = pi * N(0, sigma1) + (1 - pi) * N(0, sigma2).
    The log-probabilities of each prior component are summed over every
    entry of the flattened sample *before* the two components are mixed,
    i.e. the mixture is taken over the weight vector as a whole.

    shape  = not read by this function; kept for call compatibility
    mu     = first argument given to the posterior Normal
    sigma  = second argument given to the posterior Normal
    prior  = object exposing sigma1, sigma2 and pi_mix
    sample = tensor sampled from the posterior (flattened here)
    """
    # Treat the sampled weights as one long vector.
    theta = tf.reshape(sample, [-1])

    # Variational posterior and the two components of the prior.
    q_dist = Normal(mu, sigma)
    p_dist_a = Normal(0.0, prior.sigma1)
    p_dist_b = Normal(0.0, prior.sigma2)

    # sum( log q(theta | mu, sigma) )
    log_q = tf.reduce_sum(q_dist.log_prob(theta))

    # Log of each weighted mixture component.
    log_p_a = tf.reduce_sum(p_dist_a.log_prob(theta))
    log_p_a = log_p_a + tf.log(prior.pi_mix)
    log_p_b = tf.reduce_sum(p_dist_b.log_prob(theta))
    log_p_b = log_p_b + tf.log(1.0 - prior.pi_mix)

    # log p(theta): combine the two components in log space.
    log_p = tf.reduce_logsumexp([log_p_a, log_p_b])

    return log_q - log_p
def get_KL_divergence_Sample(shape, mu, sigma, prior, Z):
    """
    Sampled (Monte-Carlo) value of the KL term for one draw of the weights.

    Rather than evaluating the true KL divergence between the variational
    posterior and the prior, the difference of log-densities is evaluated
    at the specific sampled weights Z:

        KL_sample = log q(Z | mu, sigma) - log p(Z)

    - Posterior: independent Gaussian built as Normal(mu, sigma).
    - Prior: p = pi * N(0, sigma1) + (1 - pi) * N(0, sigma2); each
      component's log-probability is summed over all entries of Z before
      the two components are combined.

    Input:
        shape = not read by this function; kept for call compatibility
        mu    = first argument of the posterior Normal
        sigma = second argument of the posterior Normal
        prior = object exposing sigma1, sigma2 and pi_mix
        Z     = sampled weight values (the hidden variables)
    """
    # Collapse the sampled weights into a vector.
    weights = tf.reshape(Z, [-1])

    # Variational posterior over the weights.
    posterior = Normal(mu, sigma)

    # Gaussian components of the mixture prior.
    component_1 = Normal(0.0, prior.sigma1)
    component_2 = Normal(0.0, prior.sigma2)

    # Log-likelihood of the sample under the posterior:
    # sum( log q(Z | mu, sigma) )
    posterior_ll = tf.reduce_sum(posterior.log_prob(weights))

    # Log-likelihood of the sample under each weighted prior component.
    weighted_1 = tf.reduce_sum(component_1.log_prob(weights))
    weighted_1 = weighted_1 + tf.log(prior.pi_mix)
    weighted_2 = tf.reduce_sum(component_2.log_prob(weights))
    weighted_2 = weighted_2 + tf.log(1.0 - prior.pi_mix)

    # Mixture log-likelihood, combined in log space.
    prior_ll = tf.reduce_logsumexp([weighted_1, weighted_2])

    # The KL sample is the difference of the two log-likelihoods.
    return posterior_ll - prior_ll
def get_KL_divergence_Sample(shape, mu, sigma, prior, Z):
    """
    Evaluate one sample of the KL term at the sampled weights Z.

    The exact KL divergence between the variational posterior and the
    prior is not computed; instead this returns

        KL_sample = log q(Z | mu, sigma) - log p(Z)

    where
        q = Normal(mu, sigma), treated as independent over entries of Z
        p = pi * N(0, sigma1) + (1 - pi) * N(0, sigma2)

    Note that every prior component is first summed (in log space) over
    all entries of Z and only then mixed with its weight.

    Input:
        shape = not read by this function; kept for call compatibility
        mu    = first argument of the posterior Normal
        sigma = second argument of the posterior Normal
        prior = object exposing sigma1, sigma2 and pi_mix
        Z     = sampled weight values (the hidden variables)
    """
    # Flatten the hidden variables (weights).
    z_flat = tf.reshape(Z, [-1])

    # Distributions: variational posterior, then both prior components.
    q_dist = Normal(mu, sigma)
    p_dist_1 = Normal(0.0, prior.sigma1)
    p_dist_2 = Normal(0.0, prior.sigma2)

    # sum( log q(Z | mu, sigma) )
    log_q = tf.reduce_sum(q_dist.log_prob(z_flat))

    # log( pi * prod N(Z; 0, sigma1) )
    log_mix_1 = tf.reduce_sum(p_dist_1.log_prob(z_flat))
    log_mix_1 = log_mix_1 + tf.log(prior.pi_mix)

    # log( (1 - pi) * prod N(Z; 0, sigma2) )
    log_mix_2 = tf.reduce_sum(p_dist_2.log_prob(z_flat))
    log_mix_2 = log_mix_2 + tf.log(1.0 - prior.pi_mix)

    # log p(Z)
    log_p = tf.reduce_logsumexp([log_mix_1, log_mix_2])

    # Sampled KL value: difference of both log-likelihoods.
    return log_q - log_p
def KL_scale_mixture(shape, mu, sigma, prior, w):
    """Compute a sampled KL term for a scale-mixture Gaussian prior.

    Returns ``sum(log q(w | mu, sigma)) - log p(w)`` where the prior is a
    two-component zero-mean Gaussian mixture

        p(w) = pi * N(0, sigma_1) + (1 - pi) * N(0, sigma_2)

    Each component's log-probability is summed over all entries of ``w``
    before the components are mixed.

    Args:
        shape: not read by this function; kept for call compatibility
            (documented by the original as ``(n_unit, n_w)``).
        mu: first argument given to the posterior ``Normal``.
        sigma: second argument given to the posterior ``Normal``.
        prior: object exposing ``sigma_1``, ``sigma_2`` and ``pi``, where
            ``pi`` is the mixing weight of the first component.
        w: sampled weights.

    Returns:
        The sampled KL value as a tensor.
    """
    posterior = Normal(mu, sigma)
    part_post = posterior.log_prob(tf.reshape(w, [-1]))  # flatten

    prior_1 = Normal(0., prior.sigma_1)
    prior_2 = Normal(0., prior.sigma_2)

    part_1 = tf.reduce_sum(prior_1.log_prob(w)) + tf.log(prior.pi)
    # The second component carries the complementary weight (1 - pi).
    # Weighting both components with pi made the mixture weights sum to
    # 2 * pi instead of 1.
    part_2 = tf.reduce_sum(prior_2.log_prob(w)) + tf.log(1. - prior.pi)

    prior_mix = tf.stack([part_1, part_2])
    log_prior = tf.reduce_sum(tf.reduce_logsumexp(prior_mix, axis=0))

    KL = -log_prior + tf.reduce_sum(part_post)
    return KL
def __init__(self, policy, rate, train=True):
    """Wire a Gaussian policy into a loss and, optionally, an update op.

    Everything is built under the ``policy_estimator`` variable scope:
    ``policy.setup()`` is called, the policy's ``X``, ``a``, ``a_pred``
    and ``var`` tensors are re-exported on this object, a ``target``
    placeholder of shape ``[None, 1]`` is created, and the loss is the sum
    of ``log_prob(a) * target`` under ``Normal(a_pred, var)``.

    When ``train`` is true an RMSProp optimizer is created and ``update``
    applies the gradients of the loss, skipping variables whose gradient
    is ``None``.

    :param policy: object providing ``setup()``, ``X``, ``a``, ``a_pred``
        and ``var``
    :param rate: learning rate handed to the optimizer
    :param train: build the optimizer and update op when true
    """
    # NOTE(review): the loss is log_prob * target with no negation, and it
    # is minimised -- confirm the caller supplies a target with the
    # intended sign.
    # NOTE(review): policy.var is passed as the second Normal argument --
    # confirm whether it holds a variance or a standard deviation.
    self.rate = rate
    self.policy = policy

    with tf.variable_scope('policy_estimator'):
        self.policy.setup()

        # Re-export the policy's input tensors.
        self.X = policy.X
        self.a = policy.a

        self.target = tf.placeholder(
            dtype='float', shape=[None, 1], name='target')

        # Parameters of the Gaussian over actions.
        self.a_pred = policy.a_pred
        self.var = policy.var
        action_dist = Normal(self.a_pred, self.var)

        self.log_probs = action_dist.log_prob(self.a)
        self.losses = self.log_probs * self.target
        self.loss = tf.reduce_sum(self.losses, name='loss')

        if not train:
            return

        self.opt = tf.train.RMSPropOptimizer(rate, 0.99, 0.0, 1e-6)
        all_pairs = self.opt.compute_gradients(self.loss)
        # Drop variables the loss does not depend on.
        self.grads_and_vars = [
            (grad, variable)
            for grad, variable in all_pairs
            if grad is not None
        ]
        self.update = self.opt.apply_gradients(self.grads_and_vars)
def get_kl_divergence(shape, mu, sigma, prior, sample):
    """
    Estimate the KL term between posterior and prior from one sample.

    Computes log(q(theta)) - log(p(theta)) at the given sample, where

        q(theta) = Normal(mu, sigma), summed in log space over all entries
        p(theta) = pi*N(0,sigma1) + (1-pi)*N(0,sigma2)

    Each prior component is summed in log space over all entries of the
    flattened sample before being weighted and mixed.

    shape  = not read by this function; kept for call compatibility
    mu     = first argument given to the posterior Normal
    sigma  = second argument given to the posterior Normal
    prior  = the prior object; must expose sigma1, sigma2 and pi_mix
    sample = the sample from the posterior
    """
    # Flatten to a vector.
    flat = tf.reshape(sample, [-1])

    # Posterior q(theta | mu, sigma) and the two prior components.
    variational = Normal(mu, sigma)
    component_one = Normal(0.0, prior.sigma1)
    component_two = Normal(0.0, prior.sigma2)

    # sum( log[ q(theta | mu, sigma) ] )
    posterior_term = tf.reduce_sum(variational.log_prob(flat))

    # Weighted log-density of each mixture component.
    first_term = tf.reduce_sum(component_one.log_prob(flat))
    first_term = first_term + tf.log(prior.pi_mix)
    second_term = tf.reduce_sum(component_two.log_prob(flat))
    second_term = second_term + tf.log(1.0 - prior.pi_mix)

    # log[ p(theta) ] for the mixture prior.
    prior_term = tf.reduce_logsumexp([first_term, second_term])

    # Sampled KL value.
    return posterior_term - prior_term
def __call__(
    self,
    states,
    actions,
    next_states,
    initial_omega=8,
    training_set_size=4000,
    actions_one_hot=None,
    sess=None,
    summary_writer=None,
):
    """Build the graph of a Gaussian next-state model and its fitting op.

    Two small networks share the same input: a tanh network predicts the
    mean of each state dimension and a sigmoid network predicts the
    log-variance. Their outputs parameterise a ``Normal`` whose negative
    mean log-likelihood is the fitting loss, minimised with Adam over the
    eight network variables only (``omega`` is not in ``fitting_vars``).

    The graph has two input paths selected by the ``train_or_test``
    placeholder through ``U.switch``: the ``X``/``Y`` placeholders, or
    tensors assembled here from ``states``, ``actions`` and
    ``next_states`` (presumably the placeholders are used when
    ``train_or_test`` is true -- verify against ``U.switch``).

    :param states: Nxm matrix
    :param actions: Vector of all possible actions: Nx n_actions; only
        columns 0 and 1 are read
    :param next_states: Nxm matrix containing the next states
    :param initial_omega: value of the initial omega
    :param training_set_size: stored on the instance, not used here
    :param actions_one_hot: multiplied with the per-action means and
        variances to select one action per row. NOTE(review): defaults to
        None but is passed to ``tf.multiply`` unconditionally.
    :param sess: stored on the instance, not used here
    :param summary_writer: stored on the instance, not used here
    :return: ``(self.log_prob, self.prob)``, the per-row log-probability
        and probability with the two action halves placed side by side
    """
    self.sess = sess
    self.training_set_size = training_set_size
    self.summary_writer = summary_writer
    # Boolean scalar placeholder selecting the input path.
    train_or_test = U.get_placeholder("train_or_test", tf.bool, ())
    # statistics
    self.Xmean_ph = U.get_placeholder(
        name="Xmean", dtype=self.dtype, shape=(1, self.x_dim)
    )
    self.Ymean_ph = U.get_placeholder(
        name="Ymean", dtype=self.dtype, shape=(1, self.state_dim)
    )
    self.Xstd_ph = U.get_placeholder(
        name="Xstd", dtype=self.dtype, shape=(1, self.x_dim)
    )
    self.Ystd_ph = U.get_placeholder(
        name="Ystd", dtype=self.dtype, shape=(1, self.state_dim)
    )
    # Placeholders for externally supplied inputs and targets.
    self.X = U.get_placeholder(name="X", dtype=self.dtype, shape=(None, self.x_dim))
    self.Y = U.get_placeholder(
        name="Y", dtype=self.dtype, shape=(None, self.state_dim)
    )
    # NOTE(review): the original indentation was lost; the extent of this
    # ``with`` block is inferred -- confirm against the upstream file.
    with tf.variable_scope(self.name):
        # build the action vector
        self.omega = tf.get_variable(
            dtype=self.dtype,
            name="omega",
            shape=(),
            initializer=tf.initializers.constant(initial_omega),
        )
        # The placeholders are used as-is; the trailing fragments are the
        # remains of a disabled normalisation.
        X = self.X  # - Xmean_) / Xstd_
        Y = self.Y  # - YMean_) / Ystd_
        # build the action vector
        forces = self.omega * actions
        # Stack the two action columns on top of each other: rows
        # [0, batch) hold action 0, rows [batch, 2 * batch) hold action 1.
        forces_full = tf.concat(
            [tf.reshape(forces[:, 0], (-1, 1)), tf.reshape(forces[:, 1], (-1, 1))],
            axis=0,
        )
        batch_size = tf.shape(states)[0]
        # Duplicate the states so each one is paired with both actions,
        # then standardise with the supplied statistics.
        x_full = tf.concat([states, states], axis=0)
        x_full = tf.concat([x_full, forces_full], axis=1)
        x_full = (x_full - self.Xmean_ph) / self.Xstd_ph
        next_states_full = tf.concat([next_states, next_states], axis=0)
        next_states_full = (next_states_full - self.Ymean_ph) / self.Ystd_ph
        # build the network
        hidden_layer_size = 10
        biases = tf.get_variable(
            "b",
            [hidden_layer_size],
            initializer=tf.random_normal_initializer(0, 0.001, dtype=self.dtype),
            dtype=self.dtype,
        )
        W = tf.get_variable(
            "W",
            [self.x_dim, hidden_layer_size],
            initializer=tf.random_normal_initializer(0, 0.001, dtype=self.dtype),
            dtype=self.dtype,
        )
        x_input = U.switch(train_or_test, X, x_full)
        h = tf.matmul(x_input, W)
        h = tf.tanh(h + biases)
        # now we need state_dim output neurons, one for each state dimension to predict
        biases_out = tf.get_variable(
            "b_out",
            [self.state_dim],
            initializer=tf.random_normal_initializer(0, 0.001, dtype=self.dtype),
            dtype=self.dtype,
        )
        W_out = tf.get_variable(
            "W_out",
            [hidden_layer_size, self.state_dim],
            initializer=tf.random_normal_initializer(0, 0.001, dtype=self.dtype),
            dtype=self.dtype,
        )
        means = tf.matmul(h, W_out) + biases_out
        # x_input_first = x_input[:, 0:self.x_dim - 1]
        # forces = tf.reshape(x_input[:, self.x_dim - 1], (-1, 1))
        # x_input = tf.concat([x_input_first, tf.abs(forces)], axis=1)
        # Separate network for the variance, fed with the same input.
        hidden_var = 10
        biases_var = tf.get_variable(
            "b_var",
            [hidden_var],
            initializer=tf.random_normal_initializer(0, 0.001, dtype=self.dtype),
            dtype=self.dtype,
        )
        W_var = tf.get_variable(
            "W_var",
            [self.x_dim, hidden_var],
            initializer=tf.random_normal_initializer(0, 0.001, dtype=self.dtype),
            dtype=self.dtype,
        )
        h = tf.nn.sigmoid(tf.matmul(x_input, W_var) + biases_var)
        W_out_var = tf.get_variable(
            "W_out_var",
            [hidden_var, self.state_dim],
            initializer=tf.random_normal_initializer(0, 0.001, dtype=self.dtype),
            dtype=self.dtype,
        )
        biases_out_var = tf.get_variable(
            "b_out_var",
            [self.state_dim],
            initializer=tf.random_normal_initializer(0, 0.001, dtype=self.dtype),
            dtype=self.dtype,
        )
        # exp keeps the variance positive; Normal is given its square root.
        var = tf.exp(tf.matmul(h, W_out_var) + biases_out_var)
        std = tf.sqrt(var)
        pdf = Normal(means, std)
        y_output = U.switch(train_or_test, Y, next_states_full)
        # Joint (log-)probability over the state dimensions of each row.
        log_prob = tf.reduce_sum(pdf.log_prob(y_output), axis=1, keepdims=True)
        prob = tf.reduce_prod(pdf.prob(y_output), axis=1, keepdims=True)
        # loss is the negative log likelihood
        self.loss = -tf.reduce_mean(log_prob)
        # Same expression as self.loss; it differs only through the data fed.
        self.valid_loss = -tf.reduce_mean(log_prob)
        self.fitting_vars = [
            biases,
            W,
            biases_out,
            W_out,
            biases_var,
            W_var,
            W_out_var,
            biases_out_var,
        ]
        # create fitting collection
        for v in self.fitting_vars:
            tf.add_to_collection("fitting", v)
        opt = tf.train.AdamOptimizer()
        self.minimize_op = opt.minimize(self.loss, var_list=self.fitting_vars)
        # Split the stacked rows back into the two action halves and put
        # them side by side: column 0 is action 0, column 1 is action 1.
        log_prob_a0 = log_prob[0:batch_size, :]
        log_prob_a1 = log_prob[batch_size:, :]
        prob_a0 = prob[0:batch_size, :]
        prob_a1 = prob[batch_size:, :]
        self.log_prob = tf.concat([log_prob_a0, log_prob_a1], axis=1)
        self.prob = tf.concat([prob_a0, prob_a1], axis=1)
        means_list = []
        var_list = []
        for i in range(self.state_dim):
            # Per state dimension: gather the prediction for both actions
            # and keep the one selected by actions_one_hot.
            means_a0 = tf.reshape(means[0:batch_size, i], (-1, 1))
            means_a1 = tf.reshape(means[batch_size : 2 * batch_size, i], (-1, 1))
            means_actions = tf.concat([means_a0, means_a1], axis=1)
            means_ = tf.reduce_sum(
                tf.multiply(means_actions, actions_one_hot), axis=1, keepdims=True
            )
            means_list.append(means_)
            # same for variance
            var_a0 = tf.reshape(var[0:batch_size, i], (-1, 1))
            var_a1 = tf.reshape(var[batch_size : 2 * batch_size, i], (-1, 1))
            var_actions = tf.concat([var_a0, var_a1], axis=1)
            var_ = tf.reduce_sum(
                tf.multiply(var_actions, actions_one_hot), axis=1, keepdims=True
            )
            var_list.append(var_)
        self.means = tf.concat(means_list, axis=1)
        self.variances = tf.concat(var_list, axis=1)
        self.train_or_test = train_or_test
        self.loss_summary = tf.summary.scalar("Loss", self.loss)
        self.valid_loss_summary = tf.summary.scalar("ValidLoss", self.valid_loss)
    return self.log_prob, self.prob
class AIRModel(object):
    """Generic AIR model.

    The constructor unrolls an ``AIRCell`` for ``max_steps`` steps over the
    observations and exposes the cell outputs as attributes;
    ``train_step`` then assembles the reconstruction, KL and REINFORCE
    losses and returns the optimisation op.

    NOTE(review): the original indentation of this class was lost; the
    nesting of ``with``/``if`` blocks below is inferred -- confirm against
    the upstream file.
    """

    def __init__(self, obs, nums, max_steps, glimpse_size, n_appearance,
                 transition, input_encoder, glimpse_encoder, glimpse_decoder,
                 transform_estimator, steps_predictor, output_std=1.,
                 discrete_steps=True, output_multiplier=1., explore_eps=None,
                 debug=False, **kwargs):
        """Creates the model.

        :param obs: tf.Tensor, images
        :param nums: tf.Tensor, number of objects in images. In this class
            it is only summed into ``gt_num_steps``, which feeds the
            ``num_step_accuracy`` metric.
        :param max_steps: int, maximum number of steps to take (or objects
            in the image)
        :param glimpse_size: tuple of ints, size of the attention glimpse
        :param n_appearance: int, number of latent variables describing an
            object
        :param transition: see :class: AIRCell
        :param input_encoder: see :class: AIRCell
        :param glimpse_encoder: see :class: AIRCell
        :param glimpse_decoder: see :class: AIRCell
        :param transform_estimator: see :class: AIRCell
        :param steps_predictor: see :class: AIRCell
        :param output_std: float, std. dev. of the output Gaussian
            distribution
        :param discrete_steps: see :class: AIRCell
        :param output_multiplier: float, a factor that multiplies the
            reconstructed glimpses
        :param explore_eps: see :class: AIRCell
        :param debug: see :class: AIRCell
        :param **kwargs: all other parameters are passed to AIRCell
        """
        self.obs = obs
        self.nums = nums
        self.max_steps = max_steps
        self.glimpse_size = glimpse_size
        self.n_appearance = n_appearance
        self.output_std = output_std
        self.discrete_steps = discrete_steps
        self.explore_eps = explore_eps
        self.debug = debug
        with tf.variable_scope(self.__class__.__name__):
            # Non-trainable variable that scales the canvas in _build.
            self.output_multiplier = tf.Variable(output_multiplier,
                                                 dtype=tf.float32,
                                                 trainable=False,
                                                 name='canvas_multiplier')
            # The first static dimension of obs is the batch size; the
            # remaining dimensions are taken as the image size.
            shape = self.obs.get_shape().as_list()
            self.batch_size = shape[0]
            self.img_size = shape[1:]
            self._build(transition, input_encoder, glimpse_encoder,
                        glimpse_decoder, transform_estimator, steps_predictor,
                        kwargs)

    def _build(self, transition, input_encoder, glimpse_encoder,
               glimpse_decoder, transform_estimator, steps_predictor, kwargs):
        """Build the model.

        Creates the ``AIRCell``, unrolls it with ``tf.nn.dynamic_rnn`` and
        stores every cell output under the name given by
        ``cell.output_names``. See __init__ for argument description.
        """
        if self.explore_eps is not None:
            # Replace the Python value with a non-trainable variable.
            self.explore_eps = tf.get_variable('explore_eps',
                                               initializer=self.explore_eps,
                                               trainable=False)
        self.cell = AIRCell(self.img_size,
                            self.glimpse_size,
                            self.n_appearance,
                            transition,
                            input_encoder,
                            glimpse_encoder,
                            glimpse_decoder,
                            transform_estimator,
                            steps_predictor,
                            canvas_init=None,
                            discrete_steps=self.discrete_steps,
                            explore_eps=self.explore_eps,
                            debug=self.debug,
                            **kwargs)
        initial_state = self.cell.initial_state(self.obs)
        # All-zero, time-major input whose only role here is to fix the
        # number of unrolled steps.
        dummy_sequence = tf.zeros((self.max_steps, self.batch_size, 1),
                                  name='dummy_sequence')
        outputs, state = tf.nn.dynamic_rnn(self.cell,
                                           dummy_sequence,
                                           initial_state=initial_state,
                                           time_major=True)
        # Expose each output as an attribute (the code below relies on
        # presence, glimpse, canvas and presence_prob being among them).
        for name, output in zip(self.cell.output_names, outputs):
            setattr(self, name, output)
        self.final_state = state[-2]
        # Squash glimpses with a sigmoid, mask them by presence and
        # reshape to (max_steps, batch_size) + glimpse_size.
        self.glimpse = tf.reshape(self.presence * tf.nn.sigmoid(self.glimpse), (
            self.max_steps,
            self.batch_size,
        ) + tuple(self.glimpse_size))
        self.canvas = tf.reshape(self.canvas, (
            self.max_steps,
            self.batch_size,
        ) + tuple(self.img_size))
        self.canvas *= self.output_multiplier
        # The canvas after the last step is the reconstruction.
        self.final_canvas = self.canvas[-1]
        self.output_distrib = Normal(self.final_canvas, self.output_std)
        posterior_step_probs = tf.transpose(tf.squeeze(self.presence_prob))
        self.num_steps_distrib = NumStepsDistribution(posterior_step_probs)
        # Number of steps taken per sample: presence summed over steps.
        self.num_step_per_sample = tf.to_float(
            tf.squeeze(tf.reduce_sum(self.presence, 0)))
        self.num_step = tf.reduce_mean(self.num_step_per_sample)
        self.gt_num_steps = tf.squeeze(tf.reduce_sum(self.nums, 0))

    @staticmethod
    def _anneal_weight(init_val, final_val, anneal_type, global_step,
                       anneal_steps, hold_for=0., steps_div=1.,
                       dtype=tf.float64):
        """Return a value annealed from ``init_val`` towards ``final_val``.

        :param init_val: starting value
        :param final_val: value the schedule moves towards; the result is
            clamped from below with ``tf.maximum(final, val)``
        :param anneal_type: 'exp' or 'linear'
        :param global_step: current step
        :param anneal_steps: number of steps over which to anneal
        :param hold_for: number of initial steps during which the schedule
            does not advance
        :param steps_div: decay-steps argument of the exponential schedule
        :param dtype: all numeric arguments are cast to this type
        :raises NotImplementedError: for any other ``anneal_type``
        """
        val, final, step, hold_for, anneal_steps, steps_div = (tf.cast(
            i, dtype) for i in (init_val, final_val, global_step, hold_for,
                                anneal_steps, steps_div))
        # The schedule only starts moving after hold_for steps.
        step = tf.maximum(step - hold_for, 0.)
        if anneal_type == 'exp':
            # Decay rate chosen so that val * rate ** (step / steps_div)
            # equals final when step == anneal_steps.
            decay_rate = tf.pow(final / val, steps_div / anneal_steps)
            val = tf.train.exponential_decay(val, step, steps_div, decay_rate)
        elif anneal_type == 'linear':
            val = final + (val - final) * (1. - step / anneal_steps)
        else:
            raise NotImplementedError
        anneal_weight = tf.maximum(final, val)
        return anneal_weight

    def _prior_loss(self, what_prior, where_scale_prior, where_shift_prior,
                    num_steps_prior, global_step):
        """Creates KL-divergence term of the loss.

        Adds up to three terms to a ``Loss`` object: the KL on the number
        of steps, the KL on the 'what' latents and the KL on the 'where'
        latents. Each term is only added when its prior is given.

        NOTE(review): ``self.prior_step_weight`` is only set when
        ``num_steps_prior`` is not None, but the 'what' and 'where' terms
        read it unconditionally.

        :return: the ``Loss`` object holding the added terms
        """
        with tf.variable_scope('KL_divergence'):
            prior_loss = Loss()
            if num_steps_prior is not None:
                if num_steps_prior.anneal is not None:
                    with tf.variable_scope('num_steps_prior'):
                        nsp = num_steps_prior
                        hold_init = getattr(nsp, 'hold_init', 0.)
                        steps_div = getattr(nsp, 'steps_div', 1.)
                        steps_prior_success_prob = self._anneal_weight(
                            nsp.init, nsp.final, nsp.anneal, global_step,
                            nsp.steps, hold_init, steps_div)
                else:
                    # No annealing: use the initial value throughout.
                    steps_prior_success_prob = num_steps_prior.init
                self.steps_prior_success_prob = steps_prior_success_prob
                with tf.variable_scope('num_steps'):
                    prior = geometric_prior(steps_prior_success_prob,
                                            self.max_steps)
                    num_steps_posterior_prob = self.num_steps_distrib.prob()
                    steps_kl = tabular_kl(num_steps_posterior_prob, prior)
                    self.kl_num_steps_per_sample = tf.squeeze(
                        tf.reduce_sum(steps_kl, 1))
                    self.kl_num_steps = tf.reduce_mean(
                        self.kl_num_steps_per_sample)
                    tf.summary.scalar('kl_num_steps', self.kl_num_steps)
                    weight = getattr(num_steps_prior, 'weight', 1.)
                    prior_loss.add(self.kl_num_steps,
                                   self.kl_num_steps_per_sample,
                                   weight=weight)
                if num_steps_prior.analytic:
                    # reverse cumsum of q(n) needed to compute \E_{q(n)} [ KL[ q(z|n) || p(z|n) ]]
                    step_weight = num_steps_posterior_prob[..., 1:]
                    step_weight = tf.transpose(step_weight, (1, 0))
                    step_weight = tf.cumsum(step_weight, axis=0, reverse=True)
                else:
                    # Otherwise weight each step by the sampled presence.
                    step_weight = tf.squeeze(self.presence)
                self.prior_step_weight = step_weight
                # # this prevents optimising the expectation with respect to q(n)
                # # it's similar to the maximisation step of EM: we have a pre-computed expectation
                # # from the E step, and now we're maximising with respect to the argument of the expectation.
                # self.prior_step_weight = tf.stop_gradient(self.prior_step_weight)
            conditional_kl_weight = 1.
            if what_prior is not None:
                with tf.variable_scope('what'):
                    prior = Normal(what_prior.loc, what_prior.scale)
                    posterior = Normal(self.what_loc, self.what_scale)
                    what_kl = _kl(posterior, prior)
                    # Sum over the last axis, then weight every step.
                    what_kl = tf.reduce_sum(what_kl,
                                            -1) * self.prior_step_weight
                    what_kl_per_sample = tf.reduce_sum(what_kl, 0)
                    self.kl_what = tf.reduce_mean(what_kl_per_sample)
                    tf.summary.scalar('kl_what', self.kl_what)
                    prior_loss.add(self.kl_what,
                                   what_kl_per_sample,
                                   weight=conditional_kl_weight)
            if where_scale_prior is not None and where_shift_prior is not None:
                with tf.variable_scope('where'):
                    # The last axis of where_loc / where_scale is split
                    # into four parts, regrouped as scale (x, y) and
                    # shift (x, y).
                    usx, utx, usy, uty = tf.split(self.where_loc, 4, 2)
                    ssx, stx, ssy, sty = tf.split(self.where_scale, 4, 2)
                    us = tf.concat((usx, usy), -1)
                    ss = tf.concat((ssx, ssy), -1)
                    scale_distrib = Normal(us, ss)
                    scale_prior = Normal(where_scale_prior.loc,
                                         where_scale_prior.scale)
                    scale_kl = _kl(scale_distrib, scale_prior)
                    ut = tf.concat((utx, uty), -1)
                    st = tf.concat((stx, sty), -1)
                    shift_distrib = Normal(ut, st)
                    if 'loc' in where_shift_prior:
                        shift_mean = where_shift_prior.loc
                    else:
                        # Without a prior mean, the posterior mean is
                        # reused as the mean of the shift prior.
                        shift_mean = ut
                    shift_prior = Normal(shift_mean, where_shift_prior.scale)
                    shift_kl = _kl(shift_distrib, shift_prior)
                    where_kl = tf.reduce_sum(scale_kl + shift_kl,
                                             -1) * self.prior_step_weight
                    where_kl_per_sample = tf.reduce_sum(where_kl, 0)
                    self.kl_where = tf.reduce_mean(where_kl_per_sample)
                    tf.summary.scalar('kl_where', self.kl_where)
                    prior_loss.add(self.kl_where,
                                   where_kl_per_sample,
                                   weight=conditional_kl_weight)
        return prior_loss

    def _reinforce(self, importance_weight, decay_rate):
        """Implements REINFORCE for training the discrete probability
        distribution over number of steps and train-step for the baseline.

        :param importance_weight: per-sample learning signal
        :param decay_rate: when not None, the signal is centred by a moving
            mean and divided by ``max(sqrt(moving variance), 1)``
        :return: mean of ``stop_gradient(importance_weight) * log_prob``
        """
        log_prob = self.num_steps_distrib.log_prob(self.num_step_per_sample)
        if self.baseline is not None:
            if not isinstance(self.baseline, tf.Tensor):
                # The baseline was given as a callable: build it now and
                # collect the variables under its scope.
                self.baseline_module = self.baseline
                self.baseline = self.baseline_module(self.obs, self.what,
                                                     self.where,
                                                     self.presence,
                                                     self.final_state)
                self.baseline_vars = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES,
                    scope=self.baseline_module.variable_scope.name)
            importance_weight -= self.baseline
        if decay_rate is not None:
            axes = range(len(importance_weight.get_shape()))
            mean, var = tf.nn.moments(tf.squeeze(importance_weight),
                                      axes=axes)
            self.imp_weight_moving_mean = make_moving_average(
                'imp_weight_moving_mean', mean, 0., decay_rate)
            self.imp_weight_moving_var = make_moving_average(
                'imp_weight_moving_var', var, 1., decay_rate)
            # Never divide by less than one.
            factor = tf.maximum(tf.sqrt(self.imp_weight_moving_var), 1.)
            importance_weight = (importance_weight -
                                 self.imp_weight_moving_mean) / factor
        self.importance_weight = importance_weight
        axes = range(len(self.importance_weight.get_shape()))
        imp_weight_mean, imp_weight_var = tf.nn.moments(
            self.importance_weight, axes)
        tf.summary.scalar('imp_weight_mean', imp_weight_mean)
        tf.summary.scalar('imp_weight_var', imp_weight_var)
        # The signal is treated as a constant: gradients flow only
        # through log_prob.
        reinforce_loss_per_sample = tf.stop_gradient(
            self.importance_weight) * log_prob
        self.reinforce_loss = tf.reduce_mean(reinforce_loss_per_sample)
        tf.summary.scalar('reinforce_loss', self.reinforce_loss)
        return self.reinforce_loss

    def _make_baseline_train_step(self, opt, loss, baseline, baseline_vars):
        """Create the op that regresses ``baseline`` onto ``loss``.

        The target is wrapped in ``stop_gradient`` and the squared-error
        loss is minimised with respect to ``baseline_vars`` only.
        """
        baseline_target = tf.stop_gradient(loss)
        self.baseline_loss = .5 * tf.reduce_mean(
            tf.square(baseline_target - baseline))
        tf.summary.scalar('baseline_loss', self.baseline_loss)
        train_step = opt.minimize(self.baseline_loss, var_list=baseline_vars)
        return train_step

    def train_step(self, learning_rate, l2_weight=0., what_prior=None,
                   where_scale_prior=None, where_shift_prior=None,
                   num_steps_prior=None, use_prior=True, use_reinforce=True,
                   baseline=None, decay_rate=None,
                   optimizer=tf.train.RMSPropOptimizer,
                   opt_kwargs=dict(momentum=.9, centered=True)):
        """Creates the train step and the global_step

        NOTE(review): ``num_steps_prior`` defaults to None, but the first
        statement assigns ``num_steps_prior['analytic']`` unconditionally,
        so a prior object has to be supplied.

        :param learning_rate: float or tf.Tensor
        :param l2_weight: float or tf.Tensor, if > 0. then adds l2
            regularisation to the model
        :param what_prior: AttrDict or similar, with `loc` and `scale`,
            both floats
        :param where_scale_prior: AttrDict or similar, with `loc` and
            `scale`, both floats
        :param where_shift_prior: AttrDict or similar, with `scale` and an
            optional `loc`
        :param num_steps_prior: AttrDict or similar, described as an
            example:

            >>> num_steps_prior = AttrDict(
            >>>     anneal='exp',   # type of annealing of the prior; can be 'exp', 'linear' or None
            >>>     init=1. - 1e-7, # initial value of the prior
            >>>     final=1e-5,     # final value of the prior
            >>>     steps_div=1e4,  # relevant for exponential annealing, see :func: tf.exponential_decay
            >>>     steps=1e5,      # number of steps for annealing
            >>>     analytic=True
            >>> )

            `init` and `final` describe success probability values in a
            geometric distribution; for example `init=.9` means that the
            probability of taking a single step is .9, two steps is .9**2
            etc.
        :param use_prior: boolean, if False sets the KL-divergence loss
            term to 0
        :param use_reinforce: boolean, if False doesn't compute gradients
            for the number of steps
        :param baseline: callable or None, baseline for variance reduction
            of REINFORCE; ignored when the instance already has a
            `baseline` attribute
        :param decay_rate: float, decay rate to use for exp-moving average
            for NVIL
        :param optimizer: optimizer class, called with the learning rate
            and `opt_kwargs`
        :param opt_kwargs: keyword arguments for `optimizer`; the default
            dict is shared between calls and is only read here
        :return: train step and global step
        """
        # Default `analytic` to True when the prior does not define it.
        num_steps_prior['analytic'] = getattr(num_steps_prior, 'analytic',
                                              True)
        self.l2_weight = l2_weight
        self.what_prior = what_prior
        self.where_scale_prior = where_scale_prior
        self.where_shift_prior = where_shift_prior
        self.num_steps_prior = num_steps_prior
        if not hasattr(self, 'baseline'):
            self.baseline = baseline
        self.use_prior = use_prior
        if self.use_prior is not None:
            # Boolean variable, so the prior term can be switched at run
            # time via toggle_prior.
            self.use_prior = tf.Variable(self.use_prior,
                                         trainable=False,
                                         name='use_prior')
            self.toggle_prior = self.use_prior.assign(
                tf.logical_not(self.use_prior))
        self.use_reinforce = use_reinforce
        with tf.variable_scope('loss'):
            global_step = tf.train.get_or_create_global_step()
            loss = Loss()
            self._train_step = []
            self.learning_rate = tf.Variable(learning_rate,
                                             name='learning_rate',
                                             trainable=False)
            make_opt = functools.partial(optimizer, **opt_kwargs)
            # Reconstruction Loss, - \E_q [ p(x | z, n) ]
            rec_loss_per_sample = -self.output_distrib.log_prob(self.obs)
            self.rec_loss_per_sample = tf.reduce_sum(rec_loss_per_sample,
                                                     axis=(1, 2))
            self.rec_loss = tf.reduce_mean(self.rec_loss_per_sample)
            tf.summary.scalar('rec', self.rec_loss)
            loss.add(self.rec_loss, self.rec_loss_per_sample)
            # Prior Loss, KL[ q(z, n | x) || p(z, n) ]
            if use_prior is not None:
                self.prior_loss = self._prior_loss(what_prior,
                                                   where_scale_prior,
                                                   where_shift_prior,
                                                   num_steps_prior,
                                                   global_step)
                tf.summary.scalar('prior', self.prior_loss.value)
                # 1.0 while the use_prior variable is True, 0.0 otherwise.
                self.prior_weight = tf.to_float(tf.equal(self.use_prior, True))
                loss.add(self.prior_loss, weight=self.prior_weight)
            # REINFORCE
            opt_loss = loss.value
            if use_reinforce:
                self.reinforce_imp_weight = self.rec_loss_per_sample
                if not num_steps_prior.analytic:
                    self.reinforce_imp_weight += self.prior_loss.per_sample
                reinforce_loss = self._reinforce(self.reinforce_imp_weight,
                                                 decay_rate)
                opt_loss += reinforce_loss
            # Baseline variables get their own train step below, so they
            # are excluded from the model update.
            baseline_vars = getattr(self, 'baseline_vars', [])
            model_vars = list(
                set(tf.trainable_variables()) - set(baseline_vars))
            # L2 reg
            if l2_weight > 0.:
                # don't penalise biases
                weights = [w for w in model_vars if len(w.get_shape()) == 2]
                self.l2_loss = l2_weight * sum(map(tf.nn.l2_loss, weights))
                opt_loss += self.l2_loss
                tf.summary.scalar('l2', self.l2_loss)
            opt = make_opt(self.learning_rate)
            gvs = opt.compute_gradients(opt_loss, var_list=model_vars)
            # Run the ops in the UPDATE_OPS collection before the update.
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                self._train_step = opt.apply_gradients(gvs,
                                                       global_step=global_step)
            if self.use_reinforce and self.baseline is not None:
                # The baseline is trained with ten times the learning
                # rate and grouped with the model update.
                baseline_opt = make_opt(10 * learning_rate)
                self._baseline_tran_step = self._make_baseline_train_step(
                    baseline_opt, self.reinforce_imp_weight, self.baseline,
                    self.baseline_vars)
                self._true_train_step = self._train_step
                self._train_step = tf.group(self._true_train_step,
                                            self._baseline_tran_step)
            tf.summary.scalar('num_step', self.num_step)
        # Metrics
        gradient_summaries(gvs)
        self.num_step_accuracy = tf.reduce_mean(
            tf.to_float(tf.equal(self.gt_num_steps, self.num_step_per_sample)))
        self.loss = loss
        self.opt_loss = opt_loss
        return self._train_step, global_step
def _build_ad_nn(self, tensor_io):
    """Build the actor-critic graph for the driving agent.

    The critic is a single-layer LSTM followed by two ``fc_selu`` layers
    and a scalar value head. The actor receives the critic features
    through ``tf.stop_gradient`` and outputs the mean and standard
    deviation of a two-dimensional Gaussian over (steering, acceleration).

    When the current context is not training, the sampled policy, value,
    means and sigmas are registered as outputs on ``tensor_io`` and the
    method returns None. Otherwise it builds the losses and summaries and
    returns ``(loss_policy, loss_value)``.

    NOTE(review): the original indentation was lost; the extent of the two
    ``with tf.variable_scope`` blocks is inferred -- confirm against the
    upstream file.

    :param tensor_io: a ``TensorIO`` providing the input tensors
    """
    from drlutils.dataflow.tensor_io import TensorIO
    assert (isinstance(tensor_io, TensorIO))
    from drlutils.model.base import get_current_nn_context
    from tensorpack.tfutils.common import get_global_step_var
    # Only referenced by the disabled exponential-decay code further down.
    global_step = get_global_step_var()
    nnc = get_current_nn_context()
    is_training = nnc.is_training
    i_state = tensor_io.getInputTensor('state')
    i_agentIdent = tensor_io.getInputTensor('agentIdent')
    i_sequenceLength = tensor_io.getInputTensor('sequenceLength')
    i_resetRNN = tensor_io.getInputTensor('resetRNN')
    l = i_state
    # l = tf.Print(l, [i_state, tf.shape(i_state)], 'State = ')
    # l = tf.Print(l, [i_agentIdent, tf.shape(i_agentIdent)], 'agentIdent = ')
    # l = tf.Print(l, [i_sequenceLength, tf.shape(i_sequenceLength)], 'SeqLen = ')
    # l = tf.Print(l, [i_resetRNN, tf.shape(i_resetRNN)], 'resetRNN = ')
    with tf.variable_scope('critic', reuse=nnc.reuse) as vs:

        def _get_cell():
            cell = tf.nn.rnn_cell.BasicLSTMCell(256)
            # if is_training:
            #     cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=0.9)
            return cell

        cell = tf.nn.rnn_cell.MultiRNNCell([_get_cell() for _ in range(1)])
        rnn_outputs = self._buildRNN(
            l,
            cell,
            tensor_io.batchSize,
            i_agentIdent=i_agentIdent,
            i_sequenceLength=i_sequenceLength,
            i_resetRNN=i_resetRNN,
        )
        # Merge all leading dimensions, keeping only the feature axis.
        rnn_outputs = tf.reshape(
            rnn_outputs, [-1, rnn_outputs.get_shape().as_list()[-1]])
        l = rnn_outputs
        from ad_cur.autodrive.model.selu import fc_selu
        for lidx in range(2):
            l = fc_selu(
                l,
                200,
                keep_prob=1.,  # we train from sensors only, so key information must not be dropped
                is_training=is_training,
                name='fc-{}'.format(lidx))
        value = tf.layers.dense(l, 1, name='fc-value')
        value = tf.squeeze(value, [1], name="value")
        # Record the critic's trainable variables once; they are
        # L2-regularised further down.
        if not hasattr(self, '_weights_critic'):
            self._weights_critic = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)
    with tf.variable_scope('actor', reuse=nnc.reuse) as vs:
        # The actor reads the critic features but does not send gradients
        # back into the critic.
        l = tf.stop_gradient(l)
        l = tf.layers.dense(l, 128, activation=tf.nn.relu6, name='fc-actor')
        # tanh output scaled by 0.5.
        mu_steering = 0.5 * tf.layers.dense(
            l, 1, activation=tf.nn.tanh, name='fc-mu-steering')
        mu_accel = tf.layers.dense(l,
                                   1,
                                   activation=tf.nn.tanh,
                                   name='fc-mu-accel')
        mus = tf.concat([mu_steering, mu_accel], axis=-1)

        # mus = tf.layers.dense(l, 2, activation=tf.nn.tanh, name='fc-mus')
        # sigmas = tf.layers.dense(l, 2, activation=tf.nn.softplus, name='fc-sigmas')
        # sigmas = tf.clip_by_value(sigmas, -0.001, 0.5)
        # Not called anywhere in this method.
        def saturating_sigmoid(x):
            """Saturating sigmoid: 1.2 * sigmoid(x) - 0.1 cut to [0, 1]."""
            with tf.name_scope("saturating_sigmoid", [x]):
                y = tf.sigmoid(x)
                return tf.minimum(1.0, tf.maximum(0.0, 1.2 * y - 0.1))

        # Sigmoid outputs scaled by 0.1 (steering) and 0.25 (acceleration).
        sigma_steering_ = 0.1 * tf.layers.dense(
            l, 1, activation=tf.nn.sigmoid, name='fc-sigma-steering')
        sigma_accel_ = 0.25 * tf.layers.dense(
            l, 1, activation=tf.nn.sigmoid, name='fc-sigma-accel')
        if not nnc.is_evaluating:
            sigma_beta_steering = tf.get_default_graph(
            ).get_tensor_by_name('actor/sigma_beta_steering:0')
            sigma_beta_accel = tf.get_default_graph().get_tensor_by_name(
                'actor/sigma_beta_accel:0')
            # NOTE(review): this overwrites the tensor fetched just above
            # with a constant -- confirm it is intentional.
            sigma_beta_steering = tf.constant(1e-4)
            # sigma_beta_steering_exp = tf.train.exponential_decay(0.3, global_step, 1000, 0.5, name='sigma/beta/steering/exp')
            # sigma_beta_accel_exp = tf.train.exponential_decay(0.5, global_step, 5000, 0.5, name='sigma/beta/accel/exp')
        else:
            sigma_beta_steering = tf.constant(1e-4)
            sigma_beta_accel = tf.constant(1e-4)
        # Add the beta offset to each network sigma.
        sigma_steering = (sigma_steering_ + sigma_beta_steering)
        sigma_accel = (sigma_accel_ + sigma_beta_accel)
        sigmas = tf.concat([sigma_steering, sigma_accel], axis=-1)
        # if is_training:
        #     pass
        # # Without sigma_beta, convergence is very slow and unstable; the
        # # suspected reasons are:
        # # 1. Exploring as widely as possible early in training keeps the
        # #    network out of local optima.
        # # 2. A sigma that is too small early on makes the normal
        # #    distribution's log_prob too large, which causes oversized
        # #    gradient updates; the network is distorted from the start
        # #    and hardly recovers.
        # #
        # if is_training:
        #     sigmas += sigma_beta_steering
        # sigma_steering = tf.clip_by_value(sigma_steering, sigma_beta_steering, 0.5)
        # sigma_accel = tf.clip_by_value(sigma_accel, sigma_beta_accel, 0.5)
        # sigmas = tf.clip_by_value(sigmas, 0.1, 0.5)
        # sigmas_orig = sigmas
        # sigmas = sigmas + sigma_beta_steering
        # sigmas = tf.minimum(sigmas + 0.1, 100)
        # sigmas = tf.clip_by_value(sigmas, sigma_beta_steering, 1)
        # sigma_steering += sigma_beta_steering
        # sigma_accel += sigma_beta_accel
        # mus = tf.concat([mu_steering, mu_accel], axis=-1)
        from tensorflow.contrib.distributions import Normal
        # The distribution uses sigmas + 0.01; the clipping below uses
        # sigmas without that offset.
        dists = Normal(mus, sigmas + 0.01)
        policy = tf.squeeze(dists.sample([1]), [0])
        # Clip the sampled action to within two sigmas of the mean.
        policy = tf.clip_by_value(policy, mus - 2 * sigmas, mus + 2 * sigmas)
        if is_training:
            self._addMovingSummary(
                tf.reduce_mean(mu_steering, name='mu/steering/mean'),
                tf.reduce_mean(mu_accel, name='mu/accel/mean'),
                tf.reduce_mean(sigma_steering, name='sigma/steering/mean'),
                tf.reduce_max(sigma_steering, name='sigma/steering/max'),
                tf.reduce_mean(sigma_accel, name='sigma/accel/mean'),
                tf.reduce_max(sigma_accel, name='sigma/accel/max'),
                # sigma_beta_accel,
                # sigma_beta_steering,
            )
        # actions = tf.Print(actions, [mus, sigmas, tf.concat([sigma_steering_, sigma_accel_], -1), actions],
        #                    'mu/sigma/sigma.orig/act=', summarize=4)
        # Record the actor's trainable variables once.
        if not hasattr(self, '_weights_actor'):
            self._weights_actor = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)
    if not is_training:
        # Outside training only the outputs are needed; no loss is built.
        tensor_io.setOutputTensors(policy, value, mus, sigmas)
        return
    i_actions = tensor_io.getInputTensor("action")
    # i_actions = tf.Print(i_actions, [i_actions], 'actions = ')
    # Merge the first two dimensions of the input.
    i_actions = tf.reshape(i_actions,
                           [-1] + i_actions.get_shape().as_list()[2:])
    log_probs = dists.log_prob(i_actions)
    # exp_v = tf.transpose(
    #     tf.multiply(tf.transpose(log_probs), advantage))
    # exp_v = tf.multiply(log_probs, advantage)
    i_advantage = tensor_io.getInputTensor("advantage")
    i_advantage = tf.reshape(i_advantage,
                             [-1] + i_advantage.get_shape().as_list()[2:])
    # The advantage gets a trailing axis so it multiplies both action
    # dimensions of log_probs.
    exp_v = log_probs * tf.expand_dims(i_advantage, -1)
    entropy = dists.entropy()
    entropy_beta = tf.get_variable(
        'entropy_beta',
        shape=[],
        initializer=tf.constant_initializer(0.01),
        trainable=False)
    # Entropy bonus weighted by entropy_beta.
    exp_v = entropy_beta * entropy + exp_v
    # Negated, so minimising this loss maximises exp_v.
    loss_policy = tf.reduce_mean(-tf.reduce_sum(exp_v, axis=-1),
                                 name='loss/policy')
    i_futurereward = tensor_io.getInputTensor("futurereward")
    i_futurereward = tf.reshape(i_futurereward, [-1] +
                                i_futurereward.get_shape().as_list()[2:])
    loss_value = tf.reduce_mean(0.5 * tf.square(value - i_futurereward))
    # Computed for the summary only; it is not part of either returned loss.
    loss_entropy = tf.reduce_mean(tf.reduce_sum(entropy, axis=-1),
                                  name='xentropy_loss')
    from tensorflow.contrib.layers.python.layers.regularizers import apply_regularization, l2_regularizer
    # L2 penalty over the critic variables, added to the value loss.
    loss_l2_regularizer = apply_regularization(l2_regularizer(1e-4),
                                               self._weights_critic)
    loss_l2_regularizer = tf.identity(loss_l2_regularizer, 'loss/l2reg')
    loss_value += loss_l2_regularizer
    loss_value = tf.identity(loss_value, name='loss/value')
    # self.cost = tf.add_n([loss_policy, loss_value * 0.1, loss_l2_regularizer])
    self._addParamSummary([('.*', ['rms', 'absmax'])])
    pred_reward = tf.reduce_mean(value, name='predict_reward')
    import tensorpack.tfutils.symbolic_functions as symbf
    advantage = symbf.rms(i_advantage, name='rms_advantage')
    self._addMovingSummary(
        loss_policy,
        loss_value,
        loss_entropy,
        pred_reward,
        advantage,
        loss_l2_regularizer,
        tf.reduce_mean(policy[:, 0], name='actor/steering/mean'),
        tf.reduce_mean(policy[:, 1], name='actor/accel/mean'),
    )
    return loss_policy, loss_value
model_lambda = tf.exp(model_log_lambda) model_gamma = tf.exp(model_log_gamma) model_w_1 = tf.Variable(tf.zeros([n_feats, n_hidden])) model_b_1 = tf.Variable(tf.zeros([n_hidden])) model_w_2 = tf.Variable(tf.zeros([n_hidden, 1])) model_b_2 = tf.Variable(tf.zeros([])) # Compute the prediction from the network. with tf.variable_scope("prediction"): pred = tf.matmul( tf.nn.relu(tf.matmul(model_X, model_w_1) + model_b_1), model_w_2 ) + model_b_2 # Likelihood function. with tf.variable_scope("likelihood"): log_l_dist = Normal(pred, tf.reciprocal(tf.sqrt(model_gamma))) log_l = tf.reduce_sum(log_l_dist.log_prob(model_y)) # Priors. with tf.variable_scope("priors"): prior_lambda = Gamma(alpha, beta) prior_gamma = Gamma(alpha, beta) prior_w_1 = Normal( tf.zeros([n_feats, n_hidden]), tf.reciprocal(tf.sqrt(model_lambda)) ) prior_b_1 = Normal( tf.zeros([n_hidden]), tf.reciprocal(tf.sqrt(model_lambda)) ) prior_w_2 = Normal( tf.zeros([n_hidden, 1]), tf.reciprocal(tf.sqrt(model_lambda))
def __init__(self, args, d, logdir):
    """Build the dynamic Bernoulli embedding graph.

    One context matrix ``alpha`` is used by every time slice, while each
    slice ``t`` gets its own target matrix ``rho_t[t]`` (``rho_t[-1]`` is
    the anchor the first slice is tied to).  The log-prior is a Normal on
    ``alpha`` and ``rho_t[-1]`` plus a tighter Normal on the difference
    between consecutive ``rho_t``.  The likelihood is a Bernoulli with
    negative sampling, accumulated over all slices.

    Args:
        args: forwarded unchanged to the parent constructor.
        d: forwarded unchanged to the parent constructor.
        logdir: forwarded unchanged to the parent constructor.

    Reads ``alpha_init``, ``alpha_trainable``, ``rho_init``, ``T``, ``L``,
    ``K``, ``sig``, ``cs``, ``ns``, ``n_minibatch``, ``unigram`` and
    ``n_epochs`` from ``self`` (presumably set by the parent constructor;
    verify there).

    NOTE(review): scope nesting was reconstructed from a source that had
    lost its indentation; confirm it against the intended op names.
    """
    super(dynamic_bern_emb_model, self).__init__(args, d, logdir)

    # Number of context words on each side of the target word.
    half_cs = int(self.cs / 2)

    with tf.name_scope('model'):
        with tf.name_scope('embeddings'):
            self.alpha = tf.Variable(self.alpha_init, name='alpha',
                                     trainable=self.alpha_trainable)
            self.rho_t = {}
            for t in range(-1, self.T):
                self.rho_t[t] = tf.Variable(
                    self.rho_init
                    + 0.001 * tf.random_normal([self.L, self.K]) / self.K,
                    name='rho_' + str(t))

        with tf.name_scope('priors'):
            global_prior = Normal(loc=0.0, scale=self.sig)
            local_prior = Normal(loc=0.0, scale=self.sig / 100.0)

            self.log_prior = tf.reduce_sum(
                global_prior.log_prob(self.alpha))
            # Accumulate (not overwrite): the original assignment here
            # discarded the prior on alpha computed just above.
            self.log_prior += tf.reduce_sum(
                global_prior.log_prob(self.rho_t[-1]))
            for t in range(self.T):
                self.log_prior += tf.reduce_sum(
                    local_prior.log_prob(self.rho_t[t] - self.rho_t[t - 1]))

        with tf.name_scope('likelihood'):
            self.placeholders = {}
            self.y_pos = {}
            self.y_neg = {}
            self.ll_pos = 0.0
            self.ll_neg = 0.0
            for t in range(self.T):
                # Index Masks
                p_mask = tf.range(half_cs, self.n_minibatch[t] + half_cs)
                rows = tf.tile(
                    tf.expand_dims(tf.range(0, half_cs), [0]),
                    [self.n_minibatch[t], 1])
                columns = tf.tile(
                    tf.expand_dims(tf.range(0, self.n_minibatch[t]), [1]),
                    [1, half_cs])
                ctx_mask = tf.concat(
                    [rows + columns, rows + columns + half_cs + 1], 1)

                # Data Placeholder
                self.placeholders[t] = tf.placeholder(
                    tf.int32, shape=(self.n_minibatch[t] + self.cs))

                # Target and Context Indices
                p_idx = tf.gather(self.placeholders[t], p_mask)
                ctx_idx = tf.squeeze(
                    tf.gather(self.placeholders[t], ctx_mask))

                # Negative samples
                unigram_logits = tf.tile(
                    tf.expand_dims(tf.log(tf.constant(self.unigram)), [0]),
                    [self.n_minibatch[t], 1])
                n_idx = tf.multinomial(unigram_logits, self.ns)

                # Context vectors
                ctx_alphas = tf.gather(self.alpha, ctx_idx)

                p_rho = tf.squeeze(tf.gather(self.rho_t[t], p_idx))
                n_rho = tf.gather(self.rho_t[t], n_idx)

                # Natural parameter
                ctx_sum = tf.reduce_sum(ctx_alphas, [1])
                p_eta = tf.expand_dims(
                    tf.reduce_sum(tf.multiply(p_rho, ctx_sum), -1), 1)
                n_eta = tf.reduce_sum(
                    tf.multiply(
                        n_rho,
                        tf.tile(tf.expand_dims(ctx_sum, 1),
                                [1, self.ns, 1])),
                    -1)

                # Conditional likelihood
                self.y_pos[t] = Bernoulli(logits=p_eta)
                self.y_neg[t] = Bernoulli(logits=n_eta)

                self.ll_pos += tf.reduce_sum(self.y_pos[t].log_prob(1.0))
                self.ll_neg += tf.reduce_sum(self.y_neg[t].log_prob(0.0))

        self.loss = -(self.n_epochs * (self.ll_pos + self.ll_neg)
                      + self.log_prior)
# The prior precision is optimised in log space, so alpha = exp(log_alpha)
# stays positive.
model_log_alpha = tf.Variable(tf.zeros([]))
model_alpha = tf.exp(model_log_alpha)

# Priors: zero-mean Normal on the weights with scale 1/sqrt(alpha), and a
# Gamma on alpha itself.
with tf.variable_scope("priors"):
    w_prior_loc = tf.zeros([n_feats, 1])
    w_prior_scale = tf.reciprocal(tf.sqrt(model_alpha))
    w_prior = Normal(w_prior_loc, w_prior_scale)
    alpha_prior = Gamma(1., 0.01)

# Likelihood: negated sum of the sigmoid cross-entropy over the batch.
with tf.variable_scope("likelihood"):
    logits = tf.matmul(model_X, model_w)
    batch_cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=model_y, logits=logits)
    log_l = -tf.reduce_sum(batch_cross_entropy)

# Log-posterior: batch log-likelihood rescaled by n_train / n_batch, plus
# the two log-prior terms.
scaled_log_l = log_l * (n_train / n_batch)
w_log_prior = tf.reduce_sum(w_prior.log_prob(model_w))
alpha_log_prior = alpha_prior.log_prob(model_alpha)
log_p = scaled_log_l + w_log_prior + alpha_log_prior


def evaluate(sampler, data_feed):
    """Return the accuracy of the particle-averaged prediction on the test set.

    ``sampler.function_posterior`` is asked to evaluate ``logits`` with
    ``data_feed``; the result is averaged over axis 0 and thresholded at
    zero, then compared with the flattened module-level ``y_test``.
    """
    particle_logits = sampler.function_posterior(logits, data_feed)
    # The mean logit is thresholded at 0 (the averaged-sigmoid > 0.5
    # variant is not used).
    predicted_positive = particle_logits.mean(axis=0) > 0.
    return np.mean(predicted_positive == y_test.ravel())
def __init__(self, d, K, sig, sess, logdir):
    """Build the Bernoulli embedding graph, its optimizer and its summaries.

    Args:
        d: data object; this constructor reads ``d.cs``, ``d.n_minibatch``,
            ``d.L``, ``d.unigram``, ``d.ns`` and ``d.N`` from it.
        K: number of columns of the ``rho`` and ``alpha`` matrices.
        sig: scale of the zero-mean Normal prior placed on both matrices.
        sess: session used to run the variable initializer; its graph is
            handed to the summary writer.
        logdir: directory passed to ``tf.summary.FileWriter``.

    NOTE(review): scope nesting was reconstructed from a source that had
    lost its indentation; confirm it against the intended op names
    (the projector config expects ``model/embeddings/{alpha,rho}``).
    """
    self.K = K
    self.sig = sig
    self.sess = sess
    self.logdir = logdir

    # Context words on each side of the target.  Must be an int: under
    # Python 3 ``d.cs / 2`` is a float, which is rejected as a tf.tile
    # multiple and cannot be added to the int32 index tensors below.
    half_cs = int(d.cs / 2)

    with tf.name_scope('model'):
        # Data Placeholder
        with tf.name_scope('input'):
            self.placeholders = tf.placeholder(tf.int32)
            self.words = self.placeholders

        # Index Masks
        with tf.name_scope('context_mask'):
            self.p_mask = tf.cast(
                tf.range(half_cs, d.n_minibatch + half_cs), tf.int32)
            rows = tf.cast(
                tf.tile(tf.expand_dims(tf.range(0, half_cs), [0]),
                        [d.n_minibatch, 1]),
                tf.int32)
            columns = tf.cast(
                tf.tile(tf.expand_dims(tf.range(0, d.n_minibatch), [1]),
                        [1, half_cs]),
                tf.int32)
            self.ctx_mask = tf.concat(
                [rows + columns, rows + columns + half_cs + 1], 1)

        with tf.name_scope('embeddings'):
            # Embedding vectors
            self.rho = tf.Variable(
                tf.random_normal([d.L, self.K]) / self.K, name='rho')

            # Context vectors
            self.alpha = tf.Variable(
                tf.random_normal([d.L, self.K]) / self.K, name='alpha')

            with tf.name_scope('priors'):
                prior = Normal(loc=0.0, scale=self.sig)
                self.log_prior = tf.reduce_sum(
                    prior.log_prob(self.rho) + prior.log_prob(self.alpha))

        with tf.name_scope('natural_param'):
            # Target and Context Indices
            with tf.name_scope('target_word'):
                self.p_idx = tf.gather(self.words, self.p_mask)
                self.p_rho = tf.squeeze(tf.gather(self.rho, self.p_idx))

            # Negative samples
            with tf.name_scope('negative_samples'):
                unigram_logits = tf.tile(
                    tf.expand_dims(tf.log(tf.constant(d.unigram)), [0]),
                    [d.n_minibatch, 1])
                self.n_idx = tf.multinomial(unigram_logits, d.ns)
                self.n_rho = tf.gather(self.rho, self.n_idx)

            with tf.name_scope('context'):
                self.ctx_idx = tf.squeeze(
                    tf.gather(self.words, self.ctx_mask))
                self.ctx_alphas = tf.gather(self.alpha, self.ctx_idx)

            # Natural parameter
            ctx_sum = tf.reduce_sum(self.ctx_alphas, [1])
            self.p_eta = tf.expand_dims(
                tf.reduce_sum(tf.multiply(self.p_rho, ctx_sum), -1), 1)
            self.n_eta = tf.reduce_sum(
                tf.multiply(
                    self.n_rho,
                    tf.tile(tf.expand_dims(ctx_sum, 1), [1, d.ns, 1])),
                -1)

        # Conditional likelihood
        self.y_pos = Bernoulli(logits=self.p_eta)
        self.y_neg = Bernoulli(logits=self.n_eta)

        self.ll_pos = tf.reduce_sum(self.y_pos.log_prob(1.0))
        self.ll_neg = tf.reduce_sum(self.y_neg.log_prob(0.0))

        self.log_likelihood = self.ll_pos + self.ll_neg

        # Rescale the minibatch likelihood by d.N / d.n_minibatch before
        # combining it with the prior.
        scale = 1.0 * d.N / d.n_minibatch
        self.loss = -(scale * self.log_likelihood + self.log_prior)

    # Training
    optimizer = tf.train.AdamOptimizer()
    self.train = optimizer.minimize(self.loss)
    with self.sess.as_default():
        tf.global_variables_initializer().run()

    variable_summaries('rho', self.rho)
    variable_summaries('alpha', self.alpha)
    with tf.name_scope('objective'):
        tf.summary.scalar('loss', self.loss)
        tf.summary.scalar('priors', self.log_prior)
        tf.summary.scalar('ll_pos', self.ll_pos)
        tf.summary.scalar('ll_neg', self.ll_neg)
    self.summaries = tf.summary.merge_all()
    self.train_writer = tf.summary.FileWriter(self.logdir, self.sess.graph)
    self.saver = tf.train.Saver()

    # Register both embedding matrices with the embedding projector.
    config = projector.ProjectorConfig()

    alpha = config.embeddings.add()
    alpha.tensor_name = 'model/embeddings/alpha'
    alpha.metadata_path = '../vocab.tsv'

    rho = config.embeddings.add()
    rho.tensor_name = 'model/embeddings/rho'
    rho.metadata_path = '../vocab.tsv'

    projector.visualize_embeddings(self.train_writer, config)
def main(_):
    """Fit the Poisson regression on a balance basis and report hold-out error.

    Steps, in order:
      1. Build ``Options`` from the command-line ``FLAGS``.
      2. Filter, align and sort the training table/metadata, expand the
         metadata with ``dmatrix`` and derive the basis from the tree.
      3. Apply the matching preprocessing to the hold-out table/metadata.
      4. Build the TensorFlow graph (point-estimate variables, Normal priors,
         Poisson likelihood on sampled non-zero and zero entries).
      5. Run ``num_iter`` clipped Adam steps, writing summaries every step,
         a full run trace every 1000 steps and time-based checkpoints.
      6. Evaluate the fitted coefficients with ``cross_validation``.

    Args:
        _: unused positional argument.

    NOTE(review): block nesting (in particular the extent of the
    ``/cpu:0`` device scope) was reconstructed from a source that had lost
    its indentation; confirm before relying on op placement.
    """
    opts = Options(save_path=FLAGS.save_path,
                   train_biom=FLAGS.train_biom,
                   test_biom=FLAGS.test_biom,
                   train_metadata=FLAGS.train_metadata,
                   test_metadata=FLAGS.test_metadata,
                   formula=FLAGS.formula,
                   tree=FLAGS.tree,
                   learning_rate=FLAGS.learning_rate,
                   clipping_size=FLAGS.clipping_size,
                   beta_mean=FLAGS.beta_mean,
                   beta_scale=FLAGS.beta_scale,
                   gamma_mean=FLAGS.gamma_mean,
                   gamma_scale=FLAGS.gamma_scale,
                   epochs_to_train=FLAGS.epochs_to_train,
                   num_neg_samples=FLAGS.num_neg_samples,
                   batch_size=FLAGS.batch_size,
                   min_sample_count=FLAGS.min_sample_count,
                   min_feature_count=FLAGS.min_feature_count,
                   statistics_interval=FLAGS.statistics_interval,
                   summary_interval=FLAGS.summary_interval,
                   checkpoint_interval=FLAGS.checkpoint_interval)

    # preprocessing
    train_table, train_metadata = opts.train_table, opts.train_metadata
    train_metadata = train_metadata.loc[train_table.ids(axis='sample')]

    sample_filter = lambda val, id_, md: (
        (id_ in train_metadata.index) and
        np.sum(val) > opts.min_sample_count)
    read_filter = lambda val, id_, md: np.sum(val) > opts.min_feature_count
    metadata_filter = lambda val, id_, md: id_ in train_metadata.index

    train_table = train_table.filter(metadata_filter, axis='sample')
    train_table = train_table.filter(sample_filter, axis='sample')
    train_table = train_table.filter(read_filter, axis='observation')
    train_metadata = train_metadata.loc[train_table.ids(axis='sample')]
    sort_f = lambda xs: [xs[train_metadata.index.get_loc(x)] for x in xs]
    train_table = train_table.sort(sort_f=sort_f, axis='sample')
    train_metadata = dmatrix(opts.formula, train_metadata,
                             return_type='dataframe')
    tree = opts.tree
    train_table, tree = match_tips(train_table, tree)
    basis, _ = sparse_balance_basis(tree)
    basis = basis.T

    # hold out data preprocessing
    test_table, test_metadata = opts.test_table, opts.test_metadata
    metadata_filter = lambda val, id_, md: id_ in test_metadata.index
    obs_lookup = set(train_table.ids(axis='observation'))
    feat_filter = lambda val, id_, md: id_ in obs_lookup
    test_table = test_table.filter(metadata_filter, axis='sample')
    test_table = test_table.filter(feat_filter, axis='observation')
    test_metadata = test_metadata.loc[test_table.ids(axis='sample')]
    sort_f = lambda xs: [xs[test_metadata.index.get_loc(x)] for x in xs]
    test_table = test_table.sort(sort_f=sort_f, axis='sample')
    test_metadata = dmatrix(opts.formula, test_metadata,
                            return_type='dataframe')
    test_table, tree = match_tips(test_table, tree)

    p = train_metadata.shape[1]   # number of covariates
    G_data = train_metadata.values
    y_data = train_table.matrix_data.tocoo().T
    y_test = np.array(test_table.matrix_data.todense()).T
    N, D = y_data.shape
    save_path = opts.save_path
    learning_rate = opts.learning_rate
    batch_size = opts.batch_size
    gamma_mean, gamma_scale = opts.gamma_mean, opts.gamma_scale
    beta_mean, beta_scale = opts.beta_mean, opts.beta_scale
    num_neg = opts.num_neg_samples
    clipping_size = opts.clipping_size

    # One "epoch" is counted as nnz // batch_size iterations.
    epoch = y_data.nnz // batch_size
    num_iter = int(opts.epochs_to_train * epoch)
    holdout_size = test_metadata.shape[0]
    checkpoint_interval = opts.checkpoint_interval

    # Model code
    with tf.Graph().as_default(), tf.Session() as session:
        with tf.device("/cpu:0"):
            # Place holder variables to accept input data
            Gpos_ph = tf.placeholder(tf.float32, [batch_size, p],
                                     name='G_pos')
            Gneg_ph = tf.placeholder(tf.float32, [num_neg, p],
                                     name='G_neg')
            G_holdout = tf.placeholder(tf.float32, [holdout_size, p],
                                       name='G_holdout')
            Y_holdout = tf.placeholder(tf.float32, [holdout_size, D],
                                       name='Y_holdout')
            Y_ph = tf.placeholder(tf.float32, [batch_size], name='Y_ph')

            pos_row = tf.placeholder(tf.int32, shape=[batch_size],
                                     name='pos_row')
            pos_col = tf.placeholder(tf.int32, shape=[batch_size],
                                     name='pos_col')
            neg_row = tf.placeholder(tf.int32, shape=[num_neg],
                                     name='neg_row')
            neg_col = tf.placeholder(tf.int32, shape=[num_neg],
                                     name='neg_col')

            # Sampled zero entries are scored against a count of zero.
            neg_data = tf.zeros(shape=[num_neg], name='neg_data',
                                dtype=tf.float32)
            total_zero = tf.constant(
                y_data.shape[0] * y_data.shape[1] - y_data.nnz,
                dtype=tf.float32)
            total_nonzero = tf.constant(y_data.nnz, dtype=tf.float32)

            # Define PointMass Variables first
            qgamma = tf.Variable(tf.random_normal([1, D - 1]),
                                 name='qgamma')
            qbeta = tf.Variable(tf.random_normal([p, D - 1]), name='qB')
            theta = tf.Variable(tf.random_normal([N, 1]), name='theta')

            # Distributions
            # species bias
            gamma = Normal(loc=tf.zeros([1, D - 1]) + gamma_mean,
                           scale=tf.ones([1, D - 1]) * gamma_scale,
                           name='gamma')
            # regression coefficents distribution
            beta = Normal(loc=tf.zeros([p, D - 1]) + beta_mean,
                          scale=tf.ones([p, D - 1]) * beta_scale,
                          name='B')

            Bprime = tf.concat([qgamma, qbeta], axis=0)

            # Add bias terms for samples
            Gpos = tf.concat([tf.ones([batch_size, 1]), Gpos_ph], axis=1)
            Gneg = tf.concat([tf.ones([num_neg, 1]), Gneg_ph], axis=1)

            # Convert basis to SparseTensor
            psi = tf.SparseTensor(
                indices=np.mat([basis.row, basis.col]).transpose(),
                values=basis.data,
                dense_shape=basis.shape)

            V = tf.transpose(
                tf.sparse_tensor_dense_matmul(psi, tf.transpose(Bprime)))

            # sparse matrix multiplication for positive samples
            pos_prime = tf.reduce_sum(
                tf.multiply(
                    Gpos, tf.transpose(tf.gather(V, pos_col, axis=1))),
                axis=1)
            pos_phi = tf.reshape(tf.gather(theta, pos_row),
                                 shape=[batch_size]) + pos_prime
            Y = Poisson(log_rate=pos_phi, name='Y')

            # sparse matrix multiplication for negative samples
            neg_prime = tf.reduce_sum(
                tf.multiply(
                    Gneg, tf.transpose(tf.gather(V, neg_col, axis=1))),
                axis=1)
            neg_phi = tf.reshape(tf.gather(theta, neg_row),
                                 shape=[num_neg]) + neg_prime
            neg_poisson = Poisson(log_rate=neg_phi, name='neg_counts')

            # Negative log-joint; each sampled likelihood term is rescaled
            # by (population size / sample size).
            loss = -(
                tf.reduce_sum(gamma.log_prob(qgamma)) +
                tf.reduce_sum(beta.log_prob(qbeta)) +
                tf.reduce_sum(Y.log_prob(Y_ph)) *
                (total_nonzero / batch_size) +
                tf.reduce_sum(neg_poisson.log_prob(neg_data)) *
                (total_zero / num_neg)
            )

            optimizer = tf.train.AdamOptimizer(learning_rate,
                                               beta1=0.9, beta2=0.9)
            gradients, variables = zip(*optimizer.compute_gradients(loss))
            gradients, _ = tf.clip_by_global_norm(gradients, clipping_size)
            train = optimizer.apply_gradients(zip(gradients, variables))

            with tf.name_scope('accuracy'):
                holdout_count = tf.reduce_sum(Y_holdout, axis=1)
                spred = tf.nn.softmax(
                    tf.transpose(
                        tf.sparse_tensor_dense_matmul(
                            psi,
                            tf.transpose(
                                (tf.matmul(G_holdout, qbeta) + qgamma)))))
                pred = tf.reshape(holdout_count, [-1, 1]) * spred
                # Named ``mse`` but computed (and logged) as a mean
                # absolute error.
                mse = tf.reduce_mean(tf.squeeze(tf.abs(pred - Y_holdout)))
                tf.summary.scalar('mean_absolute_error', mse)

            tf.summary.scalar('loss', loss)
            tf.summary.histogram('qbeta', qbeta)
            tf.summary.histogram('qgamma', qgamma)
            tf.summary.histogram('theta', theta)
            merged = tf.summary.merge_all()

            tf.global_variables_initializer().run()

            writer = tf.summary.FileWriter(save_path, session.graph)

            losses = np.array([0.] * num_iter)

            # Keep creating/truncating run.log as before, but do not leak
            # the handle: nothing in this function writes to it.
            with open(os.path.join(save_path, 'run.log'), 'w'):
                pass

            gen = get_batch(batch_size, N, D,
                            y_data.data, y_data.row, y_data.col,
                            num_neg=num_neg)
            last_checkpoint_time = 0
            start_time = time.time()
            saver = tf.train.Saver()

            for i in range(num_iter):
                batch = next(gen)
                (positive_row, positive_col, positive_data,
                 negative_row, negative_col, negative_data) = batch
                feed_dict = {
                    Y_ph: positive_data,
                    Y_holdout: y_test.astype(np.float32),
                    G_holdout: test_metadata.values.astype(np.float32),
                    Gpos_ph: G_data[positive_row, :],
                    Gneg_ph: G_data[negative_row, :],
                    pos_row: positive_row,
                    pos_col: positive_col,
                    neg_row: negative_row,
                    neg_col: negative_col
                }

                # The former ``elif i % 5000 == 0`` branch could never run
                # (every multiple of 5000 is a multiple of 1000) and
                # unpacked its fetches in the wrong order, so it is gone.
                # The merged summary already carries the hold-out error.
                if i % 1000 == 0:
                    run_options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()
                    _, summary, train_loss, grads = session.run(
                        [train, merged, loss, gradients],
                        feed_dict=feed_dict,
                        options=run_options,
                        run_metadata=run_metadata)
                    writer.add_run_metadata(run_metadata, 'step%d' % i)
                else:
                    _, summary, train_loss, grads = session.run(
                        [train, merged, loss, gradients],
                        feed_dict=feed_dict)
                writer.add_summary(summary, i)

                now = time.time()
                if now - last_checkpoint_time > checkpoint_interval:
                    saver.save(session,
                               os.path.join(opts.save_path, "model.ckpt"),
                               global_step=i)
                    last_checkpoint_time = now

                losses[i] = train_loss

            # Flush pending events so the last summaries reach disk.
            writer.close()

            elapsed_time = time.time() - start_time
            print('Elapsed Time: %f seconds' % elapsed_time)

            # Cross validation
            pred_beta = qbeta.eval()
            pred_gamma = qgamma.eval()
            mse, mrc = cross_validation(
                test_metadata.values,
                pred_beta @ basis.T, pred_gamma @ basis.T, y_test)
            print("MSE: %f, MRC: %f" % (mse, mrc))
# Word representation: target and context embeddings side by side.
emb = np.hstack((rho, alpha))
L, K = emb.shape

### Parameters
relevance_logits = tf.Variable(np.random.randn(L).astype('float32'))
relevance = tf.nn.sigmoid(relevance_logits)
print('NOT USING RELEVANCE')

# Both weight matrices start from a symmetric uniform draw; the half-width
# is recomputed for each layer.
trunc = np.sqrt(6)/np.sqrt(K + H0)
w_1_init = np.random.uniform(-trunc, trunc, [K, H0])
w_1 = tf.Variable(w_1_init.astype('float32'))

trunc = np.sqrt(6)/np.sqrt(H0)
w_2_init = np.random.uniform(-trunc, trunc, [H0, 1])
w_2 = tf.Variable(w_2_init.astype('float32'))

### prior on w
prior = Normal(loc=0.0, scale=lam)
w_1_log_prior = tf.reduce_sum(prior.log_prob(w_1))
w_2_log_prior = tf.reduce_sum(prior.log_prob(w_2))
log_prior = w_1_log_prior + w_2_log_prior


### placeholders for data minibatches
def extract_features(text):
    """Turn an array of word indices into one feature vector.

    Returns the mean of the ``emb`` rows selected by ``text``, or a zero
    vector of length ``K`` when ``text`` is empty.
    """
    if len(text) > 0:
        return np.mean(emb[text], axis=0)
    return np.zeros((K))


def next_batch(file_list):
    """Load a random minibatch of documents as an ``(mb, K)`` feature matrix.

    At most ``mb`` entries of ``file_list`` are chosen from a random
    permutation; rows that receive no document stay zero.
    """
    chosen = np.random.permutation(len(file_list))[:mb]
    features = np.zeros((mb, K))
    for row, file_idx in enumerate(chosen):
        document = np.load(file_list[file_idx])
        features[row] = extract_features(document)
    return features
def __init__(self, args, d, logdir):
    """Build the amortized Bernoulli embedding graph.

    A single pair of matrices ``rho``/``alpha`` is kept, and the target
    embeddings of state ``t`` are produced by
    ``neural_network(rho, phi, K, t, H0, resnet)``.  The log-prior is a
    Normal on ``rho``, ``phi`` (and ``alpha`` when it is trainable) plus a
    tighter Normal on the difference between ``rho`` and each state's
    network output.  The likelihood is a Bernoulli with negative sampling,
    accumulated over states.

    Args:
        args: forwarded unchanged to the parent constructor.
        d: data object; ``d.states``, ``d.T`` and ``d.unigram`` are read
            here, and it is also forwarded to the parent constructor.
        logdir: forwarded unchanged to the parent constructor.

    NOTE(review): scope nesting was reconstructed from a source that had
    lost its indentation; confirm it against the intended op names.
    """
    super(amortized_bern_emb_model, self).__init__(args, d, logdir)

    # Context words on each side of the target.  Must be an int: under
    # Python 3 ``self.cs / 2`` is a float, which yields float index
    # tensors and is rejected as a tf.tile multiple.
    half_cs = int(self.cs / 2)

    with tf.name_scope('model'):
        with tf.name_scope('embeddings'):
            self.alpha = tf.Variable(self.alpha_init, name='alpha',
                                     trainable=self.alpha_trainable)
            self.rho = tf.Variable(self.rho_init, name='rho',
                                   trainable=self.rho_trainable)

            trunc = np.sqrt(6) / np.sqrt(self.K + self.H0)
            phi_init = np.random.uniform(
                -trunc, trunc,
                [self.n_states, 2 * self.K * self.H0]).astype('float32')
            self.phi = tf.Variable(phi_init, name='phi')

            # Non-trainable per-state copies, filled via ``assign_ops``.
            self.geo_rho = {}
            for t, state in enumerate(d.states):
                self.geo_rho[state] = tf.Variable(
                    tf.random_normal(self.rho_init.shape),
                    trainable=False, name=state + '_rho')

        with tf.name_scope('priors'):
            prior = Normal(loc=0.0, scale=self.sig)
            # Each parameter block is reduced to a scalar before adding.
            # The original nested the alpha/phi sums inside the reduce_sum
            # over rho, which broadcast those scalars onto every element of
            # rho's log-prob and over-counted them.
            self.log_prior = (tf.reduce_sum(prior.log_prob(self.rho))
                              + tf.reduce_sum(prior.log_prob(self.phi)))
            if self.alpha_trainable:
                self.log_prior += tf.reduce_sum(
                    prior.log_prob(self.alpha))

            local_prior = Normal(loc=0.0, scale=self.sig / 100.0)
            for t, state in enumerate(d.states):
                self.log_prior += tf.reduce_sum(
                    local_prior.log_prob(
                        self.rho - neural_network(
                            self.rho, self.phi, self.K, t,
                            self.H0, self.resnet)))

        self.assign_ops = d.T * [0]
        for t, state in enumerate(d.states):
            self.assign_ops[t] = self.geo_rho[state].assign(
                neural_network(self.rho, self.phi, self.K, t,
                               self.H0, self.resnet))

        with tf.name_scope('likelihood'):
            self.placeholders = {}
            self.y_pos = {}
            self.y_neg = {}
            self.ll_pos = 0.0
            self.ll_neg = 0.0
            for t, state in enumerate(self.states):
                # Index Masks
                p_mask = tf.range(half_cs, self.n_minibatch[t] + half_cs)
                rows = tf.tile(
                    tf.expand_dims(tf.range(0, half_cs), [0]),
                    [self.n_minibatch[t], 1])
                columns = tf.tile(
                    tf.expand_dims(tf.range(0, self.n_minibatch[t]), [1]),
                    [1, half_cs])
                ctx_mask = tf.concat(
                    [rows + columns, rows + columns + half_cs + 1], 1)

                # Data Placeholder
                self.placeholders[state] = tf.placeholder(
                    tf.int32, shape=(self.n_minibatch[t] + self.cs))

                # Target and Context Indices
                p_idx = tf.gather(self.placeholders[state], p_mask)
                ctx_idx = tf.squeeze(
                    tf.gather(self.placeholders[state], ctx_mask))

                # Negative samples
                unigram_logits = tf.tile(
                    tf.expand_dims(tf.log(tf.constant(d.unigram)), [0]),
                    [self.n_minibatch[t], 1])
                n_idx = tf.multinomial(unigram_logits, self.ns)

                # Context vectors
                ctx_alphas = tf.gather(self.alpha, ctx_idx)

                rho_state = neural_network(self.rho, self.phi, self.K, t,
                                           self.H0, self.resnet)
                p_rho = tf.squeeze(tf.gather(rho_state, p_idx))
                n_rho = tf.gather(rho_state, n_idx)

                # Natural parameter
                ctx_sum = tf.reduce_sum(ctx_alphas, [1])
                p_eta = tf.expand_dims(
                    tf.reduce_sum(tf.multiply(p_rho, ctx_sum), -1), 1)
                n_eta = tf.reduce_sum(
                    tf.multiply(
                        n_rho,
                        tf.tile(tf.expand_dims(ctx_sum, 1),
                                [1, self.ns, 1])),
                    -1)

                # Conditional likelihood
                self.y_pos[state] = Bernoulli(logits=p_eta)
                self.y_neg[state] = Bernoulli(logits=n_eta)

                self.ll_pos += tf.reduce_sum(
                    self.y_pos[state].log_prob(1.0))
                self.ll_neg += tf.reduce_sum(
                    self.y_neg[state].log_prob(0.0))

        self.loss = -(self.n_epochs * (self.ll_pos + self.ll_neg)
                      + self.log_prior)

        self.init_eval_model()
def __call__(self, session, trainX, trainY, testX, testY):
    """ Initialize the actual graph

    Builds a multinomial regression with a Normal prior on the
    coefficients, a clipped-gradient Adam training op, a hold-out mean
    absolute error and the summary ops, then runs the variable
    initializer.

    Parameters
    ----------
    session : tf.Session
        Tensorflow session
    trainX : np.array
        Input training design matrix.
    trainY : np.array
        Output training OTU table, where rows are samples and
        columns are observations.
    testX : np.array
        Input testing design matrix.
    testY : np.array
        Output testing OTU table, where rows are samples and
        columns are observations.

    Notes
    -----
    The initializer is run with ``.run()``, which relies on a default
    session being active; presumably the caller enters ``session`` as a
    context manager first (verify at the call site).
    """
    self.session = session
    self.N, self.p = trainX.shape
    self.D = trainY.shape[1]
    holdout_size = testX.shape[0]

    # The data is baked into the graph as constants (the names keep their
    # historical ``_ph`` suffix).
    self.X_ph = tf.constant(trainX, dtype=tf.float32, name='G_ph')
    self.Y_ph = tf.constant(trainY, dtype=tf.float32, name='Y_ph')
    self.X_holdout = tf.constant(testX, dtype=tf.float32,
                                 name='G_holdout')
    self.Y_holdout = tf.constant(testY, dtype=tf.float32,
                                 name='Y_holdout')

    # Equal logits: every training row is equally likely to be drawn.
    batch_ids = tf.multinomial(tf.ones([1, self.N]), self.batch_size)
    # Drop only the leading axis so that a batch of one stays 1-D.
    sample_ids = tf.squeeze(batch_ids, axis=0)

    Y_batch = tf.gather(self.Y_ph, sample_ids, axis=0)
    X_batch = tf.gather(self.X_ph, sample_ids, axis=0)

    total_count = tf.reduce_sum(Y_batch, axis=1)
    holdout_count = tf.reduce_sum(self.Y_holdout, axis=1)

    # Define PointMass Variables first
    self.qbeta = tf.Variable(tf.random_normal([self.p, self.D - 1]),
                             name='qB')

    # regression coefficents distribution
    beta = Normal(loc=tf.zeros([self.p, self.D - 1]) + self.beta_mean,
                  scale=tf.ones([self.p, self.D - 1]) * self.beta_scale,
                  name='B')

    eta = tf.matmul(X_batch, self.qbeta, name='eta')
    # A zero column is prepended so the first category is the reference.
    phi = tf.nn.log_softmax(
        tf.concat([tf.zeros([self.batch_size, 1]), eta], axis=1),
        name='phi')
    Y = Multinomial(total_count=total_count, logits=phi, name='Y')

    # cross validation
    with tf.name_scope('accuracy'):
        pred = tf.reshape(holdout_count, [-1, 1]) * tf.nn.softmax(
            tf.concat([
                tf.zeros([holdout_size, 1]),
                tf.matmul(self.X_holdout, self.qbeta)
            ], axis=1), name='phi')
        self.cv = tf.reduce_mean(
            tf.squeeze(tf.abs(pred - self.Y_holdout)))
        tf.summary.scalar('mean_absolute_error', self.cv)

    # Negative log-joint; the batch likelihood is rescaled by
    # N / batch_size.
    self.loss = -(tf.reduce_sum(beta.log_prob(self.qbeta)) +
                  tf.reduce_sum(Y.log_prob(Y_batch)) *
                  (self.N / self.batch_size))

    optimizer = tf.train.AdamOptimizer(self.learning_rate,
                                       beta1=self.beta_1,
                                       beta2=self.beta_2)
    gradients, variables = zip(*optimizer.compute_gradients(self.loss))
    self.gradients, _ = tf.clip_by_global_norm(gradients, self.clipnorm)
    # Apply the *clipped* gradients; the original passed the raw ones, so
    # ``clipnorm`` had no effect on training.
    self.train = optimizer.apply_gradients(
        zip(self.gradients, variables))

    tf.summary.scalar('loss', self.loss)
    tf.summary.histogram('qbeta', self.qbeta)
    self.merged = tf.summary.merge_all()
    if self.save_path is not None:
        self.writer = tf.summary.FileWriter(self.save_path,
                                            self.session.graph)
    else:
        self.writer = None
    tf.global_variables_initializer().run()
def F(x):
    """Objective curve: ``x**2 - 2*x + 1``."""
    squared = x**2
    doubled = 2 * x
    return squared - doubled + 1


def get_fitness(value):
    """Fitness is the negated objective value."""
    fitness = -value
    return fitness


# Search distribution: a Normal whose mean and scale are both variables.
mean = tf.Variable(tf.constant(-30.), dtype=tf.float32)
sigma = tf.Variable(tf.constant(1.), dtype=tf.float32)
N_dist = Normal(loc=mean, scale=sigma)

# Op that draws one population of POP_SIZE candidates.
make_kids = N_dist.sample([POP_SIZE])

# Placeholders for the candidates and their fitness.
tfkids = tf.placeholder(tf.float32, [POP_SIZE, DNA_SIZE])
tfkids_fit = tf.placeholder(tf.float32, [POP_SIZE])

# Loss: negated mean of log-probability times fitness.
fitness_weighted_log_prob = N_dist.log_prob(tfkids) * tfkids_fit
loss = -tf.reduce_mean(fitness_weighted_log_prob)
optimizer = tf.train.GradientDescentOptimizer(LR)
train_op = optimizer.minimize(loss)

# Plot the objective once as a backdrop.
x = np.linspace(-70, 70, 100)
plt.plot(x, F(x))
plt.xlim(-70, 70)
plt.ylim(-100, 1000)

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

plt.ion()
for g in range(N_GENERATION):
    # Sample a population and score it.
    kids = sess.run(make_kids)
    kids_fit = get_fitness(F(kids))
def __init__(self, args, d, logdir):
    """Build one independent Bernoulli embedding model per state.

    Every state in ``d.states`` gets its own ``alpha`` and ``rho``
    variables (initialised from ``alpha_init``/``rho_init`` plus small
    noise), a zero-mean Normal prior on both, and a Bernoulli likelihood
    with negative sampling.  Priors and likelihoods are summed over
    states into a single loss.

    Args:
        args: forwarded unchanged to the parent constructor.
        d: data object; ``d.states``, ``d.L`` and ``d.unigram`` are read
            here, and it is also forwarded to the parent constructor.
        logdir: forwarded unchanged to the parent constructor.

    NOTE(review): scope nesting was reconstructed from a source that had
    lost its indentation; confirm it against the intended op names.
    """
    super(completely_separate_bern_emb_model, self).__init__(
        args, d, logdir)

    # Context words on each side of the target.  Must be an int: under
    # Python 3 ``self.cs/2`` is a float, which yields float index tensors
    # and is rejected as a tf.tile multiple.
    half_cs = int(self.cs / 2)

    with tf.name_scope('model'):
        with tf.name_scope('embeddings'):
            self.geo_alpha = {}
            self.geo_rho = {}
            for t, state in enumerate(d.states):
                self.geo_alpha[state] = tf.Variable(
                    self.alpha_init
                    + 0.001 * tf.random_normal([d.L, self.K]) / self.K,
                    name=state + '_alpha')
                self.geo_rho[state] = tf.Variable(
                    self.rho_init
                    + 0.001 * tf.random_normal([d.L, self.K]) / self.K,
                    name=state + '_rho')

        with tf.name_scope('priors'):
            prior = Normal(loc=0.0, scale=self.sig)
            self.log_prior = 0.0
            for state in d.states:
                self.log_prior += tf.reduce_sum(
                    prior.log_prob(self.geo_rho[state]))
                self.log_prior += tf.reduce_sum(
                    prior.log_prob(self.geo_alpha[state]))

        with tf.name_scope('likelihood'):
            self.placeholders = {}
            self.y_pos = {}
            self.y_neg = {}
            self.ll_pos = 0.0
            self.ll_neg = 0.0
            for t, state in enumerate(self.states):
                # Index Masks
                p_mask = tf.range(half_cs, self.n_minibatch[t] + half_cs)
                rows = tf.tile(
                    tf.expand_dims(tf.range(0, half_cs), [0]),
                    [self.n_minibatch[t], 1])
                columns = tf.tile(
                    tf.expand_dims(tf.range(0, self.n_minibatch[t]), [1]),
                    [1, half_cs])
                ctx_mask = tf.concat(
                    [rows + columns, rows + columns + half_cs + 1], 1)

                # Data Placeholder
                self.placeholders[state] = tf.placeholder(
                    tf.int32, shape=(self.n_minibatch[t] + self.cs))

                # Target and Context Indices
                p_idx = tf.gather(self.placeholders[state], p_mask)
                ctx_idx = tf.squeeze(
                    tf.gather(self.placeholders[state], ctx_mask))

                # Negative samples
                unigram_logits = tf.tile(
                    tf.expand_dims(tf.log(tf.constant(d.unigram)), [0]),
                    [self.n_minibatch[t], 1])
                n_idx = tf.multinomial(unigram_logits, self.ns)

                # Context vectors
                ctx_alphas = tf.gather(self.geo_alpha[state], ctx_idx)

                p_rho = tf.squeeze(tf.gather(self.geo_rho[state], p_idx))
                n_rho = tf.gather(self.geo_rho[state], n_idx)

                # Natural parameter
                ctx_sum = tf.reduce_sum(ctx_alphas, [1])
                p_eta = tf.expand_dims(
                    tf.reduce_sum(tf.multiply(p_rho, ctx_sum), -1), 1)
                n_eta = tf.reduce_sum(
                    tf.multiply(
                        n_rho,
                        tf.tile(tf.expand_dims(ctx_sum, 1),
                                [1, self.ns, 1])),
                    -1)

                # Conditional likelihood
                self.y_pos[state] = Bernoulli(logits=p_eta)
                self.y_neg[state] = Bernoulli(logits=n_eta)

                self.ll_pos += tf.reduce_sum(
                    self.y_pos[state].log_prob(1.0))
                self.ll_neg += tf.reduce_sum(
                    self.y_neg[state].log_prob(0.0))

        self.loss = - (self.n_epochs * (self.ll_pos + self.ll_neg)
                       + self.log_prior)

        self.init_eval_model()
# Mean squared error between the targets and the predicted mean
# (use square error for cost function).
squared_residual = tf.square(y - y_mu)
RSEcost = tf.reduce_mean(squared_residual)

# Alternative cost, kept disabled: negative log-likelihood
# (same as maximum-likelihood).
# y_sigma = tf.sqrt(tfmixedmodel(Xtf, tf.square(std_encoder1), Ztf, tf.square(std_encoder2)))
# NLLcost = - tf.reduce_sum(-0.5 * tf.log(2. * np.pi) - tf.log(y_sigma)
#                           -0.5 * tf.square((y - y_mu)/y_sigma))

# Mean-field variational inference using ELBO: one (log p, log q) pair is
# accumulated per Monte Carlo draw.
p_log_prob = []
q_log_prob = []
for s in range(n_samples):
    draw_log_p = 0.0
    draw_log_q = 0.0

    # Draw the fixed-effect coefficients.  The draw comes from the local
    # copy, while its density is evaluated under ``beta_tf``.
    beta_tf_copy = Normal(loc=beta_mu, scale=std_encoder1)
    beta_sample = beta_tf_copy.sample()
    draw_log_q += tf.reduce_sum(beta_tf.log_prob(beta_sample))

    # Same for the second coefficient block, scored under ``b_tf``.
    b_tf_copy = Normal(loc=b_mu, scale=std_encoder2)
    b_sample = b_tf_copy.sample()
    draw_log_q += tf.reduce_sum(b_tf.log_prob(b_sample))

    # Prior on both coefficient draws.
    priormodel = Normal(loc=priormu, scale=priorsigma)
    y_sample = tf.matmul(Xtf, beta_sample) + tf.matmul(Ztf, b_sample)
    draw_log_p += tf.reduce_sum(priormodel.log_prob(beta_sample))
    draw_log_p += tf.reduce_sum(priormodel.log_prob(b_sample))

    # Likelihood of the observations around the sampled prediction.
    modelcopy = Normal(loc=y_sample, scale=priorliksigma)
    draw_log_p += tf.reduce_sum(modelcopy.log_prob(y))

    p_log_prob.append(draw_log_p)
    q_log_prob.append(draw_log_q)

p_log_prob = tf.stack(p_log_prob)
q_log_prob = tf.stack(q_log_prob)

# Note the sign: this is the negated average of (log p - log q).
ELBO = -tf.reduce_mean(p_log_prob - q_log_prob)
def main(_):
    """Fit the multinomial regression and report hold-out error.

    Steps, in order:
      1. Build ``Options`` from the command-line ``FLAGS``.
      2. Filter, align and sort the training table/metadata and expand the
         metadata with ``dmatrix``.
      3. Apply the matching preprocessing to the hold-out table/metadata.
      4. Build the TensorFlow graph (point-estimate variables, Normal
         priors, Multinomial likelihood on a random batch of samples).
      5. Run ``num_iter`` clipped Adam steps, writing summaries every step,
         a full run trace every 1000 steps and time-based checkpoints.
      6. Evaluate the fitted coefficients with ``cross_validation``.

    Args:
        _: unused positional argument.

    NOTE(review): block nesting (in particular the extent of the
    ``/cpu:0`` device scope) was reconstructed from a source that had lost
    its indentation; confirm before relying on op placement.
    """
    opts = Options(save_path=FLAGS.save_path,
                   train_biom=FLAGS.train_biom,
                   test_biom=FLAGS.test_biom,
                   train_metadata=FLAGS.train_metadata,
                   test_metadata=FLAGS.test_metadata,
                   formula=FLAGS.formula,
                   learning_rate=FLAGS.learning_rate,
                   clipping_size=FLAGS.clipping_size,
                   beta_mean=FLAGS.beta_mean,
                   beta_scale=FLAGS.beta_scale,
                   gamma_mean=FLAGS.gamma_mean,
                   gamma_scale=FLAGS.gamma_scale,
                   epochs_to_train=FLAGS.epochs_to_train,
                   num_neg_samples=FLAGS.num_neg_samples,
                   batch_size=FLAGS.batch_size,
                   min_sample_count=FLAGS.min_sample_count,
                   min_feature_count=FLAGS.min_feature_count,
                   statistics_interval=FLAGS.statistics_interval,
                   summary_interval=FLAGS.summary_interval,
                   checkpoint_interval=FLAGS.checkpoint_interval)

    # preprocessing
    train_table, train_metadata = opts.train_table, opts.train_metadata
    train_metadata = train_metadata.loc[train_table.ids(axis='sample')]

    sample_filter = lambda val, id_, md: (
        (id_ in train_metadata.index) and
        np.sum(val) > opts.min_sample_count)
    read_filter = lambda val, id_, md: np.sum(val) > opts.min_feature_count
    metadata_filter = lambda val, id_, md: id_ in train_metadata.index

    train_table = train_table.filter(metadata_filter, axis='sample')
    train_table = train_table.filter(sample_filter, axis='sample')
    train_table = train_table.filter(read_filter, axis='observation')
    train_metadata = train_metadata.loc[train_table.ids(axis='sample')]
    sort_f = lambda xs: [xs[train_metadata.index.get_loc(x)] for x in xs]
    train_table = train_table.sort(sort_f=sort_f, axis='sample')
    train_metadata = dmatrix(opts.formula, train_metadata,
                             return_type='dataframe')

    # hold out data preprocessing
    test_table, test_metadata = opts.test_table, opts.test_metadata
    metadata_filter = lambda val, id_, md: id_ in test_metadata.index
    obs_lookup = set(train_table.ids(axis='observation'))
    feat_filter = lambda val, id_, md: id_ in obs_lookup
    test_table = test_table.filter(metadata_filter, axis='sample')
    test_table = test_table.filter(feat_filter, axis='observation')
    test_metadata = test_metadata.loc[test_table.ids(axis='sample')]
    sort_f = lambda xs: [xs[test_metadata.index.get_loc(x)] for x in xs]
    test_table = test_table.sort(sort_f=sort_f, axis='sample')
    test_metadata = dmatrix(opts.formula, test_metadata,
                            return_type='dataframe')

    p = train_metadata.shape[1]   # number of covariates
    G_data = train_metadata.values
    y_data = np.array(train_table.matrix_data.todense()).T
    y_test = np.array(test_table.matrix_data.todense()).T
    N, D = y_data.shape
    save_path = opts.save_path
    learning_rate = opts.learning_rate
    batch_size = opts.batch_size
    gamma_mean, gamma_scale = opts.gamma_mean, opts.gamma_scale
    beta_mean, beta_scale = opts.beta_mean, opts.beta_scale

    # Forced to int because it sizes ``losses`` and drives ``range``.
    num_iter = int((N // batch_size) * opts.epochs_to_train)
    holdout_size = test_metadata.shape[0]
    checkpoint_interval = opts.checkpoint_interval

    # Model code
    with tf.Graph().as_default(), tf.Session() as session:
        with tf.device("/cpu:0"):
            # Place holder variables to accept input data
            G_ph = tf.placeholder(tf.float32, [batch_size, p], name='G_ph')
            Y_ph = tf.placeholder(tf.float32, [batch_size, D], name='Y_ph')
            G_holdout = tf.placeholder(tf.float32, [holdout_size, p],
                                       name='G_holdout')
            Y_holdout = tf.placeholder(tf.float32, [holdout_size, D],
                                       name='Y_holdout')
            total_count = tf.placeholder(tf.float32, [batch_size],
                                         name='total_count')

            # Define PointMass Variables first
            qgamma = tf.Variable(tf.random_normal([1, D]), name='qgamma')
            qbeta = tf.Variable(tf.random_normal([p, D]), name='qB')

            # Distributions
            # species bias
            gamma = Normal(loc=tf.zeros([1, D]) + gamma_mean,
                           scale=tf.ones([1, D]) * gamma_scale,
                           name='gamma')
            # regression coefficents distribution
            beta = Normal(loc=tf.zeros([p, D]) + beta_mean,
                          scale=tf.ones([p, D]) * beta_scale,
                          name='B')

            Bprime = tf.concat([qgamma, qbeta], axis=0)

            # add bias terms for samples
            Gprime = tf.concat([tf.ones([batch_size, 1]), G_ph], axis=1)

            eta = tf.matmul(Gprime, Bprime)
            phi = tf.nn.log_softmax(eta)
            Y = Multinomial(total_count=total_count, logits=phi, name='Y')

            # Negative of (mean prior log-probs + rescaled mean batch
            # log-likelihood).
            loss = -(tf.reduce_mean(gamma.log_prob(qgamma)) +
                     tf.reduce_mean(beta.log_prob(qbeta)) +
                     tf.reduce_mean(Y.log_prob(Y_ph)) * (N / batch_size))
            # Prints the loss each time it is evaluated.
            loss = tf.Print(loss, [loss])

            optimizer = tf.train.AdamOptimizer(learning_rate)
            gradients, variables = zip(*optimizer.compute_gradients(loss))
            gradients, _ = tf.clip_by_global_norm(gradients,
                                                  opts.clipping_size)
            train = optimizer.apply_gradients(zip(gradients, variables))

            with tf.name_scope('accuracy'):
                holdout_count = tf.reduce_sum(Y_holdout, axis=1)
                pred = tf.reshape(holdout_count, [-1, 1]) * tf.nn.softmax(
                    tf.matmul(G_holdout, qbeta) + qgamma)
                # Named ``mse`` but computed (and logged) as a mean
                # absolute error.
                mse = tf.reduce_mean(tf.squeeze(tf.abs(pred - Y_holdout)))
                tf.summary.scalar('mean_absolute_error', mse)

            tf.summary.scalar('loss', loss)
            tf.summary.histogram('qbeta', qbeta)
            tf.summary.histogram('qgamma', qgamma)
            merged = tf.summary.merge_all()

            tf.global_variables_initializer().run()

            writer = tf.summary.FileWriter(save_path, session.graph)

            losses = np.array([0.] * num_iter)
            idx = np.arange(train_metadata.shape[0])

            # Keep creating/truncating run.log as before, but do not leak
            # the handle: nothing in this function writes to it.
            with open(os.path.join(save_path, 'run.log'), 'w'):
                pass

            last_checkpoint_time = 0
            start_time = time.time()
            saver = tf.train.Saver()

            for i in range(num_iter):
                # Draw a batch of training rows (with replacement).
                batch_idx = np.random.choice(idx, size=batch_size)
                feed_dict = {
                    Y_ph: y_data[batch_idx].astype(np.float32),
                    G_ph: train_metadata.values[batch_idx].astype(
                        np.float32),
                    Y_holdout: y_test.astype(np.float32),
                    G_holdout: test_metadata.values.astype(np.float32),
                    total_count: y_data[batch_idx].sum(axis=1).astype(
                        np.float32)
                }

                # The former ``elif i % 5000 == 0`` branch could never run
                # (every multiple of 5000 is a multiple of 1000) and
                # unpacked its fetches in the wrong order, so it is gone.
                # The merged summary already carries the hold-out error.
                if i % 1000 == 0:
                    run_options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()
                    _, summary, train_loss, grads = session.run(
                        [train, merged, loss, gradients],
                        feed_dict=feed_dict,
                        options=run_options,
                        run_metadata=run_metadata)
                    writer.add_run_metadata(run_metadata, 'step%d' % i)
                else:
                    _, summary, train_loss, grads = session.run(
                        [train, merged, loss, gradients],
                        feed_dict=feed_dict)
                writer.add_summary(summary, i)

                now = time.time()
                if now - last_checkpoint_time > checkpoint_interval:
                    saver.save(session,
                               os.path.join(opts.save_path, "model.ckpt"),
                               global_step=i)
                    last_checkpoint_time = now

                losses[i] = train_loss

            # Flush pending events so the last summaries reach disk.
            writer.close()

            elapsed_time = time.time() - start_time
            print('Elapsed Time: %f seconds' % elapsed_time)

            # Cross validation
            pred_beta = qbeta.eval()
            pred_gamma = qgamma.eval()
            mse, mrc = cross_validation(
                test_metadata.values, pred_beta, pred_gamma, y_test)
            print("MSE: %f, MRC: %f" % (mse, mrc))
def __call__(self, session, trainX, trainY, testX, testY):
    """ Build the model graph, the cross validation metric and the optimizer.

    Parameters
    ----------
    session : tf.Session
        Tensorflow session
    trainX : sparse array in coo format
        Training input OTU table, where rows are samples and columns are
        observations
    trainY : np.array
        Training output metabolite table
    testX : sparse array in coo format
        Test input OTU table, where rows are samples and columns are
        observations.  This is mainly for cross validation.
    testY : np.array
        Test output metabolite table.  This is mainly for cross validation.
    """
    self.session = session
    self.nnz = len(trainX.data)
    self.d1 = trainX.shape[1]
    self.d2 = trainY.shape[1]
    self.cv_size = len(testX.data)

    # The first output logit is pinned to zero further down, so only
    # d2 - 1 output columns are parameterised.
    n_free = self.d2 - 1

    def _gaussian_prior(shape, mean, scale, name):
        # Normal with constant mean / scale broadcast to `shape`.
        return Normal(loc=tf.zeros(shape) + mean,
                      scale=tf.ones(shape) * scale,
                      name=name)

    def _total_log_prob(dist, value, name):
        # Sum of the elementwise log densities of `value` under `dist`.
        return tf.reduce_sum(dist.log_prob(value), name=name)

    # keep the multinomial sampling on the cpu
    # https://github.com/tensorflow/tensorflow/issues/18058
    with tf.device('/cpu:0'):
        train_coords = np.array([trainX.row, trainX.col]).T
        train_sparse = tf.SparseTensor(indices=train_coords,
                                       values=trainX.data,
                                       dense_shape=trainX.shape)
        train_targets = tf.constant(trainY, dtype=tf.float32)

        holdout_coords = np.array([testX.row, testX.col]).T
        holdout_sparse = tf.SparseTensor(indices=holdout_coords,
                                         values=testX.data,
                                         dense_shape=testX.shape)
        holdout_targets = tf.constant(testY, dtype=tf.float32)

        total_count = tf.reduce_sum(train_targets, axis=1)

        # Draw batch_size stored entries of trainX, using the log of the
        # stored values as sampling logits.
        entry_logits = tf.log(tf.reshape(train_sparse.values, [1, -1]))
        batch_ids = tf.squeeze(
            tf.multinomial(entry_logits, self.batch_size))

        entry_rows = tf.gather(train_sparse.indices, 0, axis=1)
        entry_cols = tf.gather(train_sparse.indices, 1, axis=1)
        sample_ids = tf.gather(entry_rows, batch_ids)

        target_batch = tf.gather(train_targets, sample_ids)
        feature_batch = tf.gather(entry_cols, batch_ids)

    with tf.device(self.device_name):
        self.qUmain = tf.Variable(tf.random_normal([self.d1, self.p]),
                                  name='qU')
        self.qUbias = tf.Variable(tf.random_normal([self.d1, 1]),
                                  name='qUbias')
        self.qVmain = tf.Variable(tf.random_normal([self.p, n_free]),
                                  name='qV')
        self.qVbias = tf.Variable(tf.random_normal([1, n_free]),
                                  name='qVbias')

        # Augment U with a column of ones and V with a row of ones so that
        # the product qU @ qV contains both bias terms.
        u_blocks = [tf.ones([self.d1, 1]), self.qUbias, self.qUmain]
        qU = tf.concat(u_blocks, axis=1)
        v_blocks = [self.qVbias, tf.ones([1, n_free]), self.qVmain]
        qV = tf.concat(v_blocks, axis=0)

        # regression coefficents distribution
        Umain = _gaussian_prior([self.d1, self.p],
                                self.u_mean, self.u_scale, 'U')
        Ubias = _gaussian_prior([self.d1, 1],
                                self.u_mean, self.u_scale, 'biasU')
        Vmain = _gaussian_prior([self.p, n_free],
                                self.v_mean, self.v_scale, 'V')
        Vbias = _gaussian_prior([1, n_free],
                                self.v_mean, self.v_scale, 'biasV')

        du = tf.gather(qU, feature_batch, axis=0, name='du')
        dv = tf.concat([tf.zeros([self.batch_size, 1]), du @ qV],
                       axis=1, name='dv')

        tc = tf.gather(total_count, sample_ids)
        Y = Multinomial(total_count=tc, logits=dv, name='Y')

        # Rescales the minibatch likelihood term of the loss.
        norm = trainX.shape[0] / self.batch_size

        logprob_vmain = _total_log_prob(Vmain, self.qVmain,
                                        'logprob_vmain')
        logprob_vbias = _total_log_prob(Vbias, self.qVbias,
                                        'logprob_vbias')
        logprob_umain = _total_log_prob(Umain, self.qUmain,
                                        'logprob_umain')
        logprob_ubias = _total_log_prob(Ubias, self.qUbias,
                                        'logprob_ubias')
        logprob_y = _total_log_prob(Y, target_batch, 'logprob_y')

        self.log_loss = -(logprob_y * norm +
                          logprob_umain + logprob_ubias +
                          logprob_vmain + logprob_vbias)

    # keep the multinomial sampling on the cpu
    # https://github.com/tensorflow/tensorflow/issues/18058
    with tf.device('/cpu:0'):
        # cross validation
        with tf.name_scope('accuracy'):
            cv_logits = tf.log(tf.reshape(holdout_sparse.values, [1, -1]))
            cv_batch_ids = tf.squeeze(
                tf.multinomial(cv_logits, self.cv_size))

            cv_rows = tf.gather(holdout_sparse.indices, 0, axis=1)
            cv_cols = tf.gather(holdout_sparse.indices, 1, axis=1)
            cv_sample_ids = tf.gather(cv_rows, cv_batch_ids)

            cv_targets = tf.gather(holdout_targets, cv_sample_ids)
            cv_features = tf.gather(cv_cols, cv_batch_ids)

            holdout_count = tf.reduce_sum(cv_targets, axis=1)
            cv_du = tf.gather(qU, cv_features, axis=0, name='cv_du')

            # Predicted counts: row totals times softmax proportions.
            count_column = tf.reshape(holdout_count, [-1, 1])
            proportions = tf.nn.softmax(
                tf.concat([tf.zeros([self.cv_size, 1]), cv_du @ qV],
                          axis=1, name='pred'))
            pred = count_column * proportions

            # Mean absolute difference between prediction and holdout.
            self.cv = tf.reduce_mean(
                tf.squeeze(tf.abs(pred - cv_targets)))

    # keep all summaries on the cpu
    with tf.device('/cpu:0'):
        tf.summary.scalar('logloss', self.log_loss)
        tf.summary.scalar('cv_rmse', self.cv)
        tf.summary.histogram('qUmain', self.qUmain)
        tf.summary.histogram('qVmain', self.qVmain)
        tf.summary.histogram('qUbias', self.qUbias)
        tf.summary.histogram('qVbias', self.qVbias)
        self.merged = tf.summary.merge_all()
        self.writer = tf.summary.FileWriter(self.save_path,
                                            self.session.graph)

    with tf.device(self.device_name):
        with tf.name_scope('optimize'):
            optimizer = tf.train.AdamOptimizer(self.learning_rate,
                                               beta1=self.beta_1,
                                               beta2=self.beta_2)
            grads_and_vars = optimizer.compute_gradients(self.log_loss)
            raw_gradients, self.variables = zip(*grads_and_vars)
            self.gradients, _ = tf.clip_by_global_norm(
                raw_gradients, self.clipnorm)
            self.train = optimizer.apply_gradients(
                zip(self.gradients, self.variables))

    tf.global_variables_initializer().run()
class AIRModel(object):
    """Wraps an `AIRCell` unrolled for `max_steps` steps and its training losses.

    The constructor builds the forward graph; `train_step` builds the
    reconstruction, prior and REINFORCE losses and the optimizer ops.
    """

    def __init__(self, obs, nums, max_steps, glimpse_size, n_appearance,
                 transition, input_encoder, glimpse_encoder, glimpse_decoder,
                 transform_estimator, steps_predictor, output_std=1.,
                 discrete_steps=True, step_bias=0., explore_eps=None,
                 debug=False):
        """Store the configuration and build the forward graph.

        :param obs: observation tensor; its static shape is read as
            `[batch_size] + img_size`.
        :param nums: tensor summed over axis 0 to obtain the ground-truth
            number of steps per sample.
        :param max_steps: number of steps the cell is unrolled for.
        :param glimpse_size: shape of a single glimpse.
        :param n_appearance: forwarded to `AIRCell`.
        :param transition, input_encoder, glimpse_encoder, glimpse_decoder,
            transform_estimator, steps_predictor: components forwarded to
            `AIRCell`.
        :param output_std: scale of the Normal output distribution.
        :param discrete_steps: forwarded to `AIRCell`.
        :param step_bias: stored on the instance; not used in this class.
        :param explore_eps: if not None, turned into a non-trainable
            variable and forwarded to `AIRCell`.
        :param debug: forwarded to `AIRCell`.
        """
        self.obs = obs
        self.nums = nums
        self.max_steps = max_steps
        self.glimpse_size = glimpse_size
        self.n_appearance = n_appearance
        self.output_std = output_std
        self.discrete_steps = discrete_steps
        self.step_bias = step_bias
        self.explore_eps = explore_eps
        self.debug = debug

        with tf.variable_scope(self.__class__.__name__):
            shape = self.obs.get_shape().as_list()
            self.batch_size = shape[0]
            self.img_size = shape[1:]
            self._build(transition, input_encoder, glimpse_encoder,
                        glimpse_decoder, transform_estimator,
                        steps_predictor)

    def _build(self, transition, input_encoder, glimpse_encoder,
               glimpse_decoder, transform_estimator, steps_predictor):
        """Build the unrolled cell and expose its outputs as attributes."""
        if self.explore_eps is not None:
            self.explore_eps = tf.get_variable(
                'explore_eps', initializer=self.explore_eps, trainable=False)

        self.cell = AIRCell(self.img_size, self.glimpse_size,
                            self.n_appearance, transition, input_encoder,
                            glimpse_encoder, glimpse_decoder,
                            transform_estimator, steps_predictor,
                            canvas_init=None,
                            discrete_steps=self.discrete_steps,
                            explore_eps=self.explore_eps,
                            debug=self.debug)

        initial_state = self.cell.initial_state(self.obs)

        # The cell takes no real per-step input; the dummy sequence only
        # fixes the number of unrolled steps.
        dummy_sequence = tf.zeros((self.max_steps, self.batch_size, 1),
                                  name='dummy_sequence')
        outputs, state = tf.nn.dynamic_rnn(self.cell, dummy_sequence,
                                           initial_state=initial_state,
                                           time_major=True)

        # Every cell output becomes an attribute named after it.
        for name, output in zip(self.cell.output_names, outputs):
            setattr(self, name, output)

        # canvas, glimpse, what, what_loc, what_scale, where, where_loc, where_scale, presence_prob, presence = outputs
        self.glimpse = tf.reshape(
            self.presence * tf.nn.sigmoid(self.glimpse),
            (self.max_steps, self.batch_size,) + tuple(self.glimpse_size))
        self.canvas = tf.reshape(
            self.canvas,
            (self.max_steps, self.batch_size,) + tuple(self.img_size))
        self.final_canvas = self.canvas[-1]

        self.output_distrib = Normal(self.final_canvas, self.output_std)

        posterior_step_probs = tf.transpose(tf.squeeze(self.presence_prob))
        self.num_steps_distrib = NumStepsDistribution(posterior_step_probs)

        self.num_step_per_sample = tf.to_float(
            tf.squeeze(tf.reduce_sum(self.presence, 0)))
        self.num_step = tf.reduce_mean(self.num_step_per_sample)
        self.gt_num_steps = tf.squeeze(tf.reduce_sum(self.nums, 0))

    def _prior_loss(self, appearance_prior, where_scale_prior,
                    where_shift_prior, num_steps_prior, global_step):
        """Create the KL terms of the loss for every prior that is given.

        Each prior may be None, in which case its term is skipped. The
        'where' term needs both `where_scale_prior` and `where_shift_prior`.

        :return: a `Loss` object accumulating the created terms.
        """
        with tf.variable_scope('prior_loss'):
            prior_loss = Loss()

            if num_steps_prior is not None:
                if num_steps_prior.anneal is not None:
                    with tf.variable_scope('num_steps_prior'):
                        nsp = num_steps_prior
                        val = tf.get_variable(
                            'value', initializer=num_steps_prior.init,
                            dtype=tf.float32, trainable=False)

                        if num_steps_prior.anneal == 'exp':
                            decay_rate = (nsp.final / nsp.init) ** (
                                float(nsp.steps_div) / nsp.steps)
                            val = tf.train.exponential_decay(
                                val, global_step, nsp.steps_div, decay_rate)
                        elif num_steps_prior.anneal == 'linear':
                            val = nsp.final + (nsp.init - nsp.final) * (
                                1. - tf.to_float(global_step) / nsp.steps)

                        # Never anneal below the final value.
                        num_steps_prior_value = tf.maximum(nsp.final, val)
                else:
                    num_steps_prior_value = num_steps_prior.init

                # NOTE(review): the second argument is hard-coded to 3
                # rather than taken from self.max_steps - confirm intent.
                prior = geometric_prior(num_steps_prior_value, 3)
                steps_kl = tabular_kl(self.num_steps_distrib.prob(), prior)
                num_steps_prior_loss_per_sample = tf.squeeze(
                    tf.reduce_sum(steps_kl, 1))
                self.num_steps_prior_loss = tf.reduce_mean(
                    num_steps_prior_loss_per_sample)
                tf.summary.scalar('num_steps_prior',
                                  self.num_steps_prior_loss)
                prior_loss.add(self.num_steps_prior_loss,
                               num_steps_prior_loss_per_sample)

            if appearance_prior is not None:
                prior = Normal(appearance_prior.loc, appearance_prior.scale)
                posterior = Normal(self.what_loc, self.what_scale)

                what_kl = _kl(posterior, prior)
                # Mask the KL with `presence` before summing over steps.
                what_kl = tf.reduce_sum(
                    what_kl, -1, keep_dims=True) * self.presence
                appearance_prior_loss_per_sample = tf.squeeze(
                    tf.reduce_sum(what_kl, 0))

                # n_samples_with_encoding = tf.reduce_sum(tf.to_float(tf.greater(num_step_per_sample, 0.)))
                # div = tf.maximum(n_samples_with_encoding, 1.)
                # appearance_prior_loss = tf.reduce_sum(latent_code_prior_loss_per_sample) / div
                self.appearance_prior_loss = tf.reduce_mean(
                    appearance_prior_loss_per_sample)
                tf.summary.scalar('latent_code_prior',
                                  self.appearance_prior_loss)
                prior_loss.add(self.appearance_prior_loss,
                               appearance_prior_loss_per_sample)

            # Both 'where' priors default to None in train_step, so only
            # build this term when both are actually provided.
            if where_scale_prior is not None and where_shift_prior is not None:
                usx, utx, usy, uty = tf.split(self.where_loc, 4, 2)
                ssx, stx, ssy, sty = tf.split(self.where_scale, 4, 2)

                us = tf.concat((usx, usy), -1)
                ss = tf.concat((ssx, ssy), -1)
                scale_distrib = Normal(us, ss)
                scale_prior = Normal(where_scale_prior.loc,
                                     where_scale_prior.scale)
                scale_kl = _kl(scale_distrib, scale_prior)

                ut = tf.concat((utx, uty), -1)
                st = tf.concat((stx, sty), -1)
                shift_distrib = Normal(ut, st)

                # Without an explicit prior mean the posterior mean is
                # reused as the mean of the shift prior.
                if 'loc' in where_shift_prior:
                    shift_mean = where_shift_prior.loc
                else:
                    shift_mean = ut
                shift_prior = Normal(shift_mean, where_shift_prior.scale)
                shift_kl = _kl(shift_distrib, shift_prior)

                where_kl = tf.reduce_sum(
                    scale_kl + shift_kl, -1, keep_dims=True) * self.presence
                where_kl_per_sample = tf.reduce_sum(tf.squeeze(where_kl), 0)
                self.where_kl = tf.reduce_mean(where_kl_per_sample)
                tf.summary.scalar('where_prior', self.where_kl)
                prior_loss.add(self.where_kl, where_kl_per_sample)

        return prior_loss

    def _reinforce(self, loss, make_opt, baseline=None):
        """Create the REINFORCE loss and, optionally, the baseline train op.

        :param loss: `Loss` object whose per-sample value is the learning
            signal.
        :param make_opt: callable mapping a learning rate to an optimizer.
        :param baseline: None, a callable invoked as
            `baseline(obs, what, where, presence_prob)`, or an already
            computed baseline value. When None, `self.baseline` is used if
            it exists. Only a callable baseline is trained here.
        :return: `(reinforce_loss, baseline_vars, baseline_train_step)`;
            the last two are `[]` and `None` when no baseline is trained.
        """
        if baseline is None:
            baseline = getattr(self, 'baseline', None)

        baseline_module = None
        if callable(baseline):
            baseline_module = baseline
            self.baseline = baseline(self.obs, self.what, self.where,
                                     self.presence_prob)
        elif baseline is not None:
            # A precomputed baseline has no module whose variables could
            # be trained; use its value as it is.
            self.baseline = baseline

        log_prob = self.num_steps_distrib.log_prob(self.num_step_per_sample)
        log_prob = tf.clip_by_value(log_prob, -1e38, 1e38)
        # log_prob *= -1 # cause we're maximising

        self.importance_weight = loss._per_sample
        if baseline is not None:
            self.importance_weight -= self.baseline

        # No gradient flows through the learning signal itself.
        reinforce_loss_per_sample = tf.stop_gradient(
            self.importance_weight) * log_prob
        self.reinforce_loss = tf.reduce_mean(reinforce_loss_per_sample)
        tf.summary.scalar('reinforce_loss', self.reinforce_loss)

        # Baseline Optimisation
        baseline_vars, baseline_train_step = [], None
        if baseline_module is not None:
            baseline_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES,
                scope=baseline_module.variable_scope.name)

            baseline_target = tf.stop_gradient(loss.per_sample)
            baseline_loss_per_sample = (
                baseline_target - self.baseline) ** 2
            self.baseline_loss = tf.reduce_mean(baseline_loss_per_sample)
            tf.summary.scalar('baseline_loss', self.baseline_loss)

            # The baseline is trained with a 10x larger learning rate.
            baseline_opt = make_opt(10 * self.learning_rate)
            baseline_train_step = baseline_opt.minimize(
                self.baseline_loss, var_list=baseline_vars)

        return self.reinforce_loss, baseline_vars, baseline_train_step

    def train_step(self, learning_rate, l2_weight=0., appearance_prior=None,
                   where_scale_prior=None, where_shift_prior=None,
                   num_steps_prior=None, use_prior=True, use_reinforce=True,
                   baseline=None):
        """Create the losses and the training ops.

        :param learning_rate: initial value of the learning-rate variable.
        :param l2_weight: weight of the L2 penalty on rank-2 model
            variables; no penalty is added when it is not positive.
        :param appearance_prior, where_scale_prior, where_shift_prior,
            num_steps_prior: priors forwarded to `_prior_loss`.
        :param use_prior: whether to add the prior loss.
        :param use_reinforce: whether to add the REINFORCE loss.
        :param baseline: forwarded to `_reinforce`.
        :return: `(train_ops, global_step)` where `train_ops` is a list.
        """
        self.l2_weight = l2_weight
        self.appearance_prior = appearance_prior
        self.where_scale_prior = where_scale_prior
        self.where_shift_prior = where_shift_prior
        self.num_steps_prior = num_steps_prior
        self.use_prior = use_prior
        self.use_reinforce = use_reinforce

        with tf.variable_scope('loss'):
            global_step = tf.train.get_or_create_global_step()
            loss = Loss()
            self._train_step = []
            self.learning_rate = tf.Variable(learning_rate,
                                             name='learning_rate',
                                             trainable=False)
            make_opt = lambda lr: tf.train.RMSPropOptimizer(
                lr, momentum=.9, centered=True)

            # Reconstruction Loss
            rec_loss_per_sample = -self.output_distrib.log_prob(self.obs)
            self.rec_loss_per_sample = tf.reduce_sum(rec_loss_per_sample,
                                                     axis=(1, 2))
            self.rec_loss = tf.reduce_mean(self.rec_loss_per_sample)
            tf.summary.scalar('rec', self.rec_loss)
            loss.add(self.rec_loss, self.rec_loss_per_sample)

            # Prior Loss
            if use_prior:
                self.prior_loss = self._prior_loss(appearance_prior,
                                                   where_scale_prior,
                                                   where_shift_prior,
                                                   num_steps_prior,
                                                   global_step)
                tf.summary.scalar('prior', self.prior_loss.value)
                loss.add(self.prior_loss)

            # REINFORCE
            opt_loss = loss.value
            baseline_vars = []
            if use_reinforce:
                (reinforce_loss, baseline_vars,
                 baseline_train_step) = self._reinforce(loss, make_opt,
                                                        baseline)
                if baseline_train_step is not None:
                    self._train_step.append(baseline_train_step)
                opt_loss += reinforce_loss

            # Baseline variables are trained by their own optimizer only.
            model_vars = list(
                set(tf.trainable_variables()) - set(baseline_vars))

            # L2 reg
            if l2_weight > 0.:
                # don't penalise biases
                weights = [w for w in model_vars if len(w.get_shape()) == 2]
                self.l2_loss = l2_weight * sum(map(tf.nn.l2_loss, weights))
                opt_loss += self.l2_loss
                tf.summary.scalar('l2', self.l2_loss)

            opt = make_opt(self.learning_rate)
            gvs = opt.compute_gradients(opt_loss, var_list=model_vars)
            true_train_step = opt.apply_gradients(gvs,
                                                  global_step=global_step)
            self._train_step.append(true_train_step)

            # Metrics
            gradient_summaries(gvs)
            self.num_step_accuracy = tf.reduce_mean(
                tf.to_float(
                    tf.equal(self.gt_num_steps, self.num_step_per_sample)))

        self.loss = loss
        return self._train_step, global_step
def __init__(self, args, d, logdir):
    """Build the embedding model graph and its loss.

    `args`, `d` and `logdir` are forwarded unchanged to the base class
    initializer; every other quantity used here is read from attributes
    expected to exist on `self` after that call.
    """
    super(bern_emb_model, self).__init__(args, d, logdir)
    self.n_minibatch = self.n_minibatch.sum()

    # Number of context positions on each side of a target word.
    half_window = int(self.cs / 2)

    with tf.name_scope('model'):
        # Data Placeholder
        with tf.name_scope('input'):
            self.placeholders = tf.placeholder(tf.int32)
            self.words = self.placeholders

        # Index Masks
        with tf.name_scope('context_mask'):
            # Positions of the target words inside the input sequence.
            target_positions = tf.range(half_window,
                                        self.n_minibatch + half_window)
            self.p_mask = tf.cast(target_positions, tf.int32)

            # offsets[i, j] = j and starts[i, j] = i, so their sum indexes
            # the j-th word of the window that begins at position i.
            offsets = tf.cast(
                tf.tile(tf.expand_dims(tf.range(0, half_window), [0]),
                        [self.n_minibatch, 1]),
                tf.int32)
            starts = tf.cast(
                tf.tile(tf.expand_dims(tf.range(0, self.n_minibatch), [1]),
                        [1, half_window]),
                tf.int32)
            # Left half of the window, then the right half, which skips
            # the target position itself.
            self.ctx_mask = tf.concat(
                [offsets + starts,
                 offsets + starts + half_window + 1],
                1)

        with tf.name_scope('embeddings'):
            self.rho = tf.Variable(self.rho_init, name='rho')
            self.alpha = tf.Variable(self.alpha_init, name='alpha',
                                     trainable=self.alpha_trainable)

            with tf.name_scope('priors'):
                gaussian_prior = Normal(loc=0.0, scale=self.sig)
                # alpha only contributes to the prior when it is trained.
                if self.alpha_trainable:
                    self.log_prior = tf.reduce_sum(
                        gaussian_prior.log_prob(self.rho) +
                        gaussian_prior.log_prob(self.alpha))
                else:
                    self.log_prior = tf.reduce_sum(
                        gaussian_prior.log_prob(self.rho))

        with tf.name_scope('natural_param'):
            # Target and Context Indices
            with tf.name_scope('target_word'):
                self.p_idx = tf.gather(self.words, self.p_mask)
                self.p_rho = tf.squeeze(tf.gather(self.rho, self.p_idx))

            # Negative samples
            with tf.name_scope('negative_samples'):
                log_unigram = tf.log(tf.constant(self.unigram))
                unigram_logits = tf.tile(tf.expand_dims(log_unigram, [0]),
                                         [self.n_minibatch, 1])
                self.n_idx = tf.multinomial(unigram_logits, self.ns)
                self.n_rho = tf.gather(self.rho, self.n_idx)

            with tf.name_scope('context'):
                self.ctx_idx = tf.squeeze(
                    tf.gather(self.words, self.ctx_mask))
                self.ctx_alphas = tf.gather(self.alpha, self.ctx_idx)

            # Natural parameter
            context_total = tf.reduce_sum(self.ctx_alphas, [1])
            self.p_eta = tf.expand_dims(
                tf.reduce_sum(tf.multiply(self.p_rho, context_total), -1),
                1)
            tiled_context = tf.tile(tf.expand_dims(context_total, 1),
                                    [1, self.ns, 1])
            self.n_eta = tf.reduce_sum(
                tf.multiply(self.n_rho, tiled_context), -1)

        # Conditional likelihood
        self.y_pos = Bernoulli(logits=self.p_eta)
        self.y_neg = Bernoulli(logits=self.n_eta)

        self.ll_pos = tf.reduce_sum(self.y_pos.log_prob(1.0))
        self.ll_neg = tf.reduce_sum(self.y_neg.log_prob(0.0))

        self.log_likelihood = self.ll_pos + self.ll_neg

        # NOTE(review): this value is computed but never used; the loss
        # below is scaled by self.n_epochs instead - confirm intent.
        scale = 1.0 * self.N / self.n_minibatch
        self.loss = -(self.n_epochs * self.log_likelihood + self.log_prior)
def loss(self, G_data, y_data, batch):
    """ Computes the loss.

    Parameters
    ----------
    G_data : tf.Tensor
        Design matrix
    y_data : tf.SparseTensor
        Sparse tensor of counts
    batch : tuple of results tf.Tensor
        The output from sample().  The tuple is decomposed as follows

        positive_batch : tf.SparseTensor
           Sparse tensor of positive examples
        negative_batch : tf.SparseTensor
           Sparse tensor of negative examples
        accident_batch : tf.SparseTensor
           Sparse tensor of accidental positive examples.  These are
           examples that are claimed to be negative, but are actually
           positive.  Their log probability is added to the positive term
           and subtracted from the negative term below to correct the
           accident.  Since Poisson(0) + Poisson(k) = Poisson(k), this
           should be equivalent.
        num_exp_pos : int
           Number of expected positive hits.  This is useful for scaling
           the minibatches appropriately.
        num_exp_neg : int
           Number of expected negative hits.  This is useful for scaling
           the minibatches appropriately.

    Returns
    -------
    log_loss : tf.Tensor
        Negative of the sum of the prior log probabilities of `qgamma` and
        `qbeta` and the rescaled Poisson log probabilities of the batch.
    """
    with tf.name_scope('loss'):
        opts = self.opts
        (positive_batch, negative_batch, accident_batch,
         num_exp_pos, num_exp_neg) = batch

        gamma_mean, gamma_scale = opts.gamma_mean, opts.gamma_scale
        beta_mean, beta_scale = opts.beta_mean, opts.beta_scale
        N, D, p = self.N, self.D, self.p

        # tf.size only supports integer output types; count as an integer
        # and cast afterwards.
        num_nonzero = tf.cast(tf.size(y_data.values, out_type=tf.int32),
                              dtype=tf.float32)

        # unpack sparse tensors
        pos_data = positive_batch.values  # nonzero examples
        pos_row = tf.gather(positive_batch.indices, 0, axis=1)
        pos_col = tf.gather(positive_batch.indices, 1, axis=1)
        neg_data = negative_batch.values  # zero examples
        neg_row = tf.gather(negative_batch.indices, 0, axis=1)
        neg_col = tf.gather(negative_batch.indices, 1, axis=1)
        acc_data = accident_batch.values  # accident examples
        acc_row = tf.gather(accident_batch.indices, 0, axis=1)
        acc_col = tf.gather(accident_batch.indices, 1, axis=1)

        batch_size, num_sampled = opts.batch_size, opts.num_neg_samples

        # obtain prediction to then calculate loss
        Gpos = tf.gather(G_data, pos_row, axis=0)
        y_pred = self.inference(Gpos, pos_col)

        # Log of the per-row totals of y_data, used as a sample bias.
        theta = tf.log(
            tf.cast(tf.sparse_reduce_sum(y_data, axis=1), dtype=tf.float32))

        qbeta, qgamma = self.qbeta, self.qgamma

        # Actual calculation of loss is below.
        # Adding sample bias
        y_pred += tf.reshape(tf.gather(theta, pos_row), shape=[batch_size])

        total_zero = tf.constant(N * D, dtype=tf.float32) - num_nonzero
        total_nonzero = num_nonzero
        pos_poisson = Poisson(log_rate=y_pred, name='Y')

        # Distributions
        # species bias
        gamma = Normal(loc=tf.zeros([1, D]) + gamma_mean,
                       scale=tf.ones([1, D]) * gamma_scale,
                       name='gamma')
        # regression coefficents distribution
        beta = Normal(loc=tf.zeros([p, D]) + beta_mean,
                      scale=tf.ones([p, D]) * beta_scale,
                      name='B')

        # sparse matrix multiplication for negative samples
        Gneg = tf.gather(G_data, neg_row, axis=0)
        Gneg = tf.concat([tf.ones([num_sampled, 1]), Gneg], axis=1)
        neg_prime = tf.reduce_sum(
            tf.multiply(
                Gneg, tf.transpose(tf.gather(self.V, neg_col, axis=1))),
            axis=1)
        neg_phi = tf.reshape(tf.gather(theta, neg_row),
                             shape=[num_sampled]) + neg_prime
        neg_poisson = Poisson(log_rate=neg_phi, name='neg_counts')

        # accident samples
        num_acc = tf.shape(accident_batch.indices)[0]
        Gacc = tf.gather(G_data, acc_row, axis=0)
        Gacc = tf.concat([tf.ones([num_acc, 1]), Gacc], axis=1)
        acc_prime = tf.reduce_sum(
            tf.multiply(
                Gacc, tf.transpose(tf.gather(self.V, acc_col, axis=1))),
            axis=1)
        acc_phi = tf.reshape(tf.gather(theta, acc_row),
                             shape=[num_acc]) + acc_prime
        acc_poisson = Poisson(log_rate=acc_phi, name='acc_counts')

        pos_data = tf.cast(pos_data, dtype=tf.float32)
        neg_data = tf.cast(neg_data, dtype=tf.float32)
        acc_data = tf.cast(acc_data, dtype=tf.float32)

        # Accidents count towards the positives and away from the
        # negatives when rescaling the minibatch terms.
        num_acc = tf.cast(tf.size(acc_data), tf.float32)
        num_pos = batch_size + num_acc
        num_neg = num_sampled - num_acc

        pos_prob = pos_poisson.log_prob(pos_data)
        neg_prob = neg_poisson.log_prob(neg_data)
        acc_prob = acc_poisson.log_prob(acc_data)

        total_pos = tf.reduce_sum(pos_prob)
        total_acc = tf.reduce_sum(acc_prob)
        total_neg = tf.reduce_sum(neg_prob)
        total_gamma = tf.reduce_sum(gamma.log_prob(qgamma))
        total_beta = tf.reduce_sum(beta.log_prob(qbeta))

        log_loss = -(
            total_gamma + total_beta +
            (total_pos + total_acc) * (total_nonzero / num_pos) +
            (total_neg - total_acc) * (total_zero / num_neg)
        )

    return log_loss
def _log_prob1(mean, std, targets):
    """Return the log density of `targets` under Normal(loc=mean, scale=std)."""
    return Normal(loc=mean, scale=std).log_prob(targets)
def loss(self, G_data, y_data, positive_batch, random_batch):
    """ Computes the loss.

    Parameters
    ----------
    G_data : tf.Tensor
        Design matrix
    y_data : tf.SparseTensor
        Sparse tensor of counts
    positive_batch : tf.Tensor
        A Sparse tensor representing a batch of positive examples.
    random_batch : tf.Tensor
        A Sparse tensor representing a batch of random examples.

    Returns
    -------
    log_loss : tf.Tensor
        Negative of the sum of the prior log probabilities of the
        coefficients and the rescaled Poisson term.

    Notes
    -----
    Creates the `qgamma` and `qbeta` variables and stores them, together
    with their concatenation `V`, on `self`.
    """
    with tf.name_scope('loss'):
        gamma_mean, gamma_scale = self.gamma_mean, self.gamma_scale
        beta_mean, beta_scale = self.beta_mean, self.beta_scale
        N, D, p = self.block_size, self.D, self.p

        num_nonzero = tf.cast(tf.size(y_data.values, out_type=tf.int32),
                              dtype=tf.float32)

        # unpack sparse tensors
        pos_data = tf.cast(positive_batch.values, dtype=tf.float32)
        pos_row = tf.gather(positive_batch.indices, 0, axis=1)
        pos_col = tf.gather(positive_batch.indices, 1, axis=1)
        rand_row = tf.gather(random_batch.indices, 0, axis=1)
        rand_col = tf.gather(random_batch.indices, 1, axis=1)

        # tf.size only supports integer output types; count as an integer
        # and cast afterwards, as done for num_nonzero above.
        num_sampled = tf.cast(tf.size(pos_row, out_type=tf.int32),
                              dtype=tf.float32)

        theta = tf.log(  # basically log total counts
            tf.cast(tf.sparse_reduce_sum(y_data, axis=1), dtype=tf.float32))

        # Regression coefficients
        qgamma = tf.Variable(tf.random_normal([1, D]), name='qgamma')
        qbeta = tf.Variable(tf.random_normal([p, D]), name='qbeta')

        # Intercept row stacked on top of the coefficients; G gets a
        # matching leading column of ones.
        self.V = tf.concat([qgamma, qbeta], axis=0, name='V')
        G = tf.concat([tf.ones([G_data.shape[0], 1]), G_data],
                      axis=1, name='G')

        with tf.name_scope('positive_log_prob'):
            # add bias terms for samples
            Gpos = tf.gather(G, pos_row, axis=0)
            Vpos = tf.transpose(tf.gather(self.V, pos_col, axis=1),
                                name='Vprime')
            # sparse matrix multiplication for positive samples
            y_pred = tf.reduce_sum(tf.multiply(Gpos, Vpos), axis=1)
            theta_pos = tf.squeeze(tf.gather(theta, pos_row))
            # Sum of count * log-rate over the positive batch.
            pos_prob = tf.reduce_sum(
                tf.multiply(pos_data, y_pred + theta_pos))
            sparse_scale = num_nonzero / num_sampled

        with tf.name_scope('coefficient_log_prob'):
            Grand = tf.gather(G, rand_row, axis=0)
            Vrand = tf.transpose(tf.gather(self.V, rand_col, axis=1),
                                 name='Vprime')
            # sparse matrix multiplication for random indices
            y_rand = tf.reduce_sum(tf.multiply(Grand, Vrand), axis=1)
            theta_rand = tf.squeeze(tf.gather(theta, rand_row))
            # Sum of the rates over the random batch.
            coef_prob = tf.reduce_sum(tf.exp(y_rand + theta_rand))
            coef_scale = N * D / self.num_neg_samples

        total_poisson = pos_prob * sparse_scale - coef_prob * coef_scale

        with tf.name_scope('priors'):
            # Normal priors (a.k.a. L2 regularization)
            # species intercepts
            gamma = Normal(loc=tf.zeros([1, D]) + gamma_mean,
                           scale=tf.ones([1, D]) * gamma_scale,
                           name='gamma')
            # regression coefficents distribution
            beta = Normal(loc=tf.zeros([p, D]) + beta_mean,
                          scale=tf.ones([p, D]) * beta_scale,
                          name='B')

            total_gamma = tf.reduce_sum(gamma.log_prob(qgamma))
            total_beta = tf.reduce_sum(beta.log_prob(qbeta))

        log_loss = -(total_gamma + total_beta + total_poisson)

    # save parameters to model
    self.qbeta = qbeta
    self.qgamma = qgamma
    return log_loss