def lstm_layer(hidden_inpt, hidden_to_hidden,
               ingate_peephole, outgate_peephole, forgetgate_peephole, f):
    """Run a peephole LSTM recurrence over ``hidden_inpt`` with ``theano.scan``.

    ``hidden_inpt`` is scanned along its first axis and indexed as
    ``hidden_inpt[0, :, 0:n_hidden_out]``, so it has at least three axes
    (presumably time x batch x 4 * n_hidden_out -- confirm with the caller).
    ``f`` is the callable applied to every gate pre-activation and to the
    new cell state.

    Returns the pair ``(states, hidden_rec)`` produced by the scan: the cell
    states and the hidden outputs for every step.
    """
    n_hidden_out = hidden_to_hidden.shape[0]

    def lstm_step(x_t, s_tm1, h_tm1):
        # Add the recurrent projection of the previous hidden output.
        x_t += T.dot(h_tm1, hidden_to_hidden)

        # The first n_hidden_out columns feed the cell input, the remaining
        # columns hold the three gate pre-activations (in, forget, out).
        cell_input = T.tanh(x_t[:, :n_hidden_out])
        gate_block = x_t[:, n_hidden_out:]

        # Every peephole term is computed from the *previous* cell state.
        peep_in = s_tm1 * ingate_peephole
        peep_out = s_tm1 * outgate_peephole
        peep_forget = s_tm1 * forgetgate_peephole

        gate_in = f(gate_block[:, :n_hidden_out] + peep_in)
        gate_forget = f(
            gate_block[:, n_hidden_out:2 * n_hidden_out] + peep_forget)
        gate_out = f(gate_block[:, 2 * n_hidden_out:] + peep_out)

        s_t = cell_input * gate_in + s_tm1 * gate_forget
        h_t = f(s_t) * gate_out
        return [s_t, h_t]

    # Both recurrent outputs start from zeros shaped like one time slice.
    initial_outputs = [
        T.zeros_like(hidden_inpt[0, :, 0:n_hidden_out]),
        T.zeros_like(hidden_inpt[0, :, 0:n_hidden_out]),
    ]
    (states, hidden_rec), _ = theano.scan(
        lstm_step,
        sequences=hidden_inpt,
        outputs_info=initial_outputs)

    return states, hidden_rec
def grad(self, inputs, gradients):
    """Back-propagate the output gradients to the two inputs ``(M, e)``.

    The op is re-applied to ``(M, e)`` to obtain its outputs ``(E, f)``.
    ``gradients[0]`` is the gradient w.r.t. ``E`` and ``gradients[1]`` the
    gradient w.r.t. ``f``; either may be disconnected, in which case its
    contribution is skipped.

    Returns ``[bM, be]``, the gradients w.r.t. ``M`` and ``e``.
    """
    M, e = inputs
    E, f = self(M, e)

    def _is_connected(g):
        return not isinstance(g.type, theano.gradient.DisconnectedType)

    # Both accumulators start as zeros shaped like M.
    bM = tt.zeros_like(M)
    be = tt.zeros_like(M)
    ecosE = e * tt.cos(E)

    if _is_connected(gradients[0]):
        # Contribution flowing back through E.
        bM = gradients[0] / (1 - ecosE)
        be = tt.sin(E) * bM

    if _is_connected(gradients[1]):
        # Contribution flowing back through f, written in terms of f / 2.
        sinf2 = tt.sin(0.5*f)
        cosf2 = tt.cos(0.5*f)
        tanf2 = sinf2 / cosf2
        cosf22 = cosf2**2

        e2 = e**2
        ome = 1 - e
        ope = 1 + e
        ome2 = 1 - e2

        twoecosf22 = 2 * e * cosf22
        factor = tt.sqrt(ope/ome)
        inner = (twoecosf22+ome) * tt.as_tensor_variable(gradients[1])

        bM += factor*(ome*tanf2**2+ope)*inner*cosf22/(ope*ome2)
        be += -2*cosf22*tanf2/ome2**2*inner*(ecosE-2+e2)

    return [bM, be]
def get_output(self,y,y_mask,init_state,train=False):
    """Build the scanned decoder output for targets ``y``.

    The layer input, its mask, ``y`` and ``y_mask`` are all dimshuffled so
    that their first two axes are swapped (presumably batch-major to
    time-major -- confirm with callers). ``y`` and ``y_mask`` are then
    delayed by one step along the leading axis, with zeros at position 0,
    before being fed to ``self._step`` through ``theano.scan``.

    Returns the second scan output with its first two axes swapped back.
    """
    def _delay_one_step(seq):
        # Zeros at index 0, seq[:-1] in the remaining positions.
        return T.set_subtensor(T.zeros_like(seq)[1:], seq[:-1])

    X = self.get_input(train).dimshuffle((1, 0, 2))
    X_mask = self.previous.x_mask.dimshuffle((1, 0))

    y = _delay_one_step(y.dimshuffle((1, 0, 2)))
    y_mask = _delay_one_step(y_mask.dimshuffle((1, 0)))

    # Project the delayed targets once, outside the scan.
    y_z = T.dot(y, self.W_z) + self.b_z
    y_r = T.dot(y, self.W_r) + self.b_r
    y_h = T.dot(y, self.W_h) + self.b_h

    [_hidden, logit], _ = theano.scan(
        self._step,
        sequences=[y, y_z, y_r, y_h, y_mask],
        outputs_info=[init_state, None],
        non_sequences=[X, X_mask])

    return logit.dimshuffle((1, 0, 2))
def _construct_compute_ll_bound(self):
    """
    Build and return a function estimating the variational likelihood bound.

    The returned callable ``multi_sample_bound(X, sample_count=10)`` calls a
    compiled theano function ``sample_count`` times and returns
    ``[ll_bounds, post_klds, log_likelihoods, max_lls]``: the averaged
    bound, the averaged posterior KLds, the averaged log-likelihoods and the
    element-wise maximum log-likelihood seen over the calls.
    """
    # Symbolic data input; the control and mask inputs are fixed to zeros.
    Xd = T.matrix()
    Xc = T.zeros_like(Xd)
    Xm = T.zeros_like(Xd)

    # Symbolic posterior KLd cost.
    post_kld = self.IN.kld_cost
    # Symbolic log-likelihood, computed on the encoded input when requested.
    ll_input = self.IN.Xd_encoded if self.use_encoder else self.IN.Xd
    log_likelihood = self.GN.compute_log_prob(ll_input)

    out_func = theano.function(
        [Xd],
        outputs=[post_kld, log_likelihood],
        givens={self.Xd: Xd, self.Xc: Xc, self.Xm: Xm})

    def multi_sample_bound(X, sample_count=10):
        row_count = X.shape[0]
        post_klds = np.zeros((row_count, 1))
        log_likelihoods = np.zeros((row_count, 1))
        max_lls = np.zeros((row_count, 1)) - 1e8
        for _ in range(sample_count):
            result = out_func(X)
            kld_draw = 1.0 * result[0]
            ll_draw = 1.0 * result[1]
            # Out-of-place adds on purpose: they keep numpy broadcasting
            # behaviour identical whatever shape the outputs have.
            post_klds = post_klds + kld_draw
            log_likelihoods = log_likelihoods + ll_draw
            max_lls = np.maximum(max_lls, ll_draw)
        post_klds = post_klds / sample_count
        log_likelihoods = log_likelihoods / sample_count
        ll_bounds = log_likelihoods - post_klds
        return [ll_bounds, post_klds, log_likelihoods, max_lls]

    return multi_sample_bound
def _construct_sample_from_prior(self):
    """
    Build a sampler that draws independent samples from this model.

    The returned callable ``prior_sampler(samp_count)`` yields the full
    sequence of "partially completed" examples: the transformed initial
    state followed by the transformed state after each refinement step.
    """
    z_sym = T.matrix()
    x_sym = T.matrix()

    # Observation-space view of s0 and of each of the ir_steps later states.
    states = [self.s0] + [self.si[i] for i in range(self.ir_steps)]
    oputs = [self.obs_transform(state) for state in states]

    _, hi_zmuv = self._construct_zmuv_samples(x_sym, 1)
    givens = {
        self.z: z_sym,
        self.x_in: T.zeros_like(x_sym),
        self.x_out: T.zeros_like(x_sym),
        self.hi_zmuv: hi_zmuv,
    }
    sample_func = theano.function(inputs=[z_sym, x_sym],
                                  outputs=oputs,
                                  givens=givens,
                                  updates=self.scan_updates)

    def prior_sampler(samp_count):
        x_samps = to_fX(np.zeros((samp_count, self.obs_dim)))
        # Remember the current switch value so it can be restored below.
        old_switch = self.train_switch.get_value(borrow=False)
        # Generation mode.
        self.set_train_switch(switch_val=0.0)
        z_samps = to_fX(npr.randn(samp_count, self.z_dim))
        model_samps = sample_func(z_samps, x_samps)
        # Back to whichever mode the model was in before sampling.
        self.set_train_switch(switch_val=old_switch)
        return model_samps

    return prior_sampler
def rnade_sym(self,x,W,V_alpha,b_alpha,V_mu,b_mu,V_sigma,b_sigma,activation_rescaling):
    """
    Symbolic accumulated log-density of ``x`` under a Gaussian mixture per
    visible dimension.

    x is a matrix of column datapoints (VxB); V = n_visible, B = batch size.
    All parameter arguments are scanned along their first axis together
    with ``x``. Intermediate quantities are wrapped in
    ``theano.printing.Print`` ops, so evaluating the graph prints them.

    Returns ``(ps[-1], updates)``: the accumulator after the last scan step
    and the scan updates.
    """
    def _traced(label, value):
        return theano.printing.Print(label)(value)

    def density_given_previous_a_and_x(x, w, V_alpha, b_alpha, V_mu, b_mu,
                                       V_sigma, b_sigma, activation_factor,
                                       p_prev, a_prev, x_prev,):
        # Update the running activation with the previous visible value.
        a = a_prev + T.dot(T.shape_padright(x_prev, 1), T.shape_padleft(w, 1))
        h = self.nonlinearity(a * activation_factor)  # BxH

        Alpha = _traced(
            'Alphas',
            T.nnet.softmax(T.dot(h, V_alpha) + T.shape_padleft(b_alpha)))  # BxC
        Mu = _traced('Mu', T.dot(h, V_mu) + T.shape_padleft(b_mu))  # BxC
        Sigma = _traced(
            'Sigmas',
            T.exp((T.dot(h, V_sigma) + T.shape_padleft(b_sigma))))  # BxC

        # Per-component weighted Gaussian log-density.
        arg = -constantX(0.5) * T.sqr((Mu - T.shape_padright(x, 1)) / Sigma) - T.log(Sigma) - constantX(0.5 * numpy.log(2 * numpy.pi)) + T.log(Alpha)
        arg = _traced('printing argument of logsumexp', arg)

        p_var = _traced('p_var', log_sum_exp(arg))
        p = p_prev + p_var
        return (p, a, x)

    # First element is different (it is predicted from the bias only)
    a0 = T.zeros_like(T.dot(x.T, W))  # BxH
    p0 = T.zeros_like(x[0])
    x0 = T.ones_like(x[0])

    scanned = [x, W, V_alpha, b_alpha, V_mu, b_mu, V_sigma, b_sigma,
               activation_rescaling]
    ([ps, _as, _xs], updates) = theano.scan(density_given_previous_a_and_x,
                                            sequences=scanned,
                                            outputs_info=[p0, a0, x0])
    return (ps[-1], updates)
def test_gpujoin_gpualloc():
    """Check the alloc/join ops in the compiled graphs of a join of allocs.

    The CPU graph must contain two ``T.alloc`` and one ``T.join``; both GPU
    graphs must contain two ``B.gpu_alloc`` and one ``B.gpu_join``. The CPU
    result and the GPU result of the ``+ 4`` variant must agree numerically.
    """
    a = T.fmatrix('a')
    a_val = numpy.asarray(numpy.random.rand(4, 5), dtype='float32')
    b = T.fmatrix('b')
    b_val = numpy.asarray(numpy.random.rand(3, 5), dtype='float32')

    def joined():
        return T.join(0, T.zeros_like(a), T.ones_like(b))

    def count_op(fn, op):
        return sum(node.op == op for node in fn.maker.env.toposort())

    f = theano.function([a, b], joined() + 4, mode=mode_without_gpu)
    f_gpu = theano.function([a, b], joined(), mode=mode_with_gpu)
    f_gpu2 = theano.function([a, b], joined() + 4, mode=mode_with_gpu)

    assert count_op(f, T.alloc) == 2
    assert count_op(f, T.join) == 1
    assert count_op(f_gpu, B.gpu_alloc) == 2
    assert count_op(f_gpu, B.gpu_join) == 1
    assert count_op(f_gpu2, B.gpu_alloc) == 2
    assert count_op(f_gpu2, B.gpu_join) == 1
    assert numpy.allclose(f(a_val, b_val), f_gpu2(a_val, b_val))
def create_cost_fun (self):
    """Define ``self.lstm_cost`` and ``self.final_cost``.

    Both costs score predictions against the input shifted by one timestep
    (each prediction guesses the next timestep's value). ``self.final_cost``
    additionally adds the two masked entropy sums, weighted by
    ``self.entropy_reg`` and ``self.key_entropy_reg``.
    """
    # Targets are the inputs from the second timestep onwards.
    what_to_predict = self.input_mat[:, 1:]

    # Sentences differ in length, so the loss is masked past each end. The
    # stored length is reduced by one here (an example going from `[2,3)`
    # has this value set to 0).
    for_how_long = self.for_how_long - 1

    # All sentences start at T=0.
    starting_when = T.zeros_like(self.for_how_long)

    def _summed_loss(predictions):
        return masked_loss(predictions,
                           what_to_predict,
                           for_how_long,
                           starting_when).sum()

    def _masked(values):
        # Keep values where mask_matrix is set, zero elsewhere.
        return T.switch(self.mask_matrix, values, T.zeros_like(values))

    self.lstm_cost = _summed_loss(self.lstm_predictions)

    real_entropy = _masked(self.entropy)
    real_key_entropy = _masked(self.key_entropy)

    self.final_cost = (_summed_loss(self.final_predictions)
                       + self.entropy_reg*real_entropy.sum()
                       + self.key_entropy_reg*real_key_entropy.sum())
def mf(self, V, Y = None, return_history = False, niter = None, block_grad = None):
    """Run inference through ``self.do_inpainting`` with nothing dropped from ``V``.

    Parameters
    ----------
    V : visible-layer input; its drop mask is all zeros.
    Y : optional target input. When given, its drop mask is all zeros too.
        When omitted, ``Y`` is allocated as ones of shape
        ``(V.shape[0], n_classes)`` and its drop mask as ones of shape
        ``(V.shape[0],)``.
    return_history : if true, return ``H_hat`` for every history entry
        instead of only the last one.
    niter, block_grad : forwarded unchanged to ``self.do_inpainting``.

    Returns
    -------
    ``history[-1]['H_hat']``, or the list of ``elem['H_hat']`` for every
    history element when ``return_history`` is true.
    """
    drop_mask = T.zeros_like(V)

    if Y is not None:
        drop_mask_Y = T.zeros_like(Y)
    else:
        batch_size = V.shape[0]
        num_classes = self.dbm.hidden_layers[-1].n_classes
        assert isinstance(num_classes, int)
        # No targets supplied: placeholder Y with an all-ones drop mask.
        Y = T.alloc(1., batch_size, num_classes)
        drop_mask_Y = T.alloc(1., batch_size)

    history = self.do_inpainting(X=V, Y=Y, return_history=True,
                                 drop_mask=drop_mask,
                                 drop_mask_Y=drop_mask_Y,
                                 noise=False, niter=niter,
                                 block_grad=block_grad)

    if return_history:
        return [elem['H_hat'] for elem in history]

    return history[-1]['H_hat']
def T_subspacel1_slow_shrinkage(a,L,lam_sparse,lam_slow,small_value=.001):
    """Compose a slowness shrinkage with a subspace-l1 shrinkage on ``a``.

    Rows of ``a`` are treated as pairs (even row, odd row). The amplitude of
    each pair is shrunk first by a term built from differences of the
    amplitude along the second axis (scaled by ``lam_slow / L``), then by a
    subspace-l1 term (scaled by ``lam_sparse / L``). ``small_value`` is
    added under the first square root.

    Returns a tensor shaped like ``a`` holding the shrunk pairs.
    """
    pair_a = a[::2,:]
    pair_b = a[1::2,:]
    amp = T.sqrt(pair_a**2 + pair_b**2 + small_value)

    # slow shrinkage: negated second difference in the interior, one-sided
    # first differences on the two border columns.
    first_diff = amp[:,1:] - amp[:,:-1]
    second_diff = first_diff[:,1:] - first_diff[:,:-1]
    div = T.zeros_like(amp)
    div = T.set_subtensor(div[:,1:-1], -second_diff)
    div = T.set_subtensor(div[:,0], -first_diff[:,0])
    div = T.set_subtensor(div[:,-1], first_diff[:,-1])

    slow_scale = 1 - (lam_slow/L)*(div/amp)
    slow_scale = T.switch(T.gt(slow_scale,0), slow_scale, 0)
    slow_a = slow_scale*pair_a
    slow_b = slow_scale*pair_b

    # subspace l1 shrinkage applied on top of the slow-shrunk pairs
    slow_amp = T.sqrt(slow_a**2 + slow_b**2)
    sparse_scale = 1. - (lam_sparse/L)/slow_amp
    sparse_scale = T.switch(T.gt(sparse_scale,0.), sparse_scale, 0.)

    # Interleave the two halves back into the original row layout.
    subspacel1_prox = T.zeros_like(a)
    subspacel1_prox = T.set_subtensor(subspacel1_prox[ ::2,:], sparse_scale*slow_a)
    subspacel1_prox = T.set_subtensor(subspacel1_prox[1::2,:], sparse_scale*slow_b)
    return subspacel1_prox
def filter_and_prob(inpt, transition, emission,
                    visible_noise_mean, visible_noise_cov,
                    hidden_noise_mean, hidden_noise_cov,
                    initial_hidden, initial_hidden_cov):
    """Run the forward filter over ``inpt`` and sum the log-likelihood.

    The first input is handled outside the scan: the step function is
    applied to zero hidden mean/covariance, and the resulting expressions
    are cloned with the hidden noise mean/covariance replaced by
    ``initial_hidden`` / ``initial_hidden_cov``. The remaining inputs are
    scanned with the same step function.

    Returns ``(f, F, ll)``: the first-step outputs stacked in front of the
    scanned ones, and the scanned log-likelihood summed over axis 0 plus
    the first step's log-likelihood.
    """
    step = forward_step(
        transition, emission,
        visible_noise_mean, visible_noise_cov,
        hidden_noise_mean, hidden_noise_cov)

    zero_mean = T.zeros_like(hidden_noise_mean).dimshuffle('x', 0)
    zero_cov = T.zeros_like(hidden_noise_cov).dimshuffle('x', 0, 1)
    first_f, first_F, first_ll = step(inpt[0], zero_mean, zero_cov)

    # Swap the hidden noise statistics for the initial-state ones.
    replace = {hidden_noise_mean: initial_hidden,
               hidden_noise_cov: initial_hidden_cov}
    f0 = theano.clone(first_f, replace)
    F0 = theano.clone(first_F, replace)
    ll0 = theano.clone(first_ll, replace)

    (f, F, ll), _ = theano.scan(
        step,
        sequences=inpt[1:],
        outputs_info=[f0, F0, None])

    ll = ll.sum(axis=0)

    # Put the first-step results in front of the scanned ones.
    f = T.concatenate([T.shape_padleft(f0), f])
    F = T.concatenate([T.shape_padleft(F0), F])
    ll += ll0

    return f, F, ll
def recur(self, ms_j, mt_jm1, mscut_j, mtcut_jm1, ssrcpos_js, vsrcpos_js, starpos_js, vtarpos_js ):
    """Compute ``b_j`` from CNN encodings of a source and a target input.

    Each encoder returns a table of n-gram rows and an utterance vector. A
    row of zeros is appended to each table (presumably so an out-of-range
    position selects a dummy vector -- confirm against the index
    construction). Rows selected by the four position arguments are summed
    and concatenated with the utterance vectors to form the features.

    Returns the first row of a softmax over the new score concatenated with
    ``self.B``.
    """
    def _with_dummy_row(table):
        return T.concatenate([table, T.zeros_like(table[-1:,:])], axis=0)

    def _summed_rows(table, positions):
        return T.sum(table[positions,:], axis=0)

    # cnn encoding
    ngms_j, uttms_j = self.sCNN.encode(ms_j, mscut_j)
    ngmt_jm1, uttmt_jm1 = self.tCNN.encode(mt_jm1, mtcut_jm1)

    # padding dummy vector
    ngms_j = _with_dummy_row(ngms_j)
    ngmt_jm1 = _with_dummy_row(ngmt_jm1)

    # source features
    src_js = T.concatenate([_summed_rows(ngms_j, ssrcpos_js),
                            _summed_rows(ngms_j, vsrcpos_js),
                            uttms_j], axis=0)

    # target features
    tar_js = T.concatenate([_summed_rows(ngmt_jm1, starpos_js),
                            _summed_rows(ngmt_jm1, vtarpos_js),
                            uttmt_jm1], axis=0)

    # update g_j
    hidden = T.nnet.sigmoid(T.dot(src_js, self.Wfbs) +
                            T.dot(tar_js, self.Wfbt) +
                            self.B0)
    g_j = T.dot(self.Whb, hidden).dimshuffle('x')

    # update b_j
    g_j = T.concatenate([g_j, self.B], axis=0)
    b_j = T.nnet.softmax(g_j)[0,:]

    return b_j
def T_subspacel1_slow_shrinkage_conv(a, L, lam_sparse, lam_slow, imshp,kshp,featshp,stride=(1,1),small_value=.001):
    """Convolutional variant of the slow / subspace-l1 shrinkage.

    ``a`` is transposed and reshaped to 4-D with shape
    ``(imshp[0], kshp[0], featshp[2], featshp[3])`` (num images, features,
    szy, szx). Features on axis 1 are treated as (even, odd) pairs. The
    slowness term uses differences of the pair amplitude along axis 0.
    ``stride`` is accepted but not used by this function.

    Returns the shrunk features reshaped to 2-D and transposed back.
    """
    featshp = (imshp[0],kshp[0],featshp[2],featshp[3]) # num images, features, szy, szx
    features = T.reshape(T.transpose(a), featshp, ndim=4)

    feat_a = features[:,::2,:,:]
    feat_b = features[:,1::2,:,:]
    amp = T.sqrt(feat_a**2 + feat_b**2 + small_value)

    # slow shrinkage: negated second difference along axis 0 in the
    # interior, one-sided first differences on the two border slices.
    first_diff = amp[1:,:,:,:] - amp[:-1,:,:,:]
    second_diff = first_diff[1:,:,:,:] - first_diff[:-1,:,:,:]
    div = T.zeros_like(amp)
    div = T.set_subtensor(div[1:-1,:,:,:], -second_diff)
    div = T.set_subtensor(div[0,:,:,:], -first_diff[0,:,:,:])
    div = T.set_subtensor(div[-1,:,:,:], first_diff[-1,:,:,:])

    slow_scale = 1 - (lam_slow / L) * (div / amp)
    slow_scale = T.switch(T.gt(slow_scale, 0), slow_scale, 0)
    slow_a = slow_scale * features[:, ::2, :,:]
    slow_b = slow_scale * features[:,1::2, :,:]

    # subspace l1 shrinkage applied on top of the slow-shrunk pairs
    slow_amp = T.sqrt(slow_a ** 2 + slow_b ** 2)
    sparse_scale = 1. - (lam_sparse / L) / slow_amp
    sparse_scale = T.switch(T.gt(sparse_scale, 0.), sparse_scale, 0.)

    # Interleave both halves back along the feature axis.
    subspacel1_prox = T.zeros_like(features)
    subspacel1_prox = T.set_subtensor(subspacel1_prox[:, ::2, :,:], sparse_scale * slow_a)
    subspacel1_prox = T.set_subtensor(subspacel1_prox[:,1::2, :,:], sparse_scale * slow_b)

    flat_shape = (featshp[0], featshp[1]*featshp[2]*featshp[3])
    reshape_subspacel1_prox = T.transpose(T.reshape(subspacel1_prox, flat_shape, ndim=2))
    return reshape_subspacel1_prox
def get_aggregator(self):
    """Build an ``Aggregator`` that accumulates a numerator and a denominator.

    Two shared accumulators are kept. On each accumulation step the current
    value is added to the accumulator, or to zeros of the same shape when
    the ``initialized`` flag is still unset. The readout is the ratio of
    the two accumulators.
    """
    initialized = shared_like(0.)
    numerator_acc = shared_like(self.numerator)
    denominator_acc = shared_like(self.denominator)

    def _running_total(current, accumulator):
        # Zeros with the shape of the new result stand in for the
        # accumulator until the first accumulation has happened.
        zeros = tensor.as_tensor(current).zeros_like()
        return current + ifelse(initialized, accumulator, zeros)

    conditional_update_num = _running_total(self.numerator, numerator_acc)
    conditional_update_den = _running_total(self.denominator,
                                            denominator_acc)

    initialization_updates = [
        (numerator_acc, tensor.zeros_like(numerator_acc)),
        (denominator_acc, tensor.zeros_like(denominator_acc)),
        (initialized, 0.),
    ]
    accumulation_updates = [
        (numerator_acc, conditional_update_num),
        (denominator_acc, conditional_update_den),
        (initialized, 1.),
    ]
    return Aggregator(aggregation_scheme=self,
                      initialization_updates=initialization_updates,
                      accumulation_updates=accumulation_updates,
                      readout_variable=(numerator_acc / denominator_acc))
def castray(ro, rd, shape_params, nprims, width, height):
    """March rays a fixed number of steps and return a normalised depth map.

    Starting from the ray origins ``ro``, ``mapedit`` is evaluated 25 times
    in total; after each evaluation the returned distance is added to the
    accumulated distance along the direction ``rd``. Accumulated distances
    below ``tmax`` are divided by ``tmax``; all others become 0.

    ``tmin``, ``m``, ``steps`` and ``color`` are computed but do not affect
    the returned value.
    """
    tmin = 1.0
    tmax = 20.0
    precis = 0.002
    m = -1.0
    max_num_steps = 25

    def _not_converged(d):
        # 0 where the step distance is already below precis, 1 elsewhere.
        return T.switch(d < precis, T.zeros_like(d), T.ones_like(d))

    # Each step yields a distance d1, d2, ..., dn; what is tracked is the
    # running total d1, d1+d2, ..., i.e. for each ray the distance marched
    # so far, which stops growing once the surface is reached.
    # FIXME, reshape instead of mul by 0
    dists = mapedit(ro + rd * 0, shape_params, nprims, width, height)
    steps = _not_converged(dists)
    accum_dists = T.reshape(dists, (width, height, 1))

    for _ in range(max_num_steps - 1):
        dists = mapedit(ro + rd * accum_dists, shape_params, nprims, width, height)
        steps = steps + _not_converged(dists)
        accum_dists = accum_dists + T.reshape(dists, (width, height, 1))

    last_depth = T.reshape(accum_dists, (width, height))
    depthmap = T.switch(last_depth < tmax, last_depth / tmax, T.zeros_like(last_depth))
    color = 1.0 - steps / float(max_num_steps)
    # Distance marched along ray and delta between last two steps
    return depthmap
def get_celerite_matrices(self, x, diag):
    """Build the ``(a, U, V, P)`` matrices from ``self.coefficients``.

    ``self.coefficients`` unpacks into ``(ar, cr, ac, bc, cc, dc)``. The
    ``x[:, None]`` / ``coef[None, :]`` broadcasting means ``x`` and each
    coefficient are indexed as 1-D here.

    Returns
    -------
    a : ``diag + sum(ar) + sum(ac)``
    U, V : column-wise concatenations of one ``ar`` block and two
        ``ac``/``bc``/``dc`` blocks, one row per element of ``x``
    P : exponential decay over the consecutive differences of ``x``
    """
    x = tt.as_tensor_variable(x)
    diag = tt.as_tensor_variable(diag)
    ar, cr, ac, bc, cc, dc = self.coefficients

    a = diag + tt.sum(ar) + tt.sum(ac)

    # Oscillatory terms shared by U and V.
    phase = dc[None, :] * x[:, None]
    cos_phase = tt.cos(phase)
    sin_phase = tt.sin(phase)
    ac_row = ac[None, :]
    bc_row = bc[None, :]

    U = tt.concatenate((
        # ar broadcast to one row per x value
        ar[None, :] + tt.zeros_like(x)[:, None],
        ac_row * cos_phase + bc_row * sin_phase,
        ac_row * sin_phase - bc_row * cos_phase,
    ), axis=1)

    V = tt.concatenate((
        # ones with one row per x value and one column per ar entry
        tt.zeros_like(ar)[None, :] + tt.ones_like(x)[:, None],
        cos_phase,
        sin_phase,
    ), axis=1)

    dx = x[1:] - x[:-1]
    dx_col = dx[:, None]
    decay_c = tt.exp(-cc[None, :] * dx_col)
    P = tt.concatenate((
        tt.exp(-cr[None, :] * dx_col),
        decay_c,
        decay_c,
    ), axis=1)

    return a, U, V, P
def __call__(self, input_, *xs):
    '''Scale ``input_`` by a baseline computed from the external inputs.

    Maybe unclear: input_ is the variable to be scaled, xs are the actual
    inputs.

    Each ``xs[i]`` is multiplied with the instance attribute ``w<i>``; the
    products are summed and transposed to give ``ids``. ``input_`` is then
    divided by ``ids``.

    Returns ``(outs, updates)`` where ``outs`` is an ``OrderedDict`` with
    keys ``x_c``, ``x_scaled``, ``ids`` and ``ids_c``, and ``updates`` is an
    empty ``theano.OrderedUpdates``.

    Raises ``ValueError`` when the number of ``xs`` differs from
    ``len(self.dims_in)``.
    '''
    updates = theano.OrderedUpdates()

    if len(xs) != len(self.dims_in):
        raise ValueError('Number of (external) inputs for baseline must'
                         ' match parameters')

    # The weights live on the instance as w0, w1, ... in input order.
    # ``range`` rather than ``xrange`` so this also runs on Python 3.
    ws = [self.__dict__['w%d' % i] for i in range(len(xs))]

    ids = T.sum([x.dot(W) for x, W in zip(xs, ws)], axis=0).T
    # Adding zeros yields new graph variables with the same values.
    ids_c = T.zeros_like(ids) + ids
    input_scaled = input_ / ids_c
    input_ = T.zeros_like(input_) + input_

    outs = OrderedDict(
        x_c=input_,
        x_scaled=input_scaled,
        ids=ids,
        ids_c=ids_c
    )

    return outs, updates
def get_output_for(self,net_input,**kwargs):
    """Combine forward and backward messages with the unary potentials.

    With ``unary=True`` in ``kwargs`` the input is returned untouched.
    Otherwise the input's first two axes are swapped so ``theano.scan`` can
    iterate over the leading axis, forward messages are scanned with
    ``self.W`` and backward messages over the reversed sequence with its
    transpose. Their sum with the unary potentials is returned, with the
    log-sum-exp over axis 2 subtracted when ``normalized=True``.

    The result is dimshuffled back to the input's axis order.
    """
    if kwargs.get('unary') == True:
        return net_input

    logger.info('Initializing the messages')
    Wp = self.W
    # Swap the first two axes so the scans iterate over the leading axis.
    unary_sequence = net_input.dimshuffle(1,0,2)

    def _forward_message(unary_t, message, pairwise):
        message = message + unary_t
        return theano_logsumexp(message.dimshuffle(0,1,'x') + pairwise, 1)

    def _backward_message(unary_t, message, pairwise):
        message = message + unary_t
        return theano_logsumexp(message.dimshuffle(0,1,'x') + pairwise.T, 1)

    step_count = unary_sequence.shape[0]-1
    forward_results, _ = theano.scan(
        fn=_forward_message,
        sequences=[unary_sequence],
        outputs_info=T.zeros_like(unary_sequence[0]),
        non_sequences=[Wp],
        n_steps=step_count)
    backward_results, _ = theano.scan(
        fn=_backward_message,
        sequences=[unary_sequence[::-1]],
        outputs_info=T.zeros_like(unary_sequence[0]),
        non_sequences=[Wp],
        n_steps=step_count)

    # The last position gets a zero backward message and the first position
    # a zero forward message.
    backward_results = T.concatenate(
        [backward_results[::-1], T.zeros_like(backward_results[:1])], axis=0)
    forward_results = T.concatenate(
        [T.zeros_like(forward_results[:1]), forward_results], axis=0)

    unnormalized_prob = forward_results + unary_sequence + backward_results
    marginal_results = theano_logsumexp(unnormalized_prob, axis=2)
    normalized_prob = unnormalized_prob - marginal_results.dimshuffle(0,1,'x')

    if kwargs.get('normalized') == True:
        return normalized_prob.dimshuffle(1,0,2)
    return unnormalized_prob.dimshuffle(1,0,2)
def get_aggregator(self):
    """Build an ``Aggregator`` that keeps a running total of ``self.variable``.

    The ``initialized`` flag is reset to zeros on initialization and set to
    ones on every accumulation; while it is unset, zeros shaped like the
    variable are used in place of the accumulator. The readout is the
    accumulator itself.
    """
    initialized = shared_like(0.)
    total_acc = shared_like(self.variable)

    # Stand-in for the accumulator before the first accumulation.
    total_zeros = tensor.as_tensor(self.variable).zeros_like()
    previous_total = ifelse(initialized, total_acc, total_zeros)
    conditional_update_num = self.variable + previous_total

    initialization_updates = [
        (total_acc, tensor.zeros_like(total_acc)),
        (initialized, tensor.zeros_like(initialized)),
    ]
    accumulation_updates = [
        (total_acc, conditional_update_num),
        (initialized, tensor.ones_like(initialized)),
    ]
    return Aggregator(aggregation_scheme=self,
                      initialization_updates=initialization_updates,
                      accumulation_updates=accumulation_updates,
                      readout_variable=(total_acc))
def grad(self, inputs, out_grads):
    """Return ``rolling_grad`` as the gradient w.r.t. ``batch_mean``.

    Side effect: when ``self.update_averages`` is set, ``default_update``
    expressions are attached to ``rolling_mean`` and ``rolling_grad`` (both
    must be shared variables) so they track exponential moving averages of
    ``batch_mean`` and of the incoming gradient with factor ``alpha``.
    Otherwise any existing ``default_update`` on them is removed.

    The gradients w.r.t. the other three inputs are zeros.
    """
    batch_mean, rolling_mean, rolling_grad, alpha = inputs
    out_grad, = out_grads
    rolling_vars = (rolling_mean, rolling_grad)

    if self.update_averages:
        assert treeano.utils.is_shared_variable(rolling_mean)
        assert treeano.utils.is_shared_variable(rolling_grad)
        # HACK this is super hacky and won't work for certain
        # computation graphs
        # TODO make assertion again
        if any(hasattr(var, "default_update") for var in rolling_vars):
            warnings.warn("rolling mean/grad already has updates - "
                          "overwritting. this can be caused by calculating "
                          "the gradient of backprop to the future mean "
                          "multiple times")

        rolling_mean.default_update = (alpha * rolling_mean +
                                       (1 - alpha) * batch_mean)
        rolling_grad.default_update = (alpha * rolling_grad +
                                       (1 - alpha) * out_grad)
    else:
        # HACK remove default_update
        for var in rolling_vars:
            if hasattr(var, "default_update"):
                delattr(var, "default_update")

    return [rolling_grad,
            T.zeros_like(rolling_mean),
            T.zeros_like(rolling_grad),
            T.zeros_like(alpha)]
def generic_compute_Lx_batches(samples, weights, biases, bs, cbs):
    """Compute ``term1 / (bs // cbs) - term2``, accumulating term1 in chunks.

    Parameters
    ----------
    samples : list of 2-D symbolic variables; each is reshaped to
        ``(bs // cbs, cbs, x.shape[1])`` so the scan sees one chunk of
        ``cbs`` rows per step.
    weights, biases : lists of parameters; one zero accumulator is created
        per entry and they are passed on to the term1 / term2 helpers.
    bs, cbs : total batch size and chunk size (``bs // cbs`` scan steps).

    Returns
    -------
    list with one entry per accumulator: element 0 of the scan result
    divided by the number of chunks, minus the matching entry of
    ``generic_compute_Lx_term2``.
    NOTE(review): ``x[0]`` is presumably the final accumulated state under
    the sandbox scan's ``states`` semantics -- confirm.
    """
    tsamples = [x.reshape((bs//cbs, cbs, x.shape[1])) for x in samples]
    # Zero accumulators, with an unbroadcastable leading axis added.
    final_ws = [T.unbroadcast(T.shape_padleft(T.zeros_like(x)),0)
                for x in weights]
    final_bs = [T.unbroadcast(T.shape_padleft(T.zeros_like(x)),0)
                for x in biases]
    n_samples = len(samples)

    def comp_step(*args):
        # args = one chunk per sample, followed by the current accumulators.
        lsamples = args[:n_samples]
        terms1 = generic_compute_Lx_term1(lsamples, weights, biases)
        return [acc + term1 for term1, acc in zip(terms1, args[n_samples:])]

    rvals,_ = theano.sandbox.scan.scan(
        comp_step,
        sequences=tsamples,
        states=final_ws + final_bs,
        n_steps=bs // cbs,
        profile=0,
        mode=theano.Mode(linker='cvm_nogc'),
        flags=['no_optimization']
    )
    accs1 = [x[0]/numpy.float32(bs//cbs) for x in rvals]
    accs2 = generic_compute_Lx_term2(samples,weights,biases)
    return [x - y for x, y in zip(accs1, accs2)]
def compute_Lx_batches(v, g, h, xw_mat, xv_mat, xa, xb, xc, bs, cbs):
    """Compute ``term1 / (bs // cbs) - term2``, accumulating term1 in chunks.

    ``v``, ``g`` and ``h`` are reshaped to ``(bs // cbs, cbs, ncols)`` and
    scanned one chunk at a time; ``compute_Lx_term1`` receives the
    flattened ``xw_mat`` / ``xv_mat`` and its outputs are summed into five
    accumulators that start at zero. Element 0 of each scan result is
    divided by the number of chunks and the matching output of
    ``compute_Lx_term2`` on the full batch is subtracted.
    """
    def _in_chunks(mat):
        return mat.reshape((bs // cbs, cbs, mat.shape[1]))

    def _zero_state(param):
        # Zero accumulator with an unbroadcastable leading axis added.
        return T.unbroadcast(T.shape_padleft(T.zeros_like(param)),0)

    xw = xw_mat.flatten()
    xv = xv_mat.flatten()
    tv = _in_chunks(v)
    tg = _in_chunks(g)
    th = _in_chunks(h)

    initial_states = [_zero_state(p) for p in (xw_mat, xv_mat, xa, xb, xc)]

    def comp_step(lv, lg, lh, acc_w1, acc_v1, acc_a1, acc_b1, acc_c1):
        terms1 = compute_Lx_term1(lv, lg, lh, xw, xv, xa, xb, xc)
        accumulators = [acc_w1, acc_v1, acc_a1, acc_b1, acc_c1]
        return [acc + term1 for term1, acc in zip(terms1, accumulators)]

    rvals,_ = theano.sandbox.scan.scan(
        comp_step,
        sequences=[tv,tg,th],
        states=initial_states,
        n_steps=bs // cbs,
        profile=0,
        mode=theano.Mode(linker='cvm_nogc'),
        flags=['no_optimization']
    )
    accs1 = [x[0]/numpy.float32(bs//cbs) for x in rvals]
    accs2 = compute_Lx_term2(v,g,h,xw,xv,xa,xb,xc)
    return [x - y for x, y in zip(accs1, accs2)]
def reconstruct(self, x, n_samples) :
    """Reconstruct ``x`` through the encoder and decoder and evaluate it.

    With ``n_samples <= 0`` the posterior mean is decoded directly.
    Otherwise ``n_samples`` latent draws are decoded and averaged. In the
    continuous case the decoder returns ``(mu, log_sigma)`` and the result
    is a numpy draw from a multivariate normal built from them; otherwise
    the evaluated decoder output is returned.
    """
    mu, log_sigma = self.encoder(x)

    if n_samples <= 0 :
        y = self.decoder(mu)
    elif self.continuous :
        # Sample from the posterior. The decoder is called once on the
        # mean only to obtain zero accumulators of the right size.
        (y_mu, y_log_sigma) = self.decoder(mu)
        y_mu = T.zeros_like(y_mu)
        y_log_sigma = T.zeros_like(y_log_sigma)
        for _ in range(n_samples) :
            z = reparam_trick(mu, log_sigma, self.srng)
            (new_y_mu, new_y_log_sigma) = self.decoder(z)
            y_mu = y_mu + new_y_mu
            y_log_sigma = y_log_sigma + new_y_log_sigma
        y = (y_mu / n_samples, y_log_sigma / n_samples)
    else :
        # Sample from the posterior and average the decoded outputs.
        y = T.zeros(x.shape)
        for _ in range(n_samples) :
            z = reparam_trick(mu, log_sigma, self.srng)
            y = y + self.decoder(z)
        y = (y / n_samples)

    if not self.continuous :
        return y.eval()

    (y_mu, y_log_sigma) = y
    # exp(log_sigma) ** 2 multiplied element-wise with an identity matrix.
    I = T.eye(y_mu.shape[0])
    cov = (T.pow(T.exp(y_log_sigma), 2)) * I
    return np.random.multivariate_normal(y_mu.eval(), cov.eval())
def get_aggregator(self):
    """Build an ``Aggregator`` that accumulates a numerator and a denominator.

    While the ``initialized`` flag is unset the accumulators take the
    current value as-is; afterwards the current value is added to the
    accumulator. The readout is the ratio of the two accumulators.
    """
    initialized = shared_like(0.)
    numerator_acc = shared_like(self.numerator)
    denominator_acc = shared_like(self.denominator)

    def _next_total(current, accumulator):
        # First accumulation: start from the current value alone.
        return ifelse(initialized, current + accumulator, current)

    conditional_update_num = _next_total(self.numerator, numerator_acc)
    conditional_update_den = _next_total(self.denominator, denominator_acc)

    initialization_updates = [
        (numerator_acc, tensor.zeros_like(numerator_acc)),
        (denominator_acc, tensor.zeros_like(denominator_acc)),
        (initialized, 0.),
    ]
    accumulation_updates = [
        (numerator_acc, conditional_update_num),
        (denominator_acc, conditional_update_den),
        (initialized, 1.),
    ]
    return Aggregator(aggregation_scheme=self,
                      initialization_updates=initialization_updates,
                      accumulation_updates=accumulation_updates,
                      readout_variable=(numerator_acc / denominator_acc))
def compute_cost_log_in_parallel(original_rnn_outputs, labels, func, x_ends, y_ends):
    """Log-domain forward recursion over label sequences, mapped per example.

    ``func`` is the reduction that combines log-probabilities (called with
    an axis inside the scan and without one in the final cost).
    ``x_ends`` / ``y_ends`` give, per example, the end positions used to
    read off the final forward values. Presumably this is a CTC-style cost
    -- confirm with the caller.

    Returns the first output of ``theano.map`` over the per-example costs.
    """
    # log(0) = -inf wherever the two-step transition is not allowed: the
    # label is 0 or equals the label two positions away.
    skip_forbidden = T.or_(T.eq(labels, T.zeros_like(labels)),
                           T.eq(labels, shift_matrix(labels, 2)))
    mask = T.log(1 - skip_forbidden)

    # Only the first label position starts with log-probability 0.
    initial_state = T.log(T.zeros_like(labels))
    initial_state = T.set_subtensor(initial_state[:,0], 0)

    def select_probabilities(rnn_outputs, label):
        return rnn_outputs[:,label]

    rnn_outputs, _ = theano.map(select_probabilities, [original_rnn_outputs, labels])
    rnn_outputs = T.log(rnn_outputs.dimshuffle((1,0,2)))

    def forward_step(probabilities, last_probabilities):
        # Candidates: stay, advance by one, advance by two (masked).
        candidates = T.stack(
            last_probabilities + probabilities,
            log_shift_matrix(last_probabilities, 1) + probabilities,
            log_shift_matrix(last_probabilities, 2) + probabilities + mask,
        )
        return func(candidates, 0)

    forward_probabilities, _ = theano.scan(fn = forward_step,
                                           sequences = rnn_outputs,
                                           outputs_info = initial_state)
    forward_probabilities = forward_probabilities.dimshuffle((1,0,2))

    def compute_cost(forward_probabilities, x_end, y_end):
        return -func(forward_probabilities[x_end-1,y_end-2:y_end])

    per_example_costs = theano.map(compute_cost, [forward_probabilities, x_ends, y_ends])
    return per_example_costs[0]
def lstm(mask, state_in, t_params, n_dim_in, n_dim_out, prefix, one_step=False, init_h=None):
    '''
    Long Short-Term Memory (LSTM) layer.

    Creates the ``W``, ``U`` and ``b`` parameters under ``prefix`` (four
    ortho-weight blocks each for ``W`` and ``U``, zeros for ``b``),
    registers them through ``init_t_params`` and applies the recurrence to
    ``state_in``. With ``one_step`` a single step is applied; otherwise the
    step is scanned over ``mask`` and ``state_in``. Where ``mask`` is 0 the
    previous cell and hidden values are carried over.

    Returns the hidden state(s).
    '''
    def _step(_mask, _state_in, _prev_h, _prev_c):
        _pre_act = tensor.dot(_prev_h, t_params[_concat(prefix, 'U')]) + _state_in

        # Slices 0..3 of the pre-activation: input, forget, output gates
        # and the cell candidate.
        _gate_i = tensor.nnet.sigmoid(_slice(_pre_act, 0, n_dim_out))
        _gate_f = tensor.nnet.sigmoid(_slice(_pre_act, 1, n_dim_out))
        _gate_o = tensor.nnet.sigmoid(_slice(_pre_act, 2, n_dim_out))
        _candidate = tensor.tanh(_slice(_pre_act, 3, n_dim_out))

        _keep = _mask[:, None]
        _skip = (1. - _mask)[:, None]

        _next_c = _gate_f * _prev_c + _gate_i * _candidate
        _next_c = _keep * _next_c + _skip * _prev_c

        _next_h = _gate_o * tensor.tanh(_next_c)
        _next_h = _keep * _next_h + _skip * _prev_h

        return _next_h, _next_c

    params = OrderedDict()
    params[_concat(prefix, 'W')] = numpy.concatenate(
        [ortho_weight(n_dim_in, n_dim_out) for _ in range(4)], 1)
    params[_concat(prefix, 'U')] = numpy.concatenate(
        [ortho_weight(n_dim_out, n_dim_out) for _ in range(4)], 1)
    params[_concat(prefix, 'b')] = numpy.zeros((4 * n_dim_out,), config.floatX)
    init_t_params(params, t_params)

    # Input projection for all steps at once.
    state_in = (tensor.dot(state_in, t_params[_concat(prefix, 'W')]) +
                t_params[_concat(prefix, 'b')])
    if init_h is None:
        init_h = tensor.alloc(to_floatX(0.), state_in.shape[-2], n_dim_out)

    if one_step:
        state_out, _ = _step(mask, state_in, init_h, tensor.zeros_like(init_h))
        return state_out

    [state_out, _], _ = theano.scan(_step, [mask, state_in],
                                    [init_h, tensor.zeros_like(init_h)])
    return state_out
def _construct_compute_fe_terms(self):
    """
    Build a function estimating the two variational free-energy terms.

    The returned callable ``fe_term_estimator(X, sample_count)`` calls a
    compiled one-sample estimate ``sample_count`` times and returns
    ``[mean_nll, mean_kld]``: the negated average log-likelihood and the
    average posterior KL-divergence, both as flat arrays with one entry per
    row of ``X``.
    """
    # Symbolic data input; the control and mask inputs are fixed to zeros.
    Xd = T.matrix()
    Xc = T.zeros_like(Xd)
    Xm = T.zeros_like(Xd)

    # Log-likelihood term, chosen by the observation type.
    if self.x_type == 'bernoulli':
        ll_term = log_prob_bernoulli(self.x, self.xg)
    else:
        ll_term = log_prob_gaussian2(self.x, self.xg,
                                     log_vars=self.bounded_logvar)

    # KL term, summed over axis 1.
    all_klds = gaussian_kld(self.q_z_given_x.output_mean,
                            self.q_z_given_x.output_logvar,
                            self.prior_mean, self.prior_logvar)
    kld_term = T.sum(all_klds, axis=1)

    # One-sample free-energy estimate.
    fe_term_sample = theano.function(
        inputs=[Xd],
        outputs=[ll_term, kld_term],
        givens={self.Xd: Xd, self.Xc: Xc, self.Xm: Xm})

    def fe_term_estimator(X, sample_count):
        row_count = X.shape[0]
        ll_sum = np.zeros((row_count,))
        kld_sum = np.zeros((row_count,))
        for _ in range(sample_count):
            result = fe_term_sample(X)
            ll_sum = ll_sum + result[0].ravel()
            kld_sum = kld_sum + result[1].ravel()
        draws = float(sample_count)
        mean_nll = -ll_sum / draws
        mean_kld = kld_sum / draws
        return [mean_nll, mean_kld]

    return fe_term_estimator
def grad(self, inputs, output_grads):
    """Gradient of the bidirectional LSTM op w.r.t. its eight inputs.

    Parameters
    ----------
    inputs : ``[Z_f, Z_b, V_f, V_b, c_f, c_b, i_f, i_b]`` (positions 0-7).
    output_grads : ``[DY_f, DY_b, DH_f, DH_b, Dd_f, Dd_b]``. ``DH_f`` and
        ``DH_b`` are not used. Disconnected ``DY_*`` / ``Dd_*`` gradients
        are replaced by zeros shaped like ``Z_*`` / ``c_*``.

    Returns
    -------
    ``[DZ_f, DZ_b, DV_f, DV_b, Dc_f, Dc_b, Di_f, Di_b]``; the last two are
    undefined-gradient placeholders for the index inputs.
    """
    Z_f, Z_b, V_f, V_b, c_f, c_b, i_f, i_b = inputs
    DY_f, DY_b, DH_f, DH_b, Dd_f, Dd_b = output_grads

    # The forward op must only be computed once. To let theano merge the
    # nodes, the raw variables are extracted from before the conversion to
    # contiguous gpu arrays.
    # NOTE(review): this relies on each input's owner chain having a fixed
    # depth (two levels for Z/c/i, one for V) -- see the original TODO.
    Z_f_raw = Z_f.owner.inputs[0].owner.inputs[0]
    Z_b_raw = Z_b.owner.inputs[0].owner.inputs[0]
    #TODO!!!
    V_f_raw = V_f.owner.inputs[0]
    V_b_raw = V_b.owner.inputs[0]
    c_f_raw = c_f.owner.inputs[0].owner.inputs[0]
    c_b_raw = c_b.owner.inputs[0].owner.inputs[0]
    i_f_raw = i_f.owner.inputs[0].owner.inputs[0]
    i_b_raw = i_b.owner.inputs[0].owner.inputs[0]

    Y_f, Y_b, H_f, H_b, d_f, d_b = BLSTMOpInstance(Z_f_raw, Z_b_raw, V_f_raw, V_b_raw, c_f_raw, c_b_raw, i_f_raw, i_b_raw)

    # Disconnected output gradients contribute nothing: use zeros.
    if isinstance(DY_f.type, theano.gradient.DisconnectedType):
        DY_f = T.zeros_like(Z_f)
    if isinstance(DY_b.type, theano.gradient.DisconnectedType):
        DY_b = T.zeros_like(Z_b)
    if isinstance(Dd_f.type, theano.gradient.DisconnectedType):
        Dd_f = T.zeros_like(c_f)
    if isinstance(Dd_b.type, theano.gradient.DisconnectedType):
        Dd_b = T.zeros_like(c_b)

    DZ_f, DZ_b, DV_f, DV_b, Dc_f, Dc_b = BLSTMOpGradNoInplaceInstance(V_f, V_b, c_f, c_b, i_f, i_b, Dd_f, Dd_b, DY_f, DY_b, Y_f, Y_b, H_f, H_b)

    # The index inputs i_f and i_b sit at positions 6 and 7 of ``inputs``;
    # the undefined-gradient placeholders must name those positions.
    Di_f = theano.gradient.grad_undefined(self, 6, inputs[6], 'cannot diff w.r.t. index')
    Di_b = theano.gradient.grad_undefined(self, 7, inputs[7], 'cannot diff w.r.t. index')

    return [DZ_f, DZ_b, DV_f, DV_b, Dc_f, Dc_b, Di_f, Di_b]
def build_gsn(self, add_noise, hiddens=None, reverse=False):
    """Build the GSN graph by running ``self.walkbacks`` layer updates.

    When ``hiddens`` is not given it is initialised to the (optionally
    corrupted) visible input followed by one all-zero layer per weight
    matrix: every weight with tied weights, only the first ``self.layers``
    weights otherwise.

    Returns ``(p_X_chain, hiddens)``. ``p_X_chain`` starts empty and both
    lists are handed to ``self.update_layers`` on every walkback.
    """
    p_X_chain = []

    # Whether or not to corrupt the visible input X
    X_init = self.input_noise(self.X) if add_noise else self.X

    # If no input hiddens were provided, initialize with zeros.
    if hiddens is None:
        hiddens = [X_init]
        if self.tied_weights:
            layer_weights = self.weights_list
        else:
            layer_weights = self.weights_list[:self.layers]
        for w in layer_weights:
            # Zeros shaped like the projection of the layer below.
            hiddens.append(T.zeros_like(T.dot(hiddens[-1], w)))

    # The layer update scheme
    log.info("Building the GSN graph : %s updates", str(self.walkbacks))
    for step in range(self.walkbacks):
        log.debug("GSN Walkback %s/%s", str(step + 1), str(self.walkbacks))
        self.update_layers(hiddens, p_X_chain, add_noise, reverse=reverse)

    return p_X_chain, hiddens
def group_div(X, W, H, beta, params):
    """Compute beta divergence D(X|WH), intra-class distance and
    intra-session distance for a particular (class, session) couple [1]_.

    Parameters
    ----------
    X : Theano tensor
        data
    W : Theano tensor
        Bases
    H : Theano tensor
        activation matrix
    beta : Theano scalar
    params : Theano tensor
        Matrix of parameter related to class/session.
            :params[0][0]: index for the (class, session) couple
            :params[1][0]: number of vector basis related to class
            :params[1][1]: number of vector basis related to session
            :params[2]: weight on the class/session similarity constraints
            :params[3]: sessions in which class c appears
            :params[4]: classes present in session s

    Returns
    -------
    cost : Theano scalar
        total cost
    div : Theano scalar
        beta divergence D(X|WH)
    sum_cls : Theano scalar
        intra-class distance
    sum_ses : Theano scalar
        intra-session distance"""
    ind = params[0][0]
    k_cls = params[1][0]
    k_ses = params[1][1]
    lambdas = params[2]
    Sc = params[3]
    Cs = params[4]

    def _add_session_distance(other, running_total):
        # Distance between the session-related bases of the two couples.
        return running_total + eucl_dist(W[ind, :, k_cls : k_cls + k_ses],
                                         W[other, :, k_cls : k_cls + k_ses])

    def _add_class_distance(other, running_total):
        # Distance between the class-related bases of the two couples.
        return running_total + eucl_dist(W[ind, :, 0:k_cls],
                                         W[other, :, 0:k_cls])

    res_ses, _ = theano.scan(
        fn=_add_session_distance,
        outputs_info=T.zeros_like(beta),
        sequences=Cs,
    )
    # The accumulated distance is only used when the first entry is > 0.
    sum_ses = ifelse(T.gt(Cs[0], 0), res_ses[-1], T.zeros_like(beta))

    res_cls, _ = theano.scan(
        fn=_add_class_distance,
        outputs_info=T.zeros_like(beta),
        sequences=Sc,
    )
    sum_cls = ifelse(T.gt(Sc[0], 0), res_cls[-1], T.zeros_like(beta))

    betaDiv = beta_div(X, W[ind].T, H, beta)

    cost = lambdas[0] * sum_cls + lambdas[1] * sum_ses + betaDiv
    return cost, betaDiv, sum_cls, sum_ses
def sym_gradients_new(self, X):
    """Build the symbolic log-density and hand-derived parameter gradients.

    Two ``theano.scan`` passes are used. The first walks forward over the
    rows of ``X`` and ``self.W`` and accumulates the hidden pre-activations.
    The second walks backwards (``go_backwards=True``), undoing that
    accumulation one step at a time. For every step it computes the mixture
    log-density of that step and the gradients with respect to the
    per-step parameters.

    Only the "sigmoid" and "RLU" nonlinearities are supported (asserted).

    Parameters
    ----------
    X : Theano matrix
        Input data; scanned along its first axis together with ``self.W``.
        The original inline shape comments suggest the second axis is the
        batch -- TODO confirm against the caller.

    Returns
    -------
    tuple
        ``(ps[-1], gW, gb_alpha, gV_alpha, gb_mu, gV_mu, gb_sigma,
        gV_sigma, gfact, updates)``: the last entry of the accumulated
        log-density sequence, the per-step gradients (reordered to match
        the forward order of the sequences), and the merged scan updates.
    """
    non_linearity_name = self.parameters["nonlinearity"].get_name()
    assert (non_linearity_name == "sigmoid" or non_linearity_name == "RLU")
    # First element is different (it is predicted from the bias only)
    init_a = T.zeros_like(T.dot(X.T, self.W))  # BxH
    init_x = T.ones_like(X[0])

    def a_i_given_a_im1(x, w, a_prev, x_prev):
        # Add the outer product of the previous input and the current
        # weight row to the running activation; pass x on as the next
        # step's x_prev.
        a = a_prev + T.dot(T.shape_padright(x_prev, 1), T.shape_padleft(w, 1))
        return (a, x)

    ([As, _], updates) = theano.scan(a_i_given_a_im1,
                                     sequences=[X, self.W],
                                     outputs_info=[init_a, init_x])
    top_activations = As[-1]
    # Xs_m1 is X shifted by one step along the first axis, with the first
    # row replaced by ones (matching init_x above).
    Xs_m1 = T.set_subtensor(X[1:, :], X[0:-1, :])
    Xs_m1 = T.set_subtensor(Xs_m1[0, :], 1)

    # Reconstruct the previous activations and calculate (for that visible
    # dimension) the density and all the gradients
    def density_and_gradients(x_i, x_im1, w_i, V_alpha, b_alpha, V_mu, b_mu,
                              V_sigma, b_sigma, activation_factor, a_i,
                              lp_accum, dP_da_ip1):
        B = T.cast(x_i.shape[0], theano.config.floatX)
        pot = a_i * activation_factor
        h = self.nonlinearity(pot)  # BxH
        z_alpha = T.dot(h, V_alpha) + T.shape_padleft(b_alpha)
        z_mu = T.dot(h, V_mu) + T.shape_padleft(b_mu)
        z_sigma = T.dot(h, V_sigma) + T.shape_padleft(b_sigma)
        Alpha = T.nnet.softmax(z_alpha)  # BxC
        Mu = z_mu  # BxC
        Sigma = T.exp(z_sigma)  # BxC
        # Per-component log-density: -log(2*Sigma) - |Mu - x| / Sigma,
        # i.e. the log-pdf of a Laplace distribution with location Mu and
        # scale Sigma.
        Phi = -T.log(
            2 * Sigma) - T.abs_(Mu - T.shape_padright(x_i, 1)) / Sigma
        # Weighted component log-densities, floored at -100.
        wPhi = T.maximum(Phi + T.log(Alpha), constantX(-100.0))
        lp_current = log_sum_exp(wPhi)
        # lp_current_sum = T.sum(lp_current)
        # Pi: wPhi normalised by the log-sum-exp (component weights given
        # x_i).
        Pi = T.exp(wPhi - T.shape_padright(lp_current, 1))  # #
        dp_dz_alpha = Pi - Alpha  # BxC
        # dp_dz_alpha = T.grad(lp_current_sum, z_alpha)
        gb_alpha = dp_dz_alpha.mean(0, dtype=theano.config.floatX)  # C
        gV_alpha = T.dot(h.T, dp_dz_alpha) / B  # HxC
        # dp_dz_mu = T.grad(lp_current_sum, z_mu)
        dp_dz_mu = Pi * T.sgn(T.shape_padright(x_i, 1) - Mu) / Sigma
        # dp_dz_mu = dp_dz_mu * Sigma
        gb_mu = dp_dz_mu.mean(0, dtype=theano.config.floatX)
        gV_mu = T.dot(h.T, dp_dz_mu) / B
        # dp_dz_sigma = T.grad(lp_current_sum, z_sigma)
        dp_dz_sigma = Pi * (T.abs_(T.shape_padright(x_i, 1) - Mu) / Sigma - 1)
        gb_sigma = dp_dz_sigma.mean(0, dtype=theano.config.floatX)
        gV_sigma = T.dot(h.T, dp_dz_sigma) / B
        dp_dh = T.dot(dp_dz_alpha, V_alpha.T) + T.dot(
            dp_dz_mu, V_mu.T) + T.dot(dp_dz_sigma, V_sigma.T)  # BxH
        # Derivative through the nonlinearity; only these two names can
        # occur because of the assert at the top of the method.
        if non_linearity_name == "sigmoid":
            dp_dpot = dp_dh * h * (1 - h)
        elif non_linearity_name == "RLU":
            dp_dpot = dp_dh * (pot > 0)
        gfact = (dp_dpot * a_i).sum(1).mean(
            0, dtype=theano.config.floatX)  # 1
        dP_da_i = dP_da_ip1 + dp_dpot * activation_factor  # BxH
        gW = T.dot(T.shape_padleft(x_im1, 1), dP_da_i).flatten() / B
        # First output removes this step's contribution from the
        # activation, which is what a_i_given_a_im1 added going forward.
        return (a_i - T.dot(T.shape_padright(x_im1, 1),
                            T.shape_padleft(w_i, 1)),
                lp_accum + lp_current, dP_da_i, gW, gb_alpha, gV_alpha,
                gb_mu, gV_mu, gb_sigma, gV_sigma, gfact)

    p_accum = T.zeros_like(X[0])
    dP_da_ip1 = T.zeros_like(top_activations)
    ([
        _, ps, _, gW, gb_alpha, gV_alpha, gb_mu, gV_mu, gb_sigma, gV_sigma,
        gfact
    ], updates2) = theano.scan(density_and_gradients,
                               go_backwards=True,
                               sequences=[
                                   X, Xs_m1, self.W, self.V_alpha,
                                   self.b_alpha, self.V_mu, self.b_mu,
                                   self.V_sigma, self.b_sigma,
                                   self.activation_rescaling
                               ],
                               outputs_info=[
                                   top_activations, p_accum, dP_da_ip1, None,
                                   None, None, None, None, None, None, None
                               ])
    # scan with go_backwards returns the matrices in the order they were
    # created, so we have to reverse the order of the rows
    gW = gW[::-1, :]
    gb_alpha = gb_alpha[::-1, :]
    gV_alpha = gV_alpha[::-1, :, :]
    gb_mu = gb_mu[::-1, :]
    gV_mu = gV_mu[::-1, :, :]
    gb_sigma = gb_sigma[::-1, :]
    gV_sigma = gV_sigma[::-1, :, :]
    gfact = gfact[::-1]
    # Merges in place; the call itself returns None, so its result is not
    # assigned.
    updates.update(updates2)  # Returns None
    return (ps[-1], gW, gb_alpha, gV_alpha, gb_mu, gV_mu, gb_sigma, gV_sigma,
            gfact, updates)
def test_kl_equivalence(self):
    """Check that S3C and the PD-DBM report the same partial KL divergence.

    Both models are evaluated on the same variational parameters; the
    truncated KL plus the log partition function must agree numerically.
    """
    # This is a tricky task. The full KL-divergence is not tractable, but
    # this is the quantity that's known to be the same for the two models
    # (since the PDDBM should have 0 KL-divergence from g, since its
    # weights are fixed to 0). The quantity we monitor inside the models is
    # the "truncated KL divergence", the portion that depends on the
    # variational parameters. In this case (S3C / PD-DBM with DBM weights
    # fixed to 0) the partition function is also tractable, so we can
    # include the terms that depend on the partition function. Fortunately
    # this is enough of the KL divergence to guarantee that the quantity is
    # the same for both models. There's another term that depends on P(v)
    # which is still intractable but g has no effect on P(v) in this case
    # since the DBM weights are fixed to 0.
    #
    # Let Z represent all latent vars, V all visible vars
    # KL(Q(Z)||P(Z|v))
    #   = \sum_z Q(z) log Q(z) / P(z | v)
    #   = \sum_z Q(z) log Q(z) - \sum_z Q(z) log P(z | v)
    #   = - H_Q(Z) - \sum_z Q(z) log P(z,v) + \sum_z Q(z) log P(v)
    #   = - H_Q(Z) - \sum_z Q(z) log exp(-E(z,v))/Z + log P(v)
    #   = - H_Q(Z) - \sum_z Q(z) log exp(-E(z,v)) + \sum_z Q(z) log Z
    #     + log P(v)
    #   = - H_Q(Z) + \sum_z Q(z) E(z,v) + log Z + log P(v)
    #   = - H_Q(Z) + E_{z\sim Q}[E(z,v)] + log Z + log P(v)
    model = self.model
    inference = self.inference_procedure
    s3c_e_step = self.e_step
    visible = self.X

    assert visible.shape[0] == self.m

    # Random variational parameters shared by both models. H is drawn
    # before S so the RNG is consumed in the same order as before.
    H = np.cast[config.floatX](model.rng.uniform(0., 1., (self.m, self.N)))
    S = np.cast[config.floatX](model.rng.uniform(-5., 5., (self.m, self.N)))
    G = np.cast[config.floatX](
        broadcast(sigmoid(model.dbm.rbms[0].bias_hid.get_value()), self.m))

    def symbolic_input(name, test_value):
        # Symbolic matrix carrying a test value.
        var = T.matrix(name=name)
        var.tag.test_value = test_value
        return var

    H_var = symbolic_input('H_var', H)
    S_var = symbolic_input('S_var', S)
    G_var = symbolic_input('G_var', G)

    # PD-DBM side.
    dbm_obs = {
        'H_hat': H_var,
        'S_hat': S_var,
        'var_s0_hat': inference.infer_var_s0_hat(),
        'var_s1_hat': inference.infer_var_s1_hat(),
        'G_hat': (G_var, ),
    }
    dbm_trunc_kl = inference.truncated_KL(V=visible, obs=dbm_obs).mean()
    # just the part related to G (check that it all comes out to 0)
    # dbm_trunc_kl = - entropy_binary_vector( G_var ).mean() - T.dot(G_var.mean(axis=0),self.model.dbm.rbms[0].bias_hid)
    assert len(dbm_trunc_kl.type.broadcastable) == 0

    # S3C side.
    s3c_sigma0 = s3c_e_step.infer_var_s0_hat()
    s3c_Sigma1 = s3c_e_step.infer_var_s1_hat()
    # Kept from the original; the result is not used below.
    s3c_mu0 = T.zeros_like(self.s3c.mu)
    s3c_obs = {
        'H_hat': H_var,
        'S_hat': S_var,
        'var_s0_hat': s3c_sigma0,
        'var_s1_hat': s3c_Sigma1,
    }
    s3c_trunc_kl = s3c_e_step.truncated_KL(V=visible, obs=s3c_obs)

    dbm_log_z = model.s3c.log_partition_function() \
        + T.nnet.softplus(model.dbm.rbms[0].bias_hid).sum()
    # just the part related to G (check that it all comes out to 0)
    # dbm_log_partition_function = T.nnet.softplus(self.model.dbm.rbms[0].bias_hid).sum()
    s3c_log_z = self.s3c.log_partition_function()

    s3c_partial_kl = s3c_trunc_kl.mean() + s3c_log_z
    assert len(s3c_partial_kl.type.broadcastable) == 0
    dbm_partial_kl = dbm_trunc_kl + dbm_log_z

    evaluate = function([H_var, S_var, G_var],
                        (s3c_partial_kl, dbm_partial_kl))
    s3c_partial_kl, dbm_partial_kl = evaluate(H, S, G)

    print(s3c_partial_kl)
    print(dbm_partial_kl)

    assert np.allclose(s3c_partial_kl, dbm_partial_kl)
def SEIR(
    lambda_t_log,
    pr_beta_I_begin=100,
    pr_beta_new_E_begin=50,
    pr_median_mu=1 / 8,
    pr_mean_median_incubation=4,
    pr_sigma_median_incubation=1,
    sigma_incubation=0.4,
    pr_sigma_mu=0.2,
    model=None,
    return_all=False,
    save_all=False,
    name_median_incubation="median_incubation",
):
    r"""
    Implements a model similar to the susceptible-exposed-infected-recovered
    model. Instead of an exponentially decaying incubation period, the length
    of the period is lognormal distributed. The complete equation is:

    .. math::

        E_{\text{new}}(t) &= \lambda_t I(t-1) \frac{S(t-1)}{N} \\
        S(t) &= S(t-1) - E_{\text{new}}(t) \\
        I_\text{new}(t) &= \sum_{k=1}^{10} \beta(k) E_{\text{new}}(t-k) \\
        I(t) &= I(t-1) + I_{\text{new}}(t) - \mu I(t-1) \\
        \beta(k) & = P(k) \sim LogNormal(\text{log}(d_{\text{incubation}}), \text{sigma\_incubation})

    The recovery rate :math:`\mu` and the incubation period is the same for
    all regions and follow respectively:

    .. math::

        P(\mu) &\sim LogNormal(\text{log(pr\_median\_mu), pr\_sigma\_mu}) \\
        P(d_{\text{incubation}}) &\sim Normal(\text{pr\_mean\_median\_incubation, pr\_sigma\_median\_incubation})

    The initial number of infected and newly exposed differ for each region
    and follow prior :class:`~pymc3.distributions.continuous.HalfCauchy`
    distributions:

    .. math::

        E(t) &\sim HalfCauchy(\text{pr\_beta\_new\_E\_begin}) \:\: \text{ for} \: t \in \{-9, -8, ..., 0\}\\
        I(0) &\sim HalfCauchy(\text{pr\_beta\_I\_begin}).

    Parameters
    ----------
    lambda_t_log : :class:`~theano.tensor.TensorVariable`
        time series of the logarithm of the spreading rate, 1 or
        2-dimensional. If 2-dimensional, the first dimension is time.
    pr_beta_I_begin : float or array_like
        Prior beta of the
        :class:`~pymc3.distributions.continuous.HalfCauchy` distribution of
        :math:`I(0)`.
    pr_beta_new_E_begin : float or array_like
        Prior beta of the
        :class:`~pymc3.distributions.continuous.HalfCauchy` distribution of
        :math:`E(0)`.
    pr_median_mu : float or array_like
        Prior for the median of the
        :class:`~pymc3.distributions.continuous.Lognormal` distribution of
        the recovery rate :math:`\mu`.
    pr_mean_median_incubation :
        Prior mean of the :class:`~pymc3.distributions.continuous.Normal`
        distribution of the median incubation delay
        :math:`d_{\text{incubation}}`. Defaults to 4 days [Nishiura2020]_,
        which is the median serial interval (the important measure here is
        not exactly the incubation period, but the delay until a person
        becomes infectious which seems to be about 1 day earlier as showing
        symptoms).
    pr_sigma_median_incubation :
        Prior sigma of the :class:`~pymc3.distributions.continuous.Normal`
        distribution of the median incubation delay
        :math:`d_{\text{incubation}}`. Default is 1 day.
    sigma_incubation :
        Scale parameter of the
        :class:`~pymc3.distributions.continuous.Lognormal` distribution of
        the incubation time/ delay until infectiousness. The default is set
        to 0.4, which is about the scale found in [Nishiura2020]_,
        [Lauer2020]_.
    pr_sigma_mu : float or array_like
        Prior for the sigma of the lognormal distribution of recovery rate
        :math:`\mu`.
    model : :class:`Cov19Model`
        if none, it is retrieved from the context
    return_all : bool
        if True, returns ``new_I_t``, ``new_E_t``, ``I_t``, ``S_t``
        otherwise returns only ``new_I_t``
    save_all : bool
        if True, saves ``new_I_t``, ``new_E_t``, ``I_t``, ``S_t`` in the
        trace, otherwise it saves only ``new_I_t``
    name_median_incubation : str
        The name under which the median incubation time is saved in the
        trace

    Returns
    -------
    new_I_t : :class:`~theano.tensor.TensorVariable`
        time series of the number daily newly infected persons.
    new_E_t : :class:`~theano.tensor.TensorVariable`
        time series of the number daily newly exposed persons. (if
        return_all set to True)
    I_t : :class:`~theano.tensor.TensorVariable`
        time series of the infected (if return_all set to True)
    S_t : :class:`~theano.tensor.TensorVariable`
        time series of the susceptible (if return_all set to True)

    References
    ----------
    .. [Nishiura2020] Nishiura, H.; Linton, N. M.; Akhmetzhanov, A. R.
        Serial Interval of Novel Coronavirus (COVID-19) Infections.
        Int. J. Infect. Dis. 2020, 93, 284–286.
        https://doi.org/10.1016/j.ijid.2020.02.060.
    .. [Lauer2020] Lauer, S. A.; Grantz, K. H.; Bi, Q.; Jones, F. K.;
        Zheng, Q.; Meredith, H. R.; Azman, A. S.; Reich, N. G.; Lessler, J.
        The Incubation Period of Coronavirus Disease 2019 (COVID-19) From
        Publicly Reported Confirmed Cases: Estimation and Application.
        Ann Intern Med 2020. https://doi.org/10.7326/M20-0504.
    """
    model = modelcontext(model)

    # Build prior distributions:
    # --------------------------

    # Prior distribution of recovery rate mu
    mu = pm.Lognormal(name="mu", mu=np.log(pr_median_mu), sigma=pr_sigma_mu)

    # Total number of people in population
    N = model.N_population

    # Number of regions: an empty tuple for a 1-dimensional simulation,
    # otherwise the second entry of the simulation shape.
    num_regions = () if model.sim_ndim == 1 else model.sim_shape[1]

    # Prior distributions of starting populations (exposed, infectious,
    # susceptibles). We choose to consider the transitions of newly exposed
    # people of the last 10 days.
    exposed_shape = 11 if num_regions == () else (11, num_regions)
    new_E_begin = pm.HalfCauchy(
        name="new_E_begin", beta=pr_beta_new_E_begin, shape=exposed_shape
    )
    I_begin = pm.HalfCauchy(
        name="I_begin", beta=pr_beta_I_begin, shape=num_regions
    )
    S_begin = N - I_begin - pm.math.sum(new_E_begin, axis=0)

    lambda_t = tt.exp(lambda_t_log)
    new_I_0 = tt.zeros_like(I_begin)

    median_incubation = pm.Normal(
        name_median_incubation,
        mu=pr_mean_median_incubation,
        sigma=pr_sigma_median_incubation,
    )

    # Choose transition rates (E to I) according to incubation period
    # distribution. With regions, the day axis gets a trailing axis.
    delay_days = np.arange(1, 11)
    if num_regions:
        delay_days = delay_days[:, None]
    beta = mh.tt_lognormal(
        delay_days, tt.log(median_incubation), sigma_incubation
    )

    # Runs SEIR model:
    def next_day(
        lambda_t, S_prev,
        nE1, nE2, nE3, nE4, nE5, nE6, nE7, nE8, nE9, nE10,
        I_prev, _, mu, beta, N,
    ):
        # People newly exposed today, driven by yesterday's I and S.
        exposed_today = lambda_t / N * I_prev * S_prev
        susceptible = S_prev - exposed_today

        # Newly infectious: exposures of the previous 10 days weighted by
        # the delay distribution, summed left to right.
        exposure_history = (nE1, nE2, nE3, nE4, nE5, nE6, nE7, nE8, nE9, nE10)
        infected_today = beta[0] * exposure_history[0]
        for lag in range(1, 10):
            infected_today = infected_today + beta[lag] * exposure_history[lag]

        infected = I_prev + infected_today - mu * I_prev
        infected = tt.clip(infected, 0, N)  # for stability
        susceptible = tt.clip(susceptible, 0, N)
        return susceptible, exposed_today, infected, infected_today

    # theano scan returns two tuples, first one containing a time series of
    # what we give in outputs_info : S, E's, I, new_I
    exposed_taps = dict(
        initial=new_E_begin,
        taps=[-1, -2, -3, -4, -5, -6, -7, -8, -9, -10],
    )
    outputs, _ = theano.scan(
        fn=next_day,
        sequences=[lambda_t],
        outputs_info=[S_begin, exposed_taps, I_begin, new_I_0],
        non_sequences=[mu, beta, N],
    )
    S_t, new_E_t, I_t, new_I_t = outputs

    pm.Deterministic("new_I_t", new_I_t)
    if save_all:
        for label, series in (("S_t", S_t), ("I_t", I_t), ("new_E_t", new_E_t)):
            pm.Deterministic(label, series)

    if return_all:
        return new_I_t, new_E_t, I_t, S_t
    return new_I_t
def _recog_exprs(self, inpt):
    """Return the expressions of the recognition model.

    Collects the per-layer recognition parameters from
    ``self.parameters.recog``, hands them to ``vpbrnn.exprs`` and adds two
    entries to the resulting dict: ``'inpt'`` and ``'shortcut'``.
    """
    P = self.parameters.recog
    n_layers = len(self.n_hiddens_recog)

    def per_layer(template, count):
        # One parameter per layer, looked up by its numbered name.
        return [getattr(P, template % i) for i in range(count)]

    def per_layer_var(template):
        # Squared parameter plus a small constant, so it is never zero.
        return [getattr(P, template % i) ** 2 + 1e-4
                for i in range(n_layers)]

    hidden_to_hiddens = per_layer('hidden_to_hidden_%i', n_layers - 1)
    hidden_biases = per_layer('hidden_bias_%i', n_layers)
    initial_hidden_means_fwd = per_layer(
        'initial_hidden_means_fwd_%i', n_layers)
    initial_hidden_vars_fwd = per_layer_var('initial_hidden_vars_fwd_%i')
    initial_hidden_means_bwd = per_layer(
        'initial_hidden_means_bwd_%i', n_layers)
    initial_hidden_vars_bwd = per_layer_var('initial_hidden_vars_bwd_%i')
    recurrents_fwd = per_layer('recurrent_fwd_%i', n_layers)
    recurrents_bwd = per_layer('recurrent_bwd_%i', n_layers)

    raw_dropouts = ([P.p_dropout.inpt]
                    + P.p_dropout.hiddens
                    + [P.p_dropout.hidden_to_out])
    # Reparametrize so that the rates lie in (0.025, 1-0.025).
    p_dropouts = [T.nnet.sigmoid(rate) * 0.95 + 0.025
                  for rate in raw_dropouts]

    unit_factors = [1] * len(hidden_biases)
    exprs = vpbrnn.exprs(
        inpt, T.zeros_like(inpt), P.in_to_hidden, hidden_to_hiddens,
        P.hidden_to_out, hidden_biases, unit_factors,
        initial_hidden_means_fwd, initial_hidden_vars_fwd,
        initial_hidden_means_bwd, initial_hidden_vars_bwd,
        recurrents_fwd, recurrents_bwd, P.out_bias, 1,
        self.recog_transfers, self.assumptions.statify_latent,
        p_dropouts=p_dropouts)
    exprs['inpt'] = inpt

    # The shortcut is built from self.exprs['inpt'] (the instance-level
    # dict, not the local ``exprs``): a zero first step followed by the
    # input without its last step.
    source = self.exprs['inpt']
    shortcut = T.concatenate([T.zeros_like(source[:1]), source[:-1]])

    # Hic sunt dracones.
    # If we do not keep this line, Theano will die with a segfault.
    exprs['shortcut'] = T.set_subtensor(
        T.zeros_like(shortcut)[:, :, :], shortcut)
    return exprs
def zeros_like(x):
    """Return ``T.zeros_like(x)``; thin wrapper around the backend call."""
    zeros = T.zeros_like(x)
    return zeros
def statify_visible(self, X, var=None):
    """Apply ``sigmoid`` to ``X`` together with ``var``.

    When ``var`` is ``None``, zeros shaped like ``X`` are used in its place.
    """
    if var is None:
        var = T.zeros_like(X)
    return sigmoid(X, var)
def nll_prior(self, X):
    """Return the elementwise negative ``normal_logpdf`` of ``X``.

    ``X`` is flattened and scored by ``normal_logpdf`` against zeros and
    ones of the same length. The negated result is reshaped back to
    ``X.shape``.
    """
    flat = X.flatten()
    zeros = T.zeros_like(flat)
    ones = T.ones_like(flat)
    log_density = normal_logpdf(flat, zeros, ones)
    return (-log_density).reshape(X.shape)
def logp(self, value):
    """Return a log-probability of zero for every entry of ``value``."""
    zero_logp = T.zeros_like(value)
    return zero_logp
def jobman(_options, channel=None):
    """Build, train and evaluate a recurrent network on the ``sumsin`` task.

    The options are parsed with ``parse_input_arguments``. Train, valid
    and test datasets are created, a tanh recurrent network is built with
    ``theano.scan``, and a regularisation term ``r`` is derived from ratios
    of gradient norms. The main loop trains, validates and tests, with
    patience-based early stopping, and pickles the collected statistics.

    Parameters
    ----------
    _options : mapping
        Raw options handed to ``parse_input_arguments``. When ``channel``
        is truthy, summary statistics are also written back into it.
    channel : object, optional
        When truthy, ``channel.save()`` is called before the loop and after
        each statistics update. Any exception it raises is swallowed.

    Returns
    -------
    None
        The function has no return statement; results are written to
        pickle files under ``configs.results_folder()``.

    NOTE(review): the block nesting was reconstructed from a source whose
    indentation was lost. Confirm the placement of the test-evaluation
    code, of ``data['stuff'] += ...`` and of ``old_rval = ...`` against the
    original file.
    """
    ################### PARSE INPUT ARGUMENTS #######################
    o = parse_input_arguments(_options,
                              'RNN_theano/rnn_sinsum001/RNN_sumsin.ini')
    ####################### DEFINE THE TASK #########################
    mode = Mode(linker='cvm_nogc', optimizer='fast_run')
    rng = numpy.random.RandomState(o['seed'])
    train_set = sumsin(T=o['task_T'],
                       steps=o['task_steps'],
                       batches=o['task_train_batches'],
                       batch_size=o['task_train_batchsize'],
                       noise=o['task_noise'],
                       rng=rng)
    valid_set = sumsin(T=o['task_T'],
                       steps=o['task_steps'],
                       batches=o['task_valid_batches'],
                       batch_size=o['task_valid_batchsize'],
                       rng=rng)
    test_set = sumsin(T=o['task_T'],
                      steps=o['task_steps'],
                      batches=o['task_test_batches'],
                      batch_size=o['task_test_batchsize'],
                      rng=rng)
    # The extra dataset is only needed when W_hy is re-solved in closed
    # form (see the `wout` function below).
    if o['wout_pinv']:
        wout_set = sumsin(T=o['task_T'],
                          steps=o['task_steps'],
                          batches=o['task_wout_batches'],
                          batch_size=o['task_wout_batchsize'],
                          noise=o['task_wout_noise'],
                          rng=rng)

    ###################### DEFINE THE MODEL #########################
    def recurrent_fn(u_t, h_tm1, W_hh, W_ux, W_hy, b):
        # One tanh recurrence step; W_hy is accepted but unused here.
        x_t = TT.dot(W_ux, u_t)
        h_t = TT.tanh(TT.dot(W_hh, h_tm1) + x_t + b)
        #y_t = TT.dot(W_hy, h_t)
        return h_t  #, y_t

    u = TT.matrix('u')
    # NOTE(review): both branches create the same matrix variable.
    if o['error_over_all']:
        t = TT.matrix('t')
    else:
        t = TT.matrix('t')
    h0 = TT.vector('h0')
    # NOTE(review): the bias is drawn from the global numpy RNG, not from
    # the seeded `rng` above -- confirm this is intended.
    b = shared_shape(
        floatX(
            numpy.random.uniform(size=(o['nhid'], ),
                                 low=-o['Wux_properties']['scale'],
                                 high=o['Wux_properties']['scale'])))
    alpha = TT.scalar('alpha')
    lr = TT.scalar('lr')
    W_hh = init(o['nhid'], o['nhid'], 'W_hh', o['Whh_style'],
                o['Whh_properties'], rng)
    # Mask that zeroes the first `Wux_mask_limit` rows of W_ux.
    W_ux_mask = numpy.ones((o['nhid'], train_set.n_ins),
                           dtype=theano.config.floatX)
    if o['Wux_mask_limit'] > 0:
        W_ux_mask[:o['Wux_mask_limit']] = 0.
    W_ux = init(o['nhid'],
                train_set.n_ins,
                'W_ux',
                o['Wux_style'],
                o['Wux_properties'],
                rng,
                mask=W_ux_mask)
    W_hy = init(train_set.n_outs, o['nhid'], 'W_hy', o['Why_style'],
                o['Why_properties'], rng)
    h, _ = theano.scan(recurrent_fn,
                       sequences=u,
                       outputs_info=h0,
                       non_sequences=[W_hh, W_ux, W_hy, b],
                       name='recurrent_fn',
                       mode=mode)
    y = TT.dot(W_hy, h.T)
    # Reaches into the scan graph to get the variable feeding the scan
    # node. NOTE(review): presumably the hidden-state buffer including the
    # initial state; this relies on Theano internals -- verify.
    init_h = h.owner.inputs[0].owner.inputs[2]
    #h = theano.printing.Print('h',attrs=('shape',))(h)
    if o['error_over_all']:
        out_err = TT.mean((y - t)**2, axis=1)
        err = out_err.mean()
    else:
        out_err = ((y[-1] - t)**2).mean(axis=1)
        err = out_err.mean()
    # Regularization term
    # NOTE(review): `cost` stays undefined if `reg_projection` is none of
    # the three values handled here.
    if o['reg_projection'] == 'h[-1]':
        cost = h[-1].sum()
    elif o['reg_projection'] == 'err':
        cost = err
    elif o['reg_projection'] == 'random':
        trng = TT.shared_randomstreams.RandomStreams(rng.randint(1e6))
        proj = trng.uniform(size=h[-1].shape)
        if o['sum_h2'] > 0:
            proj = TT.join(0, proj[:o['sum_h2']],
                           TT.zeros_like(proj[o['sum_h2']:]))
        cost = TT.sum(proj * h[-1])
    z, gh = TT.grad(cost, [init_h, h])
    z.name = '__z__'
    #import GPUscan.ipdb; GPUscan.ipdb.set_trace()
    #z = z
    zsec = z[:-1] - gh
    # Squared norms per time step, optionally restricted to the first
    # `sum_h` hidden units.
    if o['sum_h'] > 0:
        z2_1 = TT.sum(z[:, :o['sum_h']]**2, axis=1)
        z2_2 = TT.sum(zsec[:, :o['sum_h']]**2, axis=1)
    else:
        z2_1 = TT.sum(z**2, axis=1)
        z2_2 = TT.sum(zsec**2, axis=1)
    v1 = z2_2
    v2 = z2_1[1:]
    ## ## v2 = theano.printing.Print('v2')(v2)
    # floatX(1e-14)
    # Ratio of norms, with a fallback of 1 where the denominator is tiny.
    ratios = TT.switch(TT.ge(v2, 1e-12), TT.sqrt(v1 / v2), floatX(1))
    norm_0 = TT.ones_like(ratios[0])
    # Running product of the ratios.
    norm_t, _ = theano.scan(lambda x, y: x * y,
                            sequences=ratios,
                            outputs_info=norm_0,
                            name='jacobian_products',
                            mode=mode)
    norm_term = TT.sum(norm_t)
    # NOTE(review): `r` stays undefined if `reg_cost` is none of the three
    # values handled here.
    if o['reg_cost'] == 'product':
        r = abs(TT.log(norm_t)).sum()
    elif o['reg_cost'] == 'each':
        part1 = abs(TT.log(ratios))
        part2 = TT.switch(TT.ge(v2, 1e-12), part1, 1 - v2)
        r = part2.sum()
    elif o['reg_cost'] == 'product2':
        # NOTE(review): `z2` is not defined anywhere in this function (only
        # z2_1 / z2_2 are); this branch looks like it raises NameError
        # unless `z2` exists at module level -- confirm.
        ratios2 = TT.switch(TT.ge(z2[-1], 1e-12), TT.sqrt(z2 / z2[-1]),
                            floatX(1))
        r = abs(TT.log(ratios2)).sum()
    # Second pass over the reversed ratios (fallback 1e-12 this time);
    # this overwrites `ratios`, `norm_t` and `norm_term` from above.
    ratios = TT.switch(TT.ge(v2, 1e-12), TT.sqrt(v1 / v2),
                       floatX(1e-12))[::-1]
    norm_0 = TT.ones_like(ratios[0])
    norm_t, _ = theano.scan(lambda x, y: x * y,
                            sequences=ratios,
                            outputs_info=norm_0,
                            name='jacobian_products',
                            mode=mode)
    norm_term = floatX(0.1) + TT.sum(norm_t)
    gu = TT.grad(y[-1].sum(), u)
    # NOTE(review): `my1` is not defined in this function; presumably a
    # module-level constant -- verify. `get_updates` stays undefined if
    # `opt_alg` is neither 'sgd' nor 'sgd_qn'.
    if o['opt_alg'] == 'sgd':
        get_updates = lambda p,e, up : ( sgd(p
                                           , e
                                           , lr = lr
                                           , scale =
                                             TT.maximum( my1/norm_term,
                                                        floatX(0.01))
                                           , updates = up)[0]
                                        , [[],[],[TT.constant(0) for x in p]] )
    elif o['opt_alg'] == 'sgd_qn':
        get_updates = lambda p, e, up: sgd_qn(
            p,
            e,
            mylambda=floatX(o['mylambda']),
            t0=floatX(o['t0']),
            skip=floatX(o['skip']),
            scale=TT.maximum(my1 / norm_term, floatX(0.01)),
            lazy=o['lazy'],
            updates=up)
    # With `win_reg`, W_ux is trained on the regularised cost together
    # with W_hh and b; otherwise it is trained on the plain error together
    # with W_hy.
    if o['win_reg']:
        updates, why_extra = get_updates([W_hy], err, {})
        cost = err + alpha * r
        W_ux.name = 'W_ux'
        W_hh.name = 'W_hh'
        b.name = 'b'
        updates, extras = get_updates([W_ux, W_hh, b], cost, updates)
        updates[W_ux] = updates[W_ux] * W_ux_mask
        b_Why = why_extra[2][0]
        b_Wux = extras[2][0]
        b_Whh = extras[2][1]
        b_b = extras[2][2]
    else:
        updates, extras1 = get_updates([W_hy, W_ux], err, {})
        updates[W_ux] = updates[W_ux] * W_ux_mask
        cost = err + alpha * r
        updates, extras2 = get_updates([W_hh, b], cost, updates)
        b_Why = extras1[2][0]
        b_Wux = extras1[2][1]
        b_Whh = extras2[2][0]
        b_b = extras2[2][1]
    nhid = o['nhid']
    train_batchsize = o['task_train_batchsize']
    valid_batchsize = o['task_valid_batchsize']
    test_batchsize = o['task_test_batchsize']
    wout_batchsize = o['task_wout_batchsize']
    # Zero initial hidden states, substituted for h0 via `givens`.
    train_h0 = shared_shape(floatX(numpy.zeros((nhid, ))))
    valid_h0 = shared_shape(floatX(numpy.zeros((nhid, ))))
    test_h0 = shared_shape(floatX(numpy.zeros((nhid, ))))
    wout_h0 = shared_shape(floatX(numpy.zeros((nhid, ))))
    idx = TT.iscalar('idx')
    train_u, train_t = train_set(idx)
    u.tag.shape = copy.copy(train_u.tag.shape)
    t.tag.shape = copy.copy(train_t.tag.shape)
    train = theano.function([u, t, lr, alpha], [out_err, r, norm_term],
                            updates=updates,
                            mode=mode,
                            givens={h0: train_h0})
    valid_u, valid_t = valid_set(idx)
    u.tag.shape = copy.copy(valid_u.tag.shape)
    t.tag.shape = copy.copy(valid_t.tag.shape)
    valid = theano.function([u, t], [out_err, r, norm_term],
                            mode=mode,
                            givens={h0: valid_h0})
    test_u, test_t = test_set(idx)
    u.tag.shape = copy.copy(test_u.tag.shape)
    t.tag.shape = copy.copy(test_t.tag.shape)
    # The main loop indexes the outputs of `test` by position (rval[3] is
    # W_hh, rval[7] is z, ...), so the order of this list matters.
    test = theano.function([u, t], [
        out_err, r, norm_term, W_hh, W_ux, W_hy, b, z, y, h, u, gu, t, b_Whh,
        b_Wux, b_Why, b_b, zsec, gh
    ],
                           mode=mode,
                           givens={h0: test_h0})
    if o['wout_pinv']:
        wout_u, wout_t = wout_set.get_whole_tensors()

        def wiener_hopf_fn(u_t, t_t, H_tm1, Y_tm1, W_hh, W_ux, b, h0):
            # Accumulates products of the last hidden state with itself
            # and with the targets, for the linear solve below.
            def recurrent_fn(u_t, h_tm1, W_hh, W_ux, b):
                x_t = TT.dot(W_ux, u_t)
                h_t = TT.tanh(TT.dot(W_hh, h_tm1) + x_t + b)
                return h_t

            h_t, _ = theano.scan(recurrent_fn,
                                 sequences=u_t,
                                 outputs_info=h0,
                                 non_sequences=[W_hh, W_ux, b],
                                 name='recurrent_fn',
                                 mode=mode)
            H_t = H_tm1 + TT.dot(h_t[-1], h_t[-1].T)
            Y_t = Y_tm1 + TT.dot(h_t[-1], t_t.T)
            return H_t, Y_t

        H_0 = shared_shape(numpy.zeros((o['nhid'], o['nhid']),
                                       dtype=theano.config.floatX),
                           name='H0')
        Y_0 = shared_shape(numpy.zeros((o['nhid'], 1),
                                       dtype=theano.config.floatX),
                           name='Y0')
        all_u = TT.tensor4('whole_u')
        all_t = TT.tensor3('whole_t')
        [H, Y], _ = theano.scan(
            wiener_hopf_fn,
            sequences=[all_u, all_t],
            outputs_info=[H_0, Y_0],
            non_sequences=[W_hh, W_ux, TT.shape_padright(b), h0],
            name='wiener_hopf_fn',
            mode=mode)
        length = TT.cast(all_u.shape[0] * all_u.shape[3],
                         dtype=theano.config.floatX)
        H = H[-1] / length
        Y = Y[-1] / length
        # Regularise with wiener_lambda * identity before solving.
        H = H + floatX(o['wiener_lambda']) * TT.eye(o['nhid'])
        W_hy_solve = theano_linalg.solve(H, Y).T
        wout = theano.function([idx], [],
                               mode=mode,
                               updates={W_hy: W_hy_solve},
                               givens={
                                   all_u: wout_u,
                                   all_t: wout_t,
                                   h0: wout_h0
                               })
    '''
    theano.printing.pydotprint(train, 'train.png', high_contrast=True,
                               with_ids= True)
    for idx,node in enumerate(train.maker.env.toposort()):
        if node.op.__class__.__name__ == 'Scan':
            theano.printing.pydotprint(node.op.fn,
                                       ('train%d_'%idx)+node.op.name,
                                       high_contrast = True,
                                       with_ids = True)

    theano.printing.pydotprint(train, 'valid.png', high_contrast=True,
                              with_ids = True)
    for idx,node in enumerate(train.maker.env.toposort()):
        if node.op.__class__.__name__ == 'Scan':
            theano.printing.pydotprint(node.op.fn,
                                       ('valid%d_'%idx)+node.op.name,
                                       high_contrast = True,
                                      with_ids = True)
    theano.printing.pydotprint(train, 'test.png', high_contrast=True,
                              with_ids = True)
    for idx,node in enumerate(train.maker.env.toposort()):
        if node.op.__class__.__name__ == 'Scan':
            theano.printing.pydotprint(node.op.fn,
                                       ('test%d_'%idx)+node.op.name,
                                       high_contrast = True,
                                      with_ids = True)
    if o['wout_pinv']:
        theano.printing.pydotprint(train, 'wout.png', high_contrast=True,
                                  with_ids = True)
        for idx,node in enumerate(train.maker.env.toposort()):
            if node.op.__class__.__name__ == 'Scan':
                theano.printing.pydotprint(node.op.fn,
                                           ('wout%d_'%idx)+node.op.name,
                                           high_contrast = True,
                                           with_ids= True)

    '''
    #import GPUscan.ipdb; GPUscan.ipdb.set_trace()
    #rval = valid(valid_set.data_u[0],valid_set.data_t[0])
    #################### DEFINE THE MAIN LOOP #######################
    data = {}
    fix_len = o['max_storage_numpy']  #int(o['NN']/o['small_step'])
    # Rolling windows of the last `small_step` epochs.
    avg_train_err = numpy.zeros((o['small_step'], train_set.n_outs))
    avg_train_reg = numpy.zeros((o['small_step'], ))
    avg_train_norm = numpy.zeros((o['small_step'], ))
    avg_valid_err = numpy.zeros((o['small_step'], train_set.n_outs))
    avg_valid_reg = numpy.zeros((o['small_step'], ))
    avg_valid_norm = numpy.zeros((o['small_step'], ))
    data['options'] = o
    # The error/reg histories start at -1; the norm histories at 0.
    data['train_err'] = -1 * numpy.ones((fix_len, train_set.n_outs))
    data['valid_err'] = -1 * numpy.ones((fix_len, train_set.n_outs))
    data['train_reg'] = -1 * numpy.ones((fix_len, ))
    data['valid_reg'] = -1 * numpy.ones((fix_len, ))
    data['train_norm'] = numpy.zeros((fix_len, ))
    data['valid_norm'] = numpy.zeros((fix_len, ))
    data['test_err'] = [None] * o['max_storage']
    data['test_idx'] = [None] * o['max_storage']
    data['test_reg'] = [None] * o['max_storage']
    data['test_norm'] = [None] * o['max_storage']
    data['y'] = [None] * o['max_storage']
    data['z'] = [None] * o['max_storage']
    data['t'] = [None] * o['max_storage']
    data['h'] = [None] * o['max_storage']
    data['u'] = [None] * o['max_storage']
    data['gu'] = [None] * o['max_storage']
    data['W_hh'] = [None] * o['max_storage']
    data['W_ux'] = [None] * o['max_storage']
    data['W_hy'] = [None] * o['max_storage']
    data['b'] = [None] * o['max_storage']
    data['b_ux'] = [None] * o['max_storage']
    data['b_hy'] = [None] * o['max_storage']
    data['b_hh'] = [None] * o['max_storage']
    data['b_b'] = [None] * o['max_storage']
    data['stuff'] = []
    storage_exceeded = False
    stop = False
    old_rval = numpy.inf
    patience = o['patience']
    n_train = o['task_train_batches']
    n_valid = o['task_valid_batches']
    n_test = o['task_test_batches']
    n_test_runs = 0
    test_pos = 0
    valid_set.refresh()
    test_set.refresh()
    kdx = 0
    lr_v = floatX(o['lr'])
    alpha_v = floatX(o['alpha'])
    lr_f = 1
    if o['lr_scheme']:
        lr_f = o['lr_scheme'][1] / (o['NN'] - o['lr_scheme'][0])
    alpha_r = 1
    if o['alpha_scheme']:
        alpha_r = float(o['alpha_scheme'][1] - o['alpha_scheme'][0])
    st = time.time()
    if channel:
        # NOTE(review): the bare except swallows every error from
        # channel.save(), including KeyboardInterrupt.
        try:
            channel.save()
        except:
            pass
    for idx in xrange(int(o['NN'])):
        # Learning-rate decay once past the scheme's start index.
        if o['lr_scheme'] and idx > o['lr_scheme'][0]:
            lr_v = floatX(o['lr'] * 1. / (1. +
                                          (idx - o['lr_scheme'][0]) * lr_f))
        # alpha follows a Gaussian-shaped bump between the two scheme
        # indices and is zero outside of them.
        if o['alpha_scheme']:
            if idx < o['alpha_scheme'][0]:
                alpha_v = floatX(0)
            elif idx < o['alpha_scheme'][1]:
                pos = 2. * (idx - o['alpha_scheme'][0]) / alpha_r - 1.
                alpha_v = floatX(numpy.exp(-pos**2 / 0.2) * o['alpha'])
            else:
                alpha_v = floatX(0)
        # Slot of this epoch inside the rolling windows.
        jdx = idx % o['small_step']
        avg_train_err[jdx, :] = 0
        avg_train_reg[jdx] = 0
        avg_train_norm[jdx] = 0
        avg_valid_err[jdx, :] = 0
        avg_valid_reg[jdx] = 0
        avg_valid_norm[jdx] = 0
        if o['wout_pinv'] and (idx % o['test_step'] == 0):
            wout_set.refresh()
            print(
                '* Re-computing W_hy using closed-form '
                'regularized wiener hopf formula')
            st_wout = time.time()
            wout(0)
            ed_wout = time.time()
            print '** It took ', ed_wout - st_wout, 'secs'
            print '** Average weight', abs(W_hy.get_value(borrow=True)).mean()
        for k in xrange(o['task_train_batches']):
            # `t` is rebound to a data batch here; the symbolic `t` is no
            # longer needed once the functions above are compiled.
            s, t = train_set.get_slice()
            rval = train(s, t, lr_v, alpha_v)
            print '[',idx,'/',patience,'][',k,'/',n_train,'][train]', rval[0].mean(), \
                    rval[1], rval[2], numpy.max([(1./rval[2]), 0.01])*lr_v, alpha_v
            avg_train_err[jdx, :] += rval[0]
            avg_train_reg[jdx] += rval[1]
            avg_train_norm[jdx] += rval[2]
        print '**Epoch took', time.time() - st, 'secs'
        avg_train_err[jdx] /= n_train
        avg_train_reg[jdx] /= n_train
        avg_train_norm[jdx] /= n_train
        st = time.time()
        for k in xrange(n_valid):
            rval = valid(*valid_set.get_slice())
            print '[',idx,'/',patience,'][',k,'/',n_valid,'][valid]', rval[0].mean(), \
                    rval[1], rval[2]
            avg_valid_err[jdx] += rval[0]
            avg_valid_reg[jdx] += rval[1]
            avg_valid_norm[jdx] += rval[2]
        avg_valid_err[jdx] /= n_valid
        avg_valid_reg[jdx] /= n_valid
        avg_valid_norm[jdx] /= n_valid
        # Every `small_step` epochs: store the window averages and write a
        # backup pickle.
        if idx >= o['small_step'] and idx % o['small_step'] == 0:
            kdx += 1
            # When the history arrays are full, jump back to one third of
            # their length and overwrite from there.
            if kdx >= o['max_storage_numpy']:
                kdx = o['max_storage_numpy'] // 3
                storage_exceeded = True
            data['steps'] = idx
            data['kdx'] = kdx
            data['storage_exceeded'] = storage_exceeded
            data['train_err'][kdx] = avg_train_err.mean()
            data['valid_err'][kdx] = avg_valid_err.mean()
            data['train_reg'][kdx] = avg_train_reg.mean()
            data['valid_reg'][kdx] = avg_valid_reg.mean()
            data['train_norm'][kdx] = avg_train_norm.mean()
            data['valid_norm'][kdx] = avg_valid_norm.mean()
            if channel:
                # NOTE(review): bare except -- failures to report to the
                # channel are silently ignored.
                try:
                    _options['trainerr'] = data['train_err'][kdx].mean()
                    _options['maxtrainerr'] = data['train_err'][kdx].max()
                    _options['trainreg'] = data['train_reg'][kdx]
                    _options['trainnorm'] = data['train_norm'][kdx]
                    _options['validerr'] = data['valid_err'][kdx].mean()
                    _options['maxvaliderr'] = data['valid_err'][kdx].max()
                    _options['validreg'] = data['valid_reg'][kdx]
                    _options['validnorm'] = data['valid_norm'][kdx]
                    _options['steps'] = idx
                    _options['patience'] = patience
                    channel.save()
                except:
                    pass
            test_err = []
            test_reg = []
            test_norm = []
            for k in xrange(n_test):
                rval = test(*test_set.get_slice())
                print '[',idx,'][',k,'/',n_test,'][test]',rval[0].mean()\
                        , rval[1], rval[2]
                test_err += [rval[0]]
                test_reg += [rval[1]]
                test_norm += [rval[2]]
                test_z = rval[7][:, :]
                test_y = rval[8][:, :]
                test_h = rval[9][:, :]
                test_u = rval[10][:, :]
                test_gu = rval[11][:, :]
                test_t = rval[12][:, :]
            # Only the values of the last test batch are stored below.
            data['test_idx'][test_pos] = idx
            data['test_pos'] = test_pos
            data['y'][test_pos] = test_y
            data['z'][test_pos] = test_z
            data['t'][test_pos] = test_t
            data['h'][test_pos] = test_h
            data['u'][test_pos] = test_u
            data['gu'][test_pos] = test_gu
            data['test_err'][test_pos] = test_err
            data['test_reg'][test_pos] = test_reg
            data['test_norm'][test_pos] = test_norm
            data['W_hh'][test_pos] = rval[3]
            data['W_ux'][test_pos] = rval[4]
            data['W_hy'][test_pos] = rval[5]
            data['b'][test_pos] = rval[6]
            data['b_hh'][test_pos] = rval[13]
            data['b_ux'][test_pos] = rval[14]
            data['b_hy'][test_pos] = rval[15]
            data['b_b'][test_pos] = rval[16]
            data['stuff'] += [(rval[17], rval[18])]
            # NOTE(review): the file object is never closed explicitly.
            cPickle.dump(
                data,
                open(
                    os.path.join(configs.results_folder(), o['path'],
                                 '%s_backup.pkl' % o['name']), 'wb'))
        print '** ', avg_valid_err[jdx].mean(), ' < ', old_rval, ' ? '
        # Improvement on the validation error extends the patience ...
        if avg_valid_err[jdx].mean() < old_rval:
            patience += o['patience_incr']
        # ... and triggers a full test evaluation that is saved as the
        # main result file.
        if avg_valid_err[jdx].mean() < old_rval:
            test_err = []
            test_reg = []
            test_norm = []
            for k in xrange(n_test):
                rval = test(*test_set.get_slice())
                print '[',idx,'][',k,'/',n_test,'][test]',rval[0].mean()\
                        , rval[1], rval[2]
                test_err += [rval[0]]
                test_reg += [rval[1]]
                test_norm += [rval[2]]
                test_z = rval[7][:, :]
                test_y = rval[8][:, :]
                test_h = rval[9][:, :]
                test_u = rval[10][:, :]
                test_gu = rval[11][:, :]
                test_t = rval[12][:, :]
            data['test_idx'][test_pos] = idx
            data['test_pos'] = test_pos
            data['y'][test_pos] = test_y
            data['z'][test_pos] = test_z
            data['t'][test_pos] = test_t
            data['h'][test_pos] = test_h
            data['u'][test_pos] = test_u
            data['gu'][test_pos] = test_gu
            data['test_err'][test_pos] = test_err
            data['test_reg'][test_pos] = test_reg
            data['test_norm'][test_pos] = test_norm
            data['W_hh'][test_pos] = rval[3]
            data['W_ux'][test_pos] = rval[4]
            data['W_hy'][test_pos] = rval[5]
            data['b'][test_pos] = rval[6]
            data['b_hh'][test_pos] = rval[13]
            data['b_ux'][test_pos] = rval[14]
            data['b_hy'][test_pos] = rval[15]
            data['b_b'][test_pos] = rval[16]
            data['stuff'] += [(rval[17], rval[18])]
            # NOTE(review): the file object is never closed explicitly.
            cPickle.dump(
                data,
                open(
                    os.path.join(configs.results_folder(), o['path'],
                                 '%s.pkl' % o['name']), 'wb'))
            n_test_runs += 1
            test_pos += 1
            # When the per-test storage is full, step back `go_back` slots
            # and overwrite from there.
            if test_pos >= o['max_storage']:
                test_pos = test_pos - o['go_back']
            # Stop early once the mean test error is below 5e-5.
            if numpy.mean(test_err) < 5e-5:
                patience = idx - 5
                break
            old_rval = avg_valid_err[jdx].mean()
        if idx > patience:
            break
def train(random_seed=1234, dim_word=256, # word vector dimensionality ctx_dim=-1, # context vector dimensionality, auto set dim=1000, # the number of LSTM units n_layers_out=1, n_layers_init=1, encoder='none', encoder_dim=100, prev2out=False, ctx2out=False, patience=10, max_epochs=5000, dispFreq=100, decay_c=0., alpha_c=0., alpha_entropy_r=0., lrate=0.01, selector=False, n_words=100000, maxlen=100, # maximum length of the description optimizer='adadelta', clip_c=2., batch_size = 64, valid_batch_size = 64, save_model_dir='/data/lisatmp3/yaoli/exp/capgen_vid/attention/test/', validFreq=10, saveFreq=10, # save the parameters after every saveFreq updates sampleFreq=10, # generate some samples after every sampleFreq updates metric='blue', dataset='youtube2text', video_feature='googlenet', use_dropout=False, reload_=False, from_dir=None, K1=28, K2=10, OutOf=240, verbose=True, debug=True ): rng_numpy, rng_theano = utils.get_two_rngs() model_options = locals().copy() if 'self' in model_options: del model_options['self'] with open('%smodel_options.pkl'%save_model_dir, 'wb') as f: pkl.dump(model_options, f) # instance model layers = Layers() model = Model() print 'Loading data' engine = data_engine.Movie2Caption('attention', dataset, video_feature, batch_size, valid_batch_size, maxlen, n_words, K1, K2, OutOf) model_options['ctx_dim'] = engine.ctx_dim model_options['n_words'] = engine.n_words print 'n_words:', model_options['n_words'] # set test values, for debugging idx = engine.kf_train[0] [x_tv, mask_tv, ctx_tv, ctx_mask_tv, ctx_tv_c, ctx_mask_tv_c] = data_engine.prepare_data( engine, [engine.train[index] for index in idx]) print 'init params' t0 = time.time() params = model.init_params(model_options) # reloading if reload_: model_saved = from_dir+'/model_best_so_far.npz' assert os.path.isfile(model_saved) print "Reloading model params..." 
params = utils.load_params(model_saved, params) tparams = utils.init_tparams(params) trng, use_noise, \ x, mask, ctx, mask_ctx, ctx_c, mask_ctx_c, \ cost, extra = \ model.build_model(tparams, model_options) print 'build model done!' alphas = extra[1] alphas_c = extra[2] betas = extra[3] betas_c = extra[4] print 'buliding sampler' f_init, f_next = model.build_sampler(tparams, model_options, use_noise, trng) # before any regularizer print 'building f_log_probs' f_log_probs = theano.function([x, mask, ctx, mask_ctx, ctx_c, mask_ctx_c], -cost, profile=False, on_unused_input='ignore') cost = cost.mean() if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv ** 2).sum() weight_decay *= decay_c cost += weight_decay if alpha_c > 0.: alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c') alpha_reg = alpha_c * ((1. - alphas.sum(0)) ** 2).sum(-1).mean() cost += alpha_reg alpha_reg_c = alpha_c * ((1. - alphas_c.sum(0)) ** 2).sum(-1).mean() cost += alpha_reg_c if alpha_entropy_r > 0: alpha_entropy_r = theano.shared(numpy.float32(alpha_entropy_r), name='alpha_entropy_r') alpha_reg_2 = alpha_entropy_r * (-tensor.sum(alphas * tensor.log(alphas+1e-8),axis=-1)).sum(-1).mean() cost += alpha_reg_2 else: alpha_reg_2 = tensor.zeros_like(cost) print 'building f_alpha' f_alpha = theano.function([x, mask, ctx, ctx_c, mask_ctx, mask_ctx_c], [alphas, alphas_c, betas, betas_c], name='f_alpha', on_unused_input='ignore') print 'compute grad' grads = tensor.grad(cost, wrt=utils.itemlist(tparams)) if clip_c > 0.: g2 = 0. 
for g in grads: g2 += (g**2).sum() new_grads = [] for g in grads: new_grads.append(tensor.switch(g2 > (clip_c**2), g / tensor.sqrt(g2) * clip_c, g)) grads = new_grads lr = tensor.scalar(name='lr') print 'build train fns' f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, [x, mask, ctx, ctx_c, mask_ctx, mask_ctx_c], cost, extra + grads) print 'compilation took %.4f sec'%(time.time()-t0) print 'Optimization' history_errs = [] # reload history if reload_: print 'loading history error...' history_errs = numpy.load( from_dir+'model_best_so_far.npz')['history_errs'].tolist() bad_counter = 0 processes = None queue = None rqueue = None shared_params = None uidx = 0 uidx_best_blue = 0 uidx_best_valid_err = 0 estop = False best_p = utils.unzip(tparams) best_blue_valid = 0 best_valid_err = 999 alphas_ratio = [] for eidx in xrange(max_epochs): n_samples = 0 train_costs = [] grads_record = [] print 'Epoch ', eidx for idx in engine.kf_train: tags = [engine.train[index] for index in idx] n_samples += len(tags) uidx += 1 use_noise.set_value(1.) 
pd_start = time.time() x, mask, ctx, ctx_mask, ctx_c, ctx_mask_c = data_engine.prepare_data( engine, tags) pd_duration = time.time() - pd_start if x is None: print 'Minibatch with zero sample under length ', maxlen continue ud_start = time.time() rvals = f_grad_shared(x, mask, ctx, ctx_c, ctx_mask, ctx_mask_c) cost = rvals[0] probs = rvals[1] alphas = rvals[2] alphas_c = rvals[3] betas = rvals[4] betas_c = rvals[5] grads = rvals[6:] grads, NaN_keys = utils.grad_nan_report(grads, tparams) if len(grads_record) >= 5: del grads_record[0] grads_record.append(grads) if NaN_keys != []: print 'grads contain NaN' import pdb; pdb.set_trace() if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected in cost' import pdb; pdb.set_trace() # update params f_update(lrate) ud_duration = time.time() - ud_start if eidx == 0: train_error = cost else: train_error = train_error * 0.95 + cost * 0.05 train_costs.append(cost) if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, 'Update ', uidx, 'Train cost mean so far', \ train_error, 'fetching data time spent (sec)', pd_duration, \ 'update time spent (sec)', ud_duration, 'save_dir', save_model_dir alphas, alphas_c, betas, betas_c = f_alpha(x, mask, ctx, ctx_c, ctx_mask, ctx_mask_c) counts = mask.sum(0) betas_mean = (betas * mask).sum(0) / counts betas_mean = betas_mean.mean() betas_mean_c = (betas_c * mask).sum(0) / counts betas_mean_c = betas_mean_c.mean() print 'alpha ratio %.3f, betas mean %.3f'%( alphas.min(-1).mean() / (alphas.max(-1)).mean(), betas_mean) l = 0 for vv in x[:, 0]: if vv == 0: break if vv in engine.word_idict: print '(', numpy.round(betas[l, 0], 3), ')', engine.word_idict[vv], else: print '(', numpy.round(betas[l, 0], 3), ')', 'UNK', l += 1 print '(', numpy.round(betas[l, 0], 3), ')' if numpy.mod(uidx, saveFreq) == 0: pass if numpy.mod(uidx, sampleFreq) == 0: use_noise.set_value(0.) 
print '------------- sampling from train ----------' x_s = x mask_s = mask ctx_s = ctx ctx_s_c = ctx_c ctx_mask_s = ctx_mask ctx_mask_s_c = ctx_mask_c model.sample_execute(engine, model_options, tparams, f_init, f_next, x_s, ctx_s, ctx_s_c, ctx_mask_s, ctx_mask_s_c, trng) print '------------- sampling from valid ----------' idx = engine.kf_valid[numpy.random.randint(1, len(engine.kf_valid) - 1)] tags = [engine.valid[index] for index in idx] x_s, mask_s, ctx_s, mask_ctx_s, ctx_s_c, mask_ctx_s_c = data_engine.prepare_data(engine, tags) model.sample_execute(engine, model_options, tparams, f_init, f_next, x_s, ctx_s, ctx_s_c, mask_ctx_s, mask_ctx_s_c, trng) if validFreq != -1 and numpy.mod(uidx, validFreq) == 0: t0_valid = time.time() alphas, alphas_c, _, _ = f_alpha(x, mask, ctx, ctx_c, ctx_mask, ctx_mask_c) ratio = alphas.min(-1).mean()/(alphas.max(-1)).mean() alphas_ratio.append(ratio) numpy.savetxt(save_model_dir+'alpha_ratio.txt',alphas_ratio) current_params = utils.unzip(tparams) numpy.savez( save_model_dir+'model_current.npz', history_errs=history_errs, **current_params) use_noise.set_value(0.) train_err = -1 train_perp = -1 valid_err = -1 valid_perp = -1 test_err = -1 test_perp = -1 if not debug: # first compute train cost if 0: print 'computing cost on trainset' train_err, train_perp = model.pred_probs( engine, 'train', f_log_probs, verbose=model_options['verbose']) else: train_err = 0. train_perp = 0. if 1: print 'validating...' valid_err, valid_perp = model.pred_probs( engine, 'valid', f_log_probs, verbose=model_options['verbose'], ) else: valid_err = 0. valid_perp = 0. if 1: print 'testing...' test_err, test_perp = model.pred_probs( engine, 'test', f_log_probs, verbose=model_options['verbose'] ) else: test_err = 0. test_perp = 0. 
mean_ranking = 0 blue_t0 = time.time() scores, processes, queue, rqueue, shared_params = \ metrics.compute_score( model_type='attention', model_archive=current_params, options=model_options, engine=engine, save_dir=save_model_dir, beam=5, n_process=5, whichset='both', on_cpu=False, processes=processes, queue=queue, rqueue=rqueue, shared_params=shared_params, metric=metric, one_time=False, f_init=f_init, f_next=f_next, model=model ) ''' {'blue': {'test': [-1], 'valid': [77.7, 60.5, 48.7, 38.5, 38.3]}, 'alternative_valid': {'Bleu_3': 0.40702270203174923, 'Bleu_4': 0.29276570520368456, 'CIDEr': 0.25247168210607884, 'Bleu_2': 0.529069629270047, 'Bleu_1': 0.6804308797115253, 'ROUGE_L': 0.51083584331688392}, 'meteor': {'test': [-1], 'valid': [0.282787550236724]}} ''' valid_B1 = scores['valid']['Bleu_1'] valid_B2 = scores['valid']['Bleu_2'] valid_B3 = scores['valid']['Bleu_3'] valid_B4 = scores['valid']['Bleu_4'] valid_Rouge = scores['valid']['ROUGE_L'] valid_Cider = scores['valid']['CIDEr'] valid_meteor = scores['valid']['METEOR'] test_B1 = scores['test']['Bleu_1'] test_B2 = scores['test']['Bleu_2'] test_B3 = scores['test']['Bleu_3'] test_B4 = scores['test']['Bleu_4'] test_Rouge = scores['test']['ROUGE_L'] test_Cider = scores['test']['CIDEr'] test_meteor = scores['test']['METEOR'] print 'computing meteor/blue score used %.4f sec, '\ 'blue score: %.1f, meteor score: %.1f'%( time.time()-blue_t0, valid_B4, valid_meteor) history_errs.append([eidx, uidx, train_err, train_perp, valid_perp, test_perp, valid_err, test_err, valid_B1, valid_B2, valid_B3, valid_B4, valid_meteor, valid_Rouge, valid_Cider, test_B1, test_B2, test_B3, test_B4, test_meteor, test_Rouge, test_Cider]) numpy.savetxt(save_model_dir+'train_valid_test.txt', history_errs, fmt='%.3f') print 'save validation results to %s'%save_model_dir # save best model according to the best blue or meteor if len(history_errs) > 1 and \ valid_B4 > numpy.array(history_errs)[:-1,11].max(): print 'Saving to %s...'%save_model_dir, 
numpy.savez( save_model_dir+'model_best_blue_or_meteor.npz', history_errs=history_errs, **best_p) if len(history_errs) > 1 and \ valid_err < numpy.array(history_errs)[:-1,6].min(): best_p = utils.unzip(tparams) bad_counter = 0 best_valid_err = valid_err uidx_best_valid_err = uidx print 'Saving to %s...'%save_model_dir, numpy.savez( save_model_dir+'model_best_so_far.npz', history_errs=history_errs, **best_p) with open('%smodel_options.pkl'%save_model_dir, 'wb') as f: pkl.dump(model_options, f) print 'Done' elif len(history_errs) > 1 and \ valid_err >= numpy.array(history_errs)[:-1,6].min(): bad_counter += 1 print 'history best ',numpy.array(history_errs)[:,6].min() print 'bad_counter ',bad_counter print 'patience ',patience if bad_counter > patience: print 'Early Stop!' estop = True break if test_B4>0.52 and test_meteor>0.32: print 'Saving to %s...'%save_model_dir, numpy.savez( save_model_dir+'model_'+str(uidx)+'.npz', history_errs=history_errs, **current_params) print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err, \ 'best valid err so far',best_valid_err #print 'valid took %.2f sec'%(time.time() - t0_valid) # end of validatioin if debug: break if estop: break if debug: break # end for loop over minibatches print 'This epoch has seen %d samples, train cost %.2f'%( n_samples, numpy.mean(train_costs)) # end for loop over epochs print 'Optimization ended.' if best_p is not None: utils.zipp(best_p, tparams) use_noise.set_value(0.) 
valid_err = 0 test_err = 0 if not debug: #if valid: valid_err, valid_perp = model.pred_probs( engine, 'valid', f_log_probs, verbose=model_options['verbose']) #if test: #test_err, test_perp = self.pred_probs( # 'test', f_log_probs, # verbose=model_options['verbose']) print 'stopped at epoch %d, minibatch %d, '\ 'curent Train %.2f, current Valid %.2f, current Test %.2f '%( eidx,uidx,numpy.mean(train_err),numpy.mean(valid_err),numpy.mean(test_err)) params = copy.copy(best_p) numpy.savez(save_model_dir+'model_best.npz', train_err=train_err, valid_err=valid_err, test_err=test_err, history_errs=history_errs, **params) if history_errs != []: history = numpy.asarray(history_errs) best_valid_idx = history[:,6].argmin() numpy.savetxt(save_model_dir+'train_valid_test.txt', history, fmt='%.4f') print 'final best exp ', history[best_valid_idx] return train_err, valid_err, test_err
def SIR(
    lambda_t_log,
    pr_I_begin=100,
    pr_median_mu=1 / 8,
    pr_sigma_mu=0.2,
    model=None,
    return_all=False,
    save_all=False,
):
    r"""
    Susceptible-infected-recovered dynamics, iterated with ``theano.scan``.

    For every entry of :math:`\lambda_t = \exp(\text{lambda\_t\_log})` one
    step of the following update is applied:

    .. math::

        I_{new}(t) &= \lambda_t I(t-1) \frac{S(t-1)}{N} \\
        S(t) &= S(t-1) - I_{new}(t) \\
        I(t) &= I(t-1) + I_{new}(t) - \mu I(t-1)

    After each step :math:`I(t)` is clipped to :math:`[-1, N]` and
    :math:`S(t)` to :math:`[0, N]`.

    The recovery rate :math:`\mu` gets a
    :math:`LogNormal(\log(\text{pr\_median\_mu}), \text{pr\_sigma\_mu})`
    prior.  Unless ``pr_I_begin`` is already a tensor variable, :math:`I(0)`
    gets a :math:`HalfCauchy(\text{pr\_I\_begin})` prior.  :math:`S(0)` is
    :math:`N - I(0)`.

    Parameters
    ----------
    lambda_t_log : :class:`~theano.tensor.TensorVariable`
        logarithm of the spreading rate; scanned along its first dimension.
    pr_I_begin : float or array_like or :class:`~theano.tensor.TensorVariable`
        beta of the Half-Cauchy prior of :math:`I(0)`; a tensor variable is
        used directly as :math:`I(0)` instead.
    pr_median_mu : float or array_like
        median of the lognormal prior of :math:`\mu`.
    pr_sigma_mu : float or array_like
        sigma of the lognormal prior of :math:`\mu`.
    model : :class:`Cov19Model`
        resolved through ``modelcontext``; ``N_population``, ``sim_ndim`` and
        ``sim_shape`` are read from it.
    return_all : bool
        if True, return ``new_I_t``, ``I_t``, ``S_t``; otherwise only
        ``new_I_t``.
    save_all : bool
        if True, additionally register ``S_t`` and ``I_t`` as deterministics;
        ``new_I_t`` is always registered.

    Returns
    -------
    new_I_t : :class:`~theano.tensor.TensorVariable`
        newly infected per scan step.
    I_t : :class:`~theano.tensor.TensorVariable`
        infected per scan step (only if ``return_all``).
    S_t : :class:`~theano.tensor.TensorVariable`
        susceptible per scan step (only if ``return_all``).
    """
    model = modelcontext(model)

    # Prior of the recovery rate
    mu = pm.Lognormal(name="mu", mu=np.log(pr_median_mu), sigma=pr_sigma_mu)

    # Population size
    N = model.N_population

    # Shape used for the I_begin prior: scalar for a 1-d simulation
    if model.sim_ndim == 1:
        num_regions = ()
    else:
        num_regions = model.sim_shape[1]

    # Starting compartments
    if not isinstance(pr_I_begin, tt.TensorVariable):
        I_begin = pm.HalfCauchy(name="I_begin", beta=pr_I_begin, shape=num_regions)
    else:
        I_begin = pr_I_begin
    S_begin = N - I_begin

    lambda_t = tt.exp(lambda_t_log)
    new_I_0 = tt.zeros_like(I_begin)

    def _advance_one_day(lam, S_prev, I_prev, _new_I_prev, mu_, N_):
        """One scan step; the previous new-infection count is not needed."""
        infections = lam / N_ * I_prev * S_prev
        I_next = tt.clip(I_prev + infections - mu_ * I_prev, -1, N_)  # for stability
        S_next = tt.clip(S_prev - infections, 0, N_)
        return S_next, I_next, infections

    # scan yields one series per entry of outputs_info: S, I, new_I
    outputs, _ = theano.scan(
        fn=_advance_one_day,
        sequences=[lambda_t],
        outputs_info=[S_begin, I_begin, new_I_0],
        non_sequences=[mu, N],
    )
    S_t, I_t, new_I_t = outputs

    pm.Deterministic("new_I_t", new_I_t)
    if save_all:
        pm.Deterministic("S_t", S_t)
        pm.Deterministic("I_t", I_t)

    if not return_all:
        return new_I_t
    return new_I_t, I_t, S_t
def grad(self, inputs, output_grads):
    """Return one all-zero tensor of dtype ``theano.config.floatX`` per input.

    ``output_grads`` is not used.
    """
    zero_grads = []
    for inp in inputs:
        zero_grads.append(tensor.zeros_like(inp, dtype=theano.config.floatX))
    return zero_grads
def make_Q(i, j, tps, Q, reward, v):
    """Return a zero tensor shaped like ``Q`` with entry ``[i, j]`` filled in.

    The entry is ``transition_probabilities[i, j, :] . (reward + discount * v)``.
    The second element of the returned pair is an empty updates dict.

    NOTE(review): ``tps`` is accepted but never read; the probabilities come
    from the enclosing-scope ``transition_probabilities`` (as does
    ``discount``) -- confirm they are meant to be the same object.
    """
    backed_up = reward + discount * v
    entry = transition_probabilities[i, j, :].dot(backed_up)
    blank_Q = T.zeros_like(Q)
    filled_Q = T.set_subtensor(blank_Q[i, j], entry)
    return filled_Q, {}
def __init__(self, voca_size, hidden_size, lstm_layers_num, learning_rate=0.2):
    """Build the encoder/decoder LSTM stacks and compile two functions.

    ``self._train(ei, em, di, dm, dt)`` performs one plain gradient-descent
    step with ``learning_rate`` and returns ``[loss, costs]``.
    ``self._utter(ei, em, di)`` runs the decoder greedily for
    ``utils.max_sent_size`` steps and returns the chosen token indices.

    voca_size       -- first dimension of the lookup table / last dimension
                       of the output projection.
    hidden_size     -- embedding width and size passed to every ``LSTM``.
    lstm_layers_num -- number of stacked LSTM layers on each side.
    learning_rate   -- step size of the parameter update.
    """
    self.voca_size = voca_size
    self.hidden_size = hidden_size
    self.lstm_layers_num = lstm_layers_num
    self.learning_rate = learning_rate

    self._train = None
    self._utter = None

    self.params = []
    self.encoder_lstm_layers = []
    self.decoder_lstm_layers = []
    # last hidden / cell state of every encoder layer
    self.hos = []
    self.Cos = []

    encoderInputs, encoderMask = tensor.imatrices(2)
    decoderInputs, decoderMask, decoderTarget = tensor.imatrices(3)

    table_init = utils.init_norm(self.voca_size, self.hidden_size)
    self.lookuptable = theano.shared(name="Encoder LookUpTable",
                                     value=table_init,
                                     borrow=True)
    linear_init = utils.init_norm(self.hidden_size, self.voca_size)
    self.linear = theano.shared(name="Linear",
                                value=linear_init,
                                borrow=True)
    self.params.extend([self.lookuptable, self.linear])

    # ---- encoder ----
    # embedded input, reshaped to (rows, cols, hidden_size) of encoderInputs
    enc_shape = (encoderInputs.shape[0], encoderInputs.shape[1],
                 self.hidden_size)
    state_below = self.lookuptable[encoderInputs.flatten()].reshape(enc_shape)
    for _ in range(self.lstm_layers_num):
        enclstm = LSTM(self.hidden_size)
        self.encoder_lstm_layers.append(enclstm)
        self.params.extend(enclstm.params)
        hs, Cs = enclstm.forward(state_below, encoderMask)
        self.hos.append(hs[-1])
        self.Cos.append(Cs[-1])
        state_below = hs

    # ---- decoder (training graph) ----
    dec_shape = (decoderInputs.shape[0], decoderInputs.shape[1],
                 self.hidden_size)
    state_below = self.lookuptable[decoderInputs.flatten()].reshape(dec_shape)
    for layer_idx in range(self.lstm_layers_num):
        declstm = LSTM(self.hidden_size)
        self.decoder_lstm_layers.append(declstm)
        self.params.extend(declstm.params)
        # each decoder layer starts from the matching encoder layer's state
        init_h = self.hos[layer_idx]
        init_C = self.Cos[layer_idx]
        state_below, Cs = declstm.forward(state_below, decoderMask,
                                          init_h, init_C)
    decoder_lstm_outputs = state_below

    # placeholders exposed as function inputs and mapped through `givens`
    ei, em, di, dm, dt = tensor.imatrices(5)

    #####################################################
    # training function
    #####################################################
    linear_outputs = tensor.dot(decoder_lstm_outputs, self.linear)
    softmax_outputs, _ = theano.scan(
        fn=lambda x: tensor.nnet.softmax(x),
        sequences=[linear_outputs],
    )

    def _NLL(pred, y, m):
        # masked negative log-probability of the target token per column
        rows = tensor.arange(decoderInputs.shape[1])
        return -m * tensor.log(pred[rows, y])

    costs, _ = theano.scan(
        fn=_NLL, sequences=[softmax_outputs, decoderTarget, decoderMask])
    loss = costs.sum() / decoderMask.sum()

    updates = []
    gparams = [tensor.grad(loss, param) for param in self.params]
    for param, gparam in zip(self.params, gparams):
        updates.append((param, param - self.learning_rate * gparam))

    train_givens = {
        encoderInputs: ei,
        encoderMask: em,
        decoderInputs: di,
        decoderMask: dm,
        decoderTarget: dt
    }
    self._train = theano.function(inputs=[ei, em, di, dm, dt],
                                  outputs=[loss, costs],
                                  updates=updates,
                                  givens=train_givens)

    #####################################################
    # greedy decoding function
    #####################################################
    hs0 = tensor.as_tensor_variable(self.hos, name="hs0")
    Cs0 = tensor.as_tensor_variable(self.Cos, name="Cs0")
    # start tokens and an all-ones mask, both shaped like decoderInputs
    token_idxs = tensor.fill(
        tensor.zeros_like(decoderInputs, dtype="int32"), utils.idx_start)
    msk = tensor.fill((tensor.zeros_like(decoderInputs, dtype="int32")), 1)

    def _step(token_idxs, hs_, Cs_):
        new_hs = []
        new_Cs = []
        state_below = self.lookuptable[token_idxs].reshape(
            (decoderInputs.shape[0], decoderInputs.shape[1],
             self.hidden_size))
        for i, lstm in enumerate(self.decoder_lstm_layers):
            h, C = lstm.forward(state_below, msk, hs_[i], Cs_[i])  #mind msk
            new_hs.append(h[-1])
            new_Cs.append(C[-1])
            state_below = h
        new_hs = tensor.as_tensor_variable(new_hs)
        new_Cs = tensor.as_tensor_variable(new_Cs)
        scores = tensor.dot(state_below, self.linear)
        next_token_idx = tensor.cast(scores.argmax(axis=-1), "int32")
        return next_token_idx, new_hs, new_Cs

    outputs, _ = theano.scan(fn=_step,
                             outputs_info=[token_idxs, hs0, Cs0],
                             n_steps=utils.max_sent_size)
    listof_token_idx = outputs[0]

    utter_givens = {
        encoderInputs: ei,
        encoderMask: em,
        decoderInputs: di
    }
    self._utter = theano.function(
        inputs=[ei, em, di],
        outputs=listof_token_idx,
        givens=utter_givens
        #givens={encoderInputs:ei, encoderMask:em}
    )
def __init__(self, num_actions, phi_length, width, height, discount, learning_rate, decay, momentum=0, batch_size=32, approximator='none'): self._batch_size = batch_size self._num_input_features = phi_length self._phi_length = phi_length self._img_width = width self._img_height = height self._discount = discount self.num_actions = num_actions self.learning_rate = learning_rate self.decay = decay self.momentum = momentum self.scale_input_by = 255.0 # CONSTRUCT THE LAYERS self.q_layers = [] self.q_layers.append( layers.Input2DLayer(self._batch_size, self._num_input_features, self._img_height, self._img_width, self.scale_input_by)) if approximator == 'cuda_conv': self.q_layers.append( cc_layers.ShuffleBC01ToC01BLayer(self.q_layers[-1])) self.q_layers.append( cc_layers.CudaConvnetConv2DLayer(self.q_layers[-1], n_filters=16, filter_size=8, stride=4, weights_std=.01, init_bias_value=0.1)) self.q_layers.append( cc_layers.CudaConvnetConv2DLayer(self.q_layers[-1], n_filters=32, filter_size=4, stride=2, weights_std=.01, init_bias_value=0.1)) self.q_layers.append( cc_layers.ShuffleC01BToBC01Layer(self.q_layers[-1])) elif approximator == 'conv': self.q_layers.append( layers.StridedConv2DLayer(self.q_layers[-1], n_filters=16, filter_width=8, filter_height=8, stride_x=4, stride_y=4, weights_std=.01, init_bias_value=0.01)) self.q_layers.append( layers.StridedConv2DLayer(self.q_layers[-1], n_filters=32, filter_width=4, filter_height=4, stride_x=2, stride_y=2, weights_std=.01, init_bias_value=0.01)) if approximator == 'cuda_conv' or approximator == 'conv': self.q_layers.append( layers.DenseLayer(self.q_layers[-1], n_outputs=256, weights_std=0.01, init_bias_value=0.1, dropout=0, nonlinearity=layers.rectify)) self.q_layers.append( layers.DenseLayer(self.q_layers[-1], n_outputs=num_actions, weights_std=0.01, init_bias_value=0.1, dropout=0, nonlinearity=layers.identity)) if approximator == 'none': self.q_layers.append(\ layers.DenseLayerNoBias(self.q_layers[-1], n_outputs=num_actions, 
weights_std=0.00, dropout=0, nonlinearity=layers.identity)) self.q_layers.append(layers.OutputLayer(self.q_layers[-1])) for i in range(len(self.q_layers) - 1): print self.q_layers[i].get_output_shape() # Now create a network (using the same weights) # for next state q values self.next_layers = copy_layers(self.q_layers) self.next_layers[0] = layers.Input2DLayer(self._batch_size, self._num_input_features, self._img_width, self._img_height, self.scale_input_by) self.next_layers[1].input_layer = self.next_layers[0] self.rewards = T.col() self.actions = T.icol() # Build the loss function ... q_vals = self.q_layers[-1].predictions() next_q_vals = self.next_layers[-1].predictions() next_maxes = T.max(next_q_vals, axis=1, keepdims=True) target = self.rewards + discount * next_maxes target = theano.gradient.consider_constant(target) diff = target - q_vals # Zero out all entries for actions that were not chosen... mask = build_mask(T.zeros_like(diff), self.actions, 1.0) diff_masked = diff * mask error = T.mean(diff_masked**2) self._loss = error * diff_masked.shape[1] # self._parameters = layers.all_parameters(self.q_layers[-1]) self._idx = T.lscalar('idx') # CREATE VARIABLES FOR INPUT AND OUTPUT self.states_shared = theano.shared( np.zeros((1, 1, 1, 1), dtype=theano.config.floatX)) self.states_shared_next = theano.shared( np.zeros((1, 1, 1, 1), dtype=theano.config.floatX)) self.rewards_shared = theano.shared(np.zeros( (1, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared(np.zeros((1, 1), dtype='int32'), broadcastable=(False, True)) self._givens = \ {self.q_layers[0].input_var: self.states_shared[self._idx*self._batch_size: (self._idx+1)*self._batch_size, :, :, :], self.next_layers[0].input_var: self.states_shared_next[self._idx*self._batch_size: (self._idx+1)*self._batch_size, :, :, :], self.rewards: self.rewards_shared[self._idx*self._batch_size: (self._idx+1)*self._batch_size, :], self.actions: 
self.actions_shared[self._idx*self._batch_size: (self._idx+1)*self._batch_size, :] } if self.momentum != 0: self._updates = layers.gen_updates_rmsprop_and_nesterov_momentum(\ self._loss, self._parameters, learning_rate=self.learning_rate, rho=self.decay, momentum=self.momentum, epsilon=1e-6) else: self._updates = layers.gen_updates_rmsprop( self._loss, self._parameters, learning_rate=self.learning_rate, rho=self.decay, epsilon=1e-6) self._train = theano.function([self._idx], self._loss, givens=self._givens, updates=self._updates) self._compute_loss = theano.function([self._idx], self._loss, givens=self._givens) self._compute_q_vals = \ theano.function([self.q_layers[0].input_var], self.q_layers[-1].predictions(), on_unused_input='ignore')
def shift_right(x):
    """Shift ``x`` by one along its first axis, padding the front with zeros.

    The result is a zero slice shaped like ``x[0]`` followed by ``x[:-1]``.
    """
    zero_head = TT.shape_padleft(TT.zeros_like(x[0]))
    all_but_last = x[:-1]
    return TT.concatenate([zero_head, all_but_last])
def build_model(tparams, options):
    """Build the training graph of the bidirectional-encoder attention model.

    Reads ``options['encoder']``, ``options['decoder']``,
    ``options['dim_word']``, ``options['n_words']`` and
    ``options['use_dropout']``; embeddings come from ``tparams['Wemb']`` and
    ``tparams['Wemb_dec']``.

    Returns ``(trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost)`` where
    ``cost`` is the masked negative log-likelihood summed over the first axis
    of ``y`` and ``opt_ret['dec_alphas']`` holds the third decoder output.
    """
    opt_ret = dict()

    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # symbolic inputs; description string: #words x #samples
    x = tensor.matrix('x', dtype='int64')
    x_mask = tensor.matrix('x_mask', dtype='float32')
    y = tensor.matrix('y', dtype='int64')
    y_mask = tensor.matrix('y_mask', dtype='float32')

    n_timesteps = x.shape[0]
    n_timesteps_trg = y.shape[0]
    n_samples = x.shape[1]

    # ---- encoder: forward direction ----
    emb = tparams['Wemb'][x.flatten()]
    emb = emb.reshape([n_timesteps, n_samples, options['dim_word']])
    proj = get_layer(options['encoder'])[1](tparams, emb, options,
                                            prefix='encoder',
                                            mask=x_mask)

    # ---- encoder: backward direction (time-reversed input and mask) ----
    xr = x[::-1]
    xr_mask = x_mask[::-1]
    embr = tparams['Wemb'][xr.flatten()]
    embr = embr.reshape([n_timesteps, n_samples, options['dim_word']])
    projr = get_layer(options['encoder'])[1](tparams, embr, options,
                                             prefix='encoder_r',
                                             mask=xr_mask)

    # context: forward states joined with re-reversed backward states
    forward_states = proj[0]
    backward_states = projr[0][::-1]
    ctx = concatenate([forward_states, backward_states],
                      axis=proj[0].ndim - 1)

    # masked mean of the context over time initialises the decoder
    # (alternative: last forward + backward encoder states)
    # ctx_mean = concatenate([proj[0][-1], projr[0][-1]], axis=proj[0].ndim-2)
    masked_ctx_sum = (ctx * x_mask[:, :, None]).sum(0)
    ctx_mean = masked_ctx_sum / x_mask.sum(0)[:, None]

    init_state = get_layer('ff')[1](tparams, ctx_mean, options,
                                    prefix='ff_state', activ='tanh')

    # ---- target embedding, shifted one step to the right ----
    # The first step is all zeros and the last target word is never used
    # as input.
    emb = tparams['Wemb_dec'][y.flatten()]
    emb = emb.reshape([n_timesteps_trg, n_samples, options['dim_word']])
    emb = tensor.set_subtensor(tensor.zeros_like(emb)[1:], emb[:-1])

    # ---- decoder ----
    proj = get_layer(options['decoder'])[1](tparams, emb, options,
                                            prefix='decoder',
                                            mask=y_mask, context=ctx,
                                            context_mask=x_mask,
                                            one_step=False,
                                            init_state=init_state)
    # decoder outputs: hidden states, attended contexts, attention weights
    proj_h = proj[0]
    ctxs = proj[1]
    opt_ret['dec_alphas'] = proj[2]

    # ---- readout ----
    logit_lstm = get_layer('ff')[1](tparams, proj_h, options,
                                    prefix='ff_logit_lstm', activ='linear')
    logit_prev = get_layer('ff')[1](tparams, emb, options,
                                    prefix='ff_logit_prev', activ='linear')
    logit_ctx = get_layer('ff')[1](tparams, ctxs, options,
                                   prefix='ff_logit_ctx', activ='linear')
    logit = tensor.tanh(logit_lstm + logit_prev + logit_ctx)
    if options['use_dropout']:
        logit = dropout_layer(logit, use_noise, trng)
    logit = get_layer('ff')[1](tparams, logit, options,
                               prefix='ff_logit', activ='linear')

    # softmax over the last axis, with the first two axes flattened
    logit_shp = logit.shape
    flat_logit = logit.reshape([logit_shp[0] * logit_shp[1], logit_shp[2]])
    probs = tensor.nnet.softmax(flat_logit)

    # ---- cost: pick each target word's probability by flat index ----
    y_flat = y.flatten()
    row_offsets = tensor.arange(y_flat.shape[0]) * options['n_words']
    y_flat_idx = row_offsets + y_flat
    cost = -tensor.log(probs.flatten()[y_flat_idx])
    cost = cost.reshape([y.shape[0], y.shape[1]])
    cost = (cost * y_mask).sum(0)

    return trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost
def __init__(self, babi_train_raw, babi_test_raw, word2vec, word_vector_size, memory_hops, dim, mode, input_mask_mode, l2, batch_norm, dropout, **kwargs): print "==> not used params in DMN class:", kwargs.keys() self.vocab = {} self.ivocab = {} self.word2vec = word2vec self.word_vector_size = word_vector_size self.dim = dim self.mode = mode self.input_mask_mode = input_mask_mode self.l2 = l2 self.batch_norm = batch_norm self.dropout = dropout self.memory_hops = memory_hops self.train_input, self.train_q, self.train_answer, self.train_input_mask, self.train_gates = self._process_input( babi_train_raw) self.test_input, self.test_q, self.test_answer, self.test_input_mask, self.test_gates = self._process_input( babi_test_raw) self.vocab_size = len(self.vocab) print "Train size: ", len(self.train_input) print "Test size: ", len(self.test_input) print "Vocab size: ", self.vocab_size self.input_var = T.matrix('input_var') self.q_var = T.matrix('question_var') self.answer_var = T.iscalar('answer_var') self.input_mask_var = T.ivector('input_mask_var') self.gates_var = T.ivector( 'gates_var') # attention gate (including end_reading) self.attentions = [] print "==> building input module" self.W_inp_res_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_upd_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_hid_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) inp_c_history, _ = theano.scan(fn=self.input_gru_step, 
sequences=self.input_var, outputs_info=T.zeros_like( self.b_inp_hid)) self.end_reading = nn_utils.constant_param(value=0.0, shape=(1, self.dim)) inp_c_tag = T.concatenate([inp_c_history, self.end_reading], axis=0) self.inp_c = inp_c_tag.take(self.input_mask_var, axis=0) #(facts_len,dim) self.q_q, _ = theano.scan(fn=self.input_gru_step, sequences=self.q_var, outputs_info=T.zeros_like(self.b_inp_hid)) self.q_q = self.q_q[-1] #(1,dim) print "==> creating parameters for memory module" self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 2)) self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim)) self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, )) print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops memory = [self.q_q.copy()] for iter in range(0, self.memory_hops): current_episode, g = self.new_episode(memory[iter]) self.attentions.append(g) memory.append( self.GRU_update(memory[iter], current_episode, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid)) last_mem_raw = 
memory[-1].dimshuffle(('x', 0)) net = layers.InputLayer(shape=(1, self.dim), input_var=last_mem_raw) if self.dropout > 0 and self.mode == 'train': net = layers.DropoutLayer(net, p=self.dropout) last_mem = layers.get_output(net)[0] self.attentions = T.stack(self.attentions) #(memory_hops, fact_cnt) print "==> building answer module" self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size, self.dim)) self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem)) print "==> collecting all parameters" self.params = [ self.W_inp_res_in, self.W_inp_res_hid, self.b_inp_res, self.W_inp_upd_in, self.W_inp_upd_hid, self.b_inp_upd, self.W_inp_hid_in, self.W_inp_hid_hid, self.b_inp_hid, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, self.W_b, self.W_1, self.W_2, self.b_1, self.b_2, self.W_a ] print "==> building loss layer and computing updates" self.loss_ce = T.nnet.categorical_crossentropy( self.prediction.dimshuffle('x', 0), T.stack([self.answer_var]))[0] if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss_gate = T.nnet.categorical_crossentropy( self.attentions, self.gates_var).mean() self.loss = self.loss_ce + self.loss_l2 + self.loss_gate updates = lasagne.updates.adam(self.loss, self.params) #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.0003) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function( inputs=[ self.input_var, self.q_var, self.answer_var, self.input_mask_var, self.gates_var ], allow_input_downcast=True, outputs=[self.prediction, self.loss, self.attentions], updates=updates) print "==> compiling test_fn" self.test_fn = theano.function( inputs=[ self.input_var, self.q_var, self.answer_var, self.input_mask_var, self.gates_var ], allow_input_downcast=True, outputs=[self.prediction, self.loss, self.attentions]) if self.mode == 
'train': print "==> computing gradients (for debugging)" gradient = T.grad(self.loss, self.params) self.get_gradient_fn = theano.function(inputs=[ self.input_var, self.q_var, self.answer_var, self.input_mask_var, self.gates_var ], allow_input_downcast=True, outputs=gradient)
def get_elementwise_objective(
        policy,
        actions,
        rewards,
        is_alive="always",
        baseline="zeros",
        gamma_or_gammas=0.99,
        crop_last=True,
        treat_policy_as_logpolicy=False, ):
    """
    Compute the elementwise REINFORCE (policy gradient) loss

        L_policy = - log(policy) * (V_reference - baseline) * is_alive

    where V_reference is the discounted sum of future rewards, obtained from
    get_n_step_value_reference with all state values set to zero.
    No state-value loss is added here.

    :param policy: predicted action probabilities, either for all actions,
        shape [batch,tick,action], or for chosen actions only, shape [batch,tick]
    :param actions: [batch,tick] - committed actions
    :param rewards: [batch,tick] - immediate rewards for taking actions at given time ticks
    :param is_alive: [batch,tick] - binary matrix whether given session is active at given tick.
        The string "always" (default) means all ones.
    :param baseline: [batch,tick] - REINFORCE baselines tensor for each batch/tick.
        The string "zeros" (default) means no baseline.
    :param gamma_or_gammas: a single value or array[batch,tick](can broadcast dimensions)
        of delayed reward discounts
    :param crop_last: passed through to get_n_step_value_reference
        (documented upstream as "if True, zeros-out loss at final tick")
    :param treat_policy_as_logpolicy: if True, policy is used as log(pi(a|s)).
        You may want to do this for numerical stability reasons.

    :return: elementwise policy loss [batch,tick]
    """
    # Only a genuine string can be one of the sentinels. Checking the type first
    # avoids running `==` on array-likes, where the comparison may be elementwise
    # and its truth value ambiguous.
    string_types = (type(""), type(u""))

    if isinstance(is_alive, string_types) and is_alive == "always":
        is_alive = T.ones_like(actions, dtype=theano.config.floatX)

    if isinstance(baseline, string_types) and baseline == "zeros":
        baseline = T.zeros_like(rewards, dtype=theano.config.floatX)

    # check dimensions
    assert policy.ndim in (2,3),"policy must have shape either [batch,tick,action], for all actions," \
                                " or [batch,tick], for chosen actions"
    assert actions.ndim == rewards.ndim == is_alive.ndim == 2, "actions, rewards and is_alive must have shape [batch,time]"

    # log-probabilities (taken as-is when the caller already supplies logs)
    logpolicy = T.log(policy) if not treat_policy_as_logpolicy else policy

    # log-probabilities of the actions actually taken; a 2-d policy is assumed
    # to already hold values for the chosen actions only
    given_action_probas = (logpolicy.ndim == 2)
    action_logprobas = logpolicy if given_action_probas else get_values_for_actions(
        logpolicy, actions)

    # reference returns: state values are zeros, so this is a pure
    # discounted-reward reference (n_steps=None, end_at_tmax=True)
    observed_state_values = get_n_step_value_reference(
        state_values=T.zeros_like(rewards, dtype=theano.config.floatX),
        rewards=rewards,
        is_alive=is_alive,
        n_steps=None,
        gamma_or_gammas=gamma_or_gammas,
        end_at_tmax=True,
        crop_last=crop_last, )

    # the advantage is treated as a constant so no gradient flows through it
    advantage = consider_constant(observed_state_values - baseline)

    loss_elwise = -action_logprobas * advantage * is_alive

    return loss_elwise
def build(self):
    """Build the symbolic training graph and return the per-sample cost.

    Creates the input placeholders (stored in ``self.inputs``), encodes the
    source sentence with a forward and a backward GRU, projects the image
    features, runs the multimodal attentive decoder and computes the masked
    negative log-likelihood of the target words.

    Side effects: sets ``self.inputs``, ``self.alphas`` and compiles
    ``self.f_log_probs``.

    :return: symbolic cost summed over target timesteps (one value per sample)
    """

    def feed_forward(inp, prefix, activ):
        # Apply the 'ff' layer registered under `prefix`.
        return get_new_layer('ff')[1](self.tparams, inp,
                                      prefix=prefix, activ=activ)

    def encode(tokens, mask, prefix):
        # Embed (with dropout) and run one direction of the source encoder.
        embedded = dropout(self.tparams['Wemb_enc'][tokens.flatten()],
                           self.trng, self.emb_dropout, self.use_dropout)
        embedded = embedded.reshape(
            [n_timesteps, n_samples, self.embedding_dim])
        return get_new_layer('gru')[1](self.tparams, embedded,
                                       prefix=prefix, mask=mask,
                                       layernorm=self.lnorm)

    def masked_text_mean():
        # Mean of the text context across time, ignoring padded positions.
        return (text_ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None]

    # ------------------------------------------------------------------
    # Placeholders
    # ------------------------------------------------------------------
    # Source sentences: n_timesteps x n_samples
    x = tensor.matrix('x', dtype=INT)
    x_mask = tensor.matrix('x_mask', dtype=FLOAT)
    # Image: 196 (n_annotations) x n_samples x 1024 (conv_dim)
    x_img = tensor.tensor3('x_img', dtype=FLOAT)
    # Target sentences: n_timesteps x n_samples
    y = tensor.matrix('y', dtype=INT)
    y_mask = tensor.matrix('y_mask', dtype=FLOAT)

    # Time-reversed source for the backward encoder
    xr = x[::-1]
    xr_mask = x_mask[::-1]

    # Dimension shorthands
    n_timesteps = x.shape[0]
    n_samples = x.shape[1]
    n_timesteps_trg = y.shape[0]

    self.inputs = OrderedDict([
        ('x', x),            # Source words
        ('x_mask', x_mask),  # Source mask
        ('x_img', x_img),    # Image features
        ('y', y),            # Target labels
        ('y_mask', y_mask),  # Target mask
    ])

    # ------------------------------------------------------------------
    # Bidirectional text encoder
    # ------------------------------------------------------------------
    forw = encode(x, x_mask, 'text_encoder')
    back = encode(xr, xr_mask, 'text_encoder_r')

    # Concatenate forward states with the re-reversed backward states
    # -> n_timesteps x n_samples x 2*rnn_dim
    fwd_states = forw[0]
    text_ctx = tensor.concatenate([fwd_states, back[0][::-1]],
                                  axis=fwd_states.ndim - 1)
    text_ctx = dropout(text_ctx, self.trng, self.ctx_dropout,
                       self.use_dropout)

    # ------------------------------------------------------------------
    # Decoder initial state
    # ------------------------------------------------------------------
    if self.init_cgru == 'text':
        # From the mean source context: n_samples x ctx_dim -> n_samples x rnn_dim
        text_ctx_mean = masked_text_mean()
        init_state = feed_forward(text_ctx_mean, 'ff_text_state_init', 'tanh')
    elif self.init_cgru == 'img':
        # From the mean image annotation: n_samples x conv_dim
        init_state = feed_forward(x_img.mean(axis=0),
                                  'ff_img_state_init', 'tanh')
    elif self.init_cgru == 'textimg':
        # From both modalities: n_samples x (conv_dim + ctx_dim)
        img_ctx_mean = x_img.mean(axis=0)
        text_ctx_mean = masked_text_mean()
        mmodal_ctx = tensor.concatenate([img_ctx_mean, text_ctx_mean],
                                        axis=-1)
        init_state = feed_forward(mmodal_ctx, 'ff_textimg_state_init', 'tanh')
    else:
        # Any other setting: start from zeros
        init_state = tensor.alloc(0., n_samples, self.rnn_dim)

    # ------------------------------------------------------------------
    # Image features projected to ctx_dim -> 196 x n_samples x ctx_dim
    # ------------------------------------------------------------------
    img_ctx = feed_forward(x_img, 'ff_img_adaptor', 'linear')

    # ------------------------------------------------------------------
    # Target embeddings, shifted right by one step
    # ------------------------------------------------------------------
    # (n_trg_timesteps x n_samples x embedding_dim)
    trg_lookup = self.tparams['Wemb_dec'][y.flatten()].reshape(
        [n_timesteps_trg, n_samples, self.embedding_dim])
    # Step 0 stays all-zero (the <bos> placeholder); the last word is
    # dropped because nothing is predicted after it.
    emb_trg = tensor.set_subtensor(
        tensor.zeros_like(trg_lookup)[1:], trg_lookup[:-1])

    # ------------------------------------------------------------------
    # Conditional GRU decoder with attention over both contexts
    # ------------------------------------------------------------------
    dec_mult = self.gru_decoder(self.tparams, emb_trg,
                                prefix='decoder_multi',
                                input_mask=y_mask,
                                ctx1=text_ctx, ctx1_mask=x_mask,
                                ctx2=img_ctx,
                                one_step=False,
                                init_state=init_state)

    # Decoder output layout: hidden states, weighted context sum, then the
    # attention weights.
    h = dec_mult[0]        # (n_timesteps_trg, batch_size, rnn_dim)
    sumctx = dec_mult[1]   # (n_timesteps_trg, batch_size, ctx dim)
    self.alphas = list(dec_mult[2:])

    # ------------------------------------------------------------------
    # Output layer: 3-way merge of state, previous embedding and context
    # ------------------------------------------------------------------
    logit_gru = feed_forward(h, 'ff_logit_gru', 'linear')
    logit_ctx = feed_forward(sumctx, 'ff_logit_ctx', 'linear')

    merged = dropout(tanh(logit_gru + emb_trg + logit_ctx),
                     self.trng, self.out_dropout, self.use_dropout)

    if self.tied_trg_emb is False:
        logit = feed_forward(merged, 'ff_logit', 'linear')
    else:
        # Tied embeddings: reuse the target embedding matrix as output weights
        logit = tensor.dot(merged, self.tparams['Wemb_dec'].T)

    # Negative log-softmax over the vocabulary, one row per (timestep, sample)
    logit_shp = logit.shape
    log_probs = -tensor.nnet.logsoftmax(
        logit.reshape([logit_shp[0] * logit_shp[1], logit_shp[2]]))

    # Pick the entry of the gold word in each row of the flattened matrix
    y_flat = y.flatten()
    gold_positions = (tensor.arange(y_flat.shape[0]) * self.n_words_trg
                      + y_flat)
    cost = log_probs.flatten()[gold_positions].reshape(
        [n_timesteps_trg, n_samples])
    # Mask out padding and sum over time
    cost = (cost * y_mask).sum(0)

    self.f_log_probs = theano.function(list(self.inputs.values()), cost)

    return cost
def test_grad_h(self): "tests that the gradients with respect to h_i are 0 after doing a mean field update of h_i " model = self.model e_step = self.e_step X = self.X assert X.shape[0] == self.m init_H = e_step.init_H_hat(V = X) init_Mu1 = e_step.init_S_hat(V = X) prev_setting = config.compute_test_value config.compute_test_value= 'off' H, Mu1 = function([], outputs=[init_H, init_Mu1])() config.compute_test_value = prev_setting H = broadcast(H, self.m) Mu1 = broadcast(Mu1, self.m) H = np.cast[config.floatX](self.model.rng.uniform(0.,1.,H.shape)) Mu1 = np.cast[config.floatX](self.model.rng.uniform(-5.,5.,Mu1.shape)) H_var = T.matrix(name='H_var') H_var.tag.test_value = H Mu1_var = T.matrix(name='Mu1_var') Mu1_var.tag.test_value = Mu1 idx = T.iscalar() idx.tag.test_value = 0 new_H = e_step.infer_H_hat(V = X, H_hat = H_var, S_hat = Mu1_var) h_idx = new_H[:,idx] updates_func = function([H_var,Mu1_var,idx], h_idx) sigma0 = 1. / model.alpha Sigma1 = e_step.infer_var_s1_hat() mu0 = T.zeros_like(model.mu) #by truncated KL, I mean that I am dropping terms that don't depend on H and Mu1 # (they don't affect the outcome of this test and some of them are intractable ) trunc_kl = - model.entropy_hs(H_hat = H_var, var_s0_hat = sigma0, var_s1_hat = Sigma1) + \ model.expected_energy_vhs(V = X, H_hat = H_var, S_hat = Mu1_var, var_s0_hat = sigma0, var_s1_hat = Sigma1) grad_H = T.grad(trunc_kl.sum(), H_var) assert len(grad_H.type.broadcastable) == 2 #from theano.printing import min_informative_str #print min_informative_str(grad_H) #grad_H = Print('grad_H')(grad_H) #grad_H_idx = grad_H[:,idx] grad_func = function([H_var, Mu1_var], grad_H) failed = False for i in xrange(self.N): rval = updates_func(H, Mu1, i) H[:,i] = rval g = grad_func(H,Mu1)[:,i] assert not np.any(np.isnan(g)) g_abs_max = np.abs(g).max() if g_abs_max > self.tol: #print "new values of H" #print H[:,i] #print "gradient on new values of H" #print g failed = True print 'iteration ',i #print 'max value of new H: 
',H[:,i].max() #print 'H for failing g: ' failing_h = H[np.abs(g) > self.tol, i] #print failing_h #from matplotlib import pyplot as plt #plt.scatter(H[:,i],g) #plt.show() #ignore failures extremely close to h=1 high_mask = failing_h > .001 low_mask = failing_h < .999 mask = high_mask * low_mask print 'masked failures: ',mask.shape[0],' err ',g_abs_max if mask.sum() > 0: print 'failing h passing the range mask' print failing_h[ mask.astype(bool) ] raise Exception('after mean field step, gradient of kl divergence' ' wrt freshly updated variational parameter should be 0, ' 'but here the max magnitude of a gradient element is ' +str(g_abs_max)+' after updating h_'+str(i))
def edge_potn(pdf, copula, theta, edges,Y=None, shared_copula=False):
    '''
    Build the negative log joint probability for every edge from per-node
    marginal pdfs coupled through a copula.

    pdf           : marginal pdfs; cumulated along axis 2 to obtain cdfs
                    (presumably [sample, node, label] -- TODO confirm)
    copula        : callable copula(u, v, d) returning the joint cdf
    theta         : per-edge copula parameters, scanned together with edges
    edges         : node index pairs, transposed so that each scan step
                    receives one pair
    Y             : optional labels; when given, only the probability of the
                    observed label pair is computed for each edge
    shared_copula : if True the copula parameter is used as-is, otherwise it
                    is indexed / tiled per label pair

    Returns -log_prob(jpdf), where jpdf stacks the per-edge results.
    '''
    cdf = TT.extra_ops.cumsum(pdf,axis=2)
    # prepend a zero column so that cdf[..., k+1] - cdf[..., k] is the mass of label k
    cdf = TT.concatenate((TT.zeros_like(cdf[:,:,[0]]),cdf),axis=2)

    def comp_jpdf(cdf, d, y=None):
        '''
        Joint probability for one edge via inclusion-exclusion on the copula.

        cdf : list of cdfs [cdf_1, cdf_2]
        d   : copula parameter(s) for this edge
        y   : list of vectors of labels [y_1, y_2], or None for all label pairs
        '''
        idx = TT.arange(cdf.shape[1])
        # identity check: y may be a symbolic variable whose truth value
        # says nothing about whether labels were supplied
        if y is not None:
            u_0 = cdf[0,idx,y[0]]
            u_1 = cdf[0,idx,y[0]+1]
            v_0 = cdf[1,idx,y[1]]
            v_1 = cdf[1,idx,y[1]+1]
            if not shared_copula:
                # one parameter per label pair: pick the observed pair
                d = d[y[0],y[1]]
            P = copula(u_0,v_0,d)
            P -= copula(u_0,v_1,d)
            P -= copula(u_1,v_0,d)
            P += copula(u_1,v_1,d)
        else:
            # tile both cdfs into a grid over all label pairs
            cdf_0 = TT.extra_ops.repeat(cdf[0].dimshuffle(0,1,'x'),cdf[1].shape[1],2)
            cdf_1 = TT.extra_ops.repeat(cdf[1].dimshuffle(0,'x',1),cdf[0].shape[1],1)
            if shared_copula:
                j_cdf = copula(cdf_0,cdf_1,d)
                P = j_cdf[:,1:,1:] + j_cdf[:,:-1,:-1] - j_cdf[:,:-1,1:] - j_cdf[:,1:,:-1]
            else:
                u11 = cdf_0[:,1:,1:]
                u01 = cdf_0[:,:-1,1:]
                u10 = cdf_0[:,1:,:-1]
                u00 = cdf_0[:,:-1,:-1]

                v11 = cdf_1[:,1:,1:]
                v01 = cdf_1[:,:-1,1:]
                v10 = cdf_1[:,1:,:-1]
                v00 = cdf_1[:,:-1,:-1]

                # repeat the per-label-pair parameters along a new leading axis
                d = d.dimshuffle('x',0,1)
                d = TT.extra_ops.repeat(d,cdf.shape[1],0)

                uv_11 = copula(u11,v11,d)
                uv_00 = copula(u00,v00,d)
                uv_01 = copula(u01,v01,d)
                uv_10 = copula(u10,v10,d)
                P = uv_00+uv_11-uv_10-uv_01
        return P

    cdf = cdf.dimshuffle(1,0,2)
    # NOTE(review): int8 limits labels and node indices to 127 -- confirm
    # this is large enough for every caller.
    if Y is not None:
        Y = Y.T.astype('int8')
    edges = edges.T.astype('int8')

    def inner_function(e, t, cdf):
        # one scan step: e is a pair of node indices, t its copula parameter
        if Y is None:
            jpdf = comp_jpdf(cdf[e], t)
        else:
            jpdf = comp_jpdf(cdf[e], t, Y[e])
        return jpdf

    jpdf , _ = T.scan(
        fn=inner_function,
        sequences=[edges, theta],
        non_sequences=[cdf]
        )

    return -log_prob(jpdf)
def test_grad_s(self):
    """tests that the gradients with respect to s_i are 0 after doing a mean field update of s_i

    For each unit i, column i of Mu1 is replaced by the output of
    e_step.infer_S_hat and the gradient of the truncated KL divergence with
    respect to that column must not exceed self.tol in magnitude.
    """

    model = self.model
    e_step = self.e_step
    X = self.X

    assert X.shape[0] == self.m

    model.test_batch_size = X.shape[0]

    init_H = e_step.init_H_hat(V = X)
    init_Mu1 = e_step.init_S_hat(V = X)

    # Evaluate the initial values with test-value computation switched off;
    # the finally clause restores the global setting even if compiling or
    # running the function raises, so other tests are not affected.
    prev_setting = config.compute_test_value
    config.compute_test_value = 'off'
    try:
        H, Mu1 = function([], outputs=[init_H, init_Mu1])()
    finally:
        config.compute_test_value = prev_setting

    H = broadcast(H, self.m)
    Mu1 = broadcast(Mu1, self.m)

    # Only the shapes of the initial values are kept; the contents are random
    H = np.cast[config.floatX](self.model.rng.uniform(0.,1.,H.shape))
    Mu1 = np.cast[config.floatX](self.model.rng.uniform(-5.,5.,Mu1.shape))

    H_var = T.matrix(name='H_var')
    H_var.tag.test_value = H
    Mu1_var = T.matrix(name='Mu1_var')
    Mu1_var.tag.test_value = Mu1
    idx = T.iscalar()
    idx.tag.test_value = 0

    S = e_step.infer_S_hat(V = X, H_hat = H_var, S_hat = Mu1_var)

    s_idx = S[:,idx]

    s_i_func = function([H_var,Mu1_var,idx],s_idx)

    sigma0 = 1. / model.alpha
    Sigma1 = e_step.infer_var_s1_hat()

    #by truncated KL, I mean that I am dropping terms that don't depend on H and Mu1
    # (they don't affect the outcome of this test and some of them are intractable )
    trunc_kl = - model.entropy_hs(H_hat = H_var, var_s0_hat = sigma0, var_s1_hat = Sigma1) + \
             model.expected_energy_vhs(V = X, H_hat = H_var, S_hat = Mu1_var, var_s0_hat = sigma0, var_s1_hat = Sigma1)

    grad_Mu1 = T.grad(trunc_kl.sum(), Mu1_var)

    grad_Mu1_idx = grad_Mu1[:,idx]

    grad_func = function([H_var, Mu1_var, idx], grad_Mu1_idx)

    for i in xrange(self.N):
        # coordinate update of s_i, then gradient at the updated point
        Mu1[:,i] = s_i_func(H, Mu1, i)

        g = grad_func(H,Mu1,i)

        assert not np.any(np.isnan(g))

        g_abs_max = np.abs(g).max()

        if g_abs_max > self.tol:
            raise Exception('after mean field step, gradient of kl divergence wrt mean field parameter should be 0, but here the max magnitude of a gradient element is '+str(g_abs_max)+' after updating s_'+str(i))
def test_value_h(self): "tests that the value of the kl divergence decreases with each update to h_i " model = self.model e_step = self.e_step X = self.X assert X.shape[0] == self.m init_H = e_step.init_H_hat(V = X) init_Mu1 = e_step.init_S_hat(V = X) prev_setting = config.compute_test_value config.compute_test_value= 'off' H, Mu1 = function([], outputs=[init_H, init_Mu1])() config.compute_test_value = prev_setting H = broadcast(H, self.m) Mu1 = broadcast(Mu1, self.m) H = np.cast[config.floatX](self.model.rng.uniform(0.,1.,H.shape)) Mu1 = np.cast[config.floatX](self.model.rng.uniform(-5.,5.,Mu1.shape)) H_var = T.matrix(name='H_var') H_var.tag.test_value = H Mu1_var = T.matrix(name='Mu1_var') Mu1_var.tag.test_value = Mu1 idx = T.iscalar() idx.tag.test_value = 0 newH = e_step.infer_H_hat(V = X, H_hat = H_var, S_hat = Mu1_var) h_idx = newH[:,idx] h_i_func = function([H_var,Mu1_var,idx],h_idx) sigma0 = 1. / model.alpha Sigma1 = e_step.infer_var_s1_hat() mu0 = T.zeros_like(model.mu) #by truncated KL, I mean that I am dropping terms that don't depend on H and Mu1 # (they don't affect the outcome of this test and some of them are intractable ) trunc_kl = - model.entropy_hs(H_hat = H_var, var_s0_hat = sigma0, var_s1_hat = Sigma1) + \ model.expected_energy_vhs(V = X, H_hat = H_var, S_hat = Mu1_var, var_s0_hat = sigma0, var_s1_hat = Sigma1) trunc_kl_func = function([H_var, Mu1_var], trunc_kl) for i in xrange(self.N): prev_kl = trunc_kl_func(H,Mu1) H[:,i] = h_i_func(H, Mu1, i) #we don't update mu, the whole point of the split e step is we don't have to new_kl = trunc_kl_func(H,Mu1) increase = new_kl - prev_kl print 'failures after iteration ',i,': ',(increase > self.tol).sum() mx = increase.max() if mx > 1e-4: print 'increase amounts of failing examples:' print increase[increase > self.tol] print 'failing H:' print H[increase > self.tol,:] print 'failing Mu1:' print Mu1[increase > self.tol,:] print 'failing V:' print X[increase > self.tol,:] raise Exception('after mean field 
step in h, kl divergence should decrease, but some elements increased by as much as '+str(mx)+' after updating h_'+str(i))
def experiment(state, channel): if state.test_model and 'config' in os.listdir('.'): print 'Loading local config file' config_file = open('config', 'r') config = config_file.readlines() try: config_vals = config[0].split('(')[1:][0].split(')')[:-1][0].split( ', ') except: config_vals = config[0][3:-1].replace(': ', '=').replace("'", "").split(', ') config_vals = filter( lambda x: not 'jobman' in x and not '/' in x and not ':' in x and not 'experiment' in x, config_vals) for CV in config_vals: print CV if CV.startswith('test'): print 'Do not override testing switch' continue try: exec('state.' + CV) in globals(), locals() except: exec('state.' + CV.split('=')[0] + "='" + CV.split('=')[1] + "'") in globals(), locals() else: # Save the current configuration # Useful for logs/experiments print 'Saving config' f = open('config', 'w') f.write(str(state)) f.close() print state # Load the data, train = train+valid, and shuffle train # Targets are not used (will be misaligned after shuffling train if state.dataset == 'MNIST': (train_X, train_Y), (valid_X, valid_Y), (test_X, test_Y) = load_mnist(state.data_path) train_X = numpy.concatenate((train_X, valid_X)) elif state.dataset == 'MNIST_binary': (train_X, train_Y), (valid_X, valid_Y), (test_X, test_Y) = load_mnist_binary(state.data_path) train_X = numpy.concatenate((train_X, valid_X)) elif state.dataset == 'TFD': (train_X, train_Y), (valid_X, valid_Y), (test_X, test_Y) = load_tfd(state.data_path) N_input = train_X.shape[1] root_N_input = numpy.sqrt(N_input) numpy.random.seed(1) numpy.random.shuffle(train_X) train_X = theano.shared(train_X) valid_X = theano.shared(valid_X) test_X = theano.shared(test_X) # Theano variables and RNG X = T.fmatrix() index = T.lscalar() MRG = RNG_MRG.MRG_RandomStreams(1) # Network and training specifications K = state.K # N hidden layers N = state.N # number of walkbacks layer_sizes = [ N_input ] + [state.hidden_size ] * K # layer sizes, from h0 to hK (h0 is the visible layer) learning_rate = 
theano.shared(cast32(state.learning_rate)) # learning rate annealing = cast32(state.annealing) # exponential annealing coefficient momentum = theano.shared(cast32(state.momentum)) # momentum term # THEANO VARIABLES X = T.fmatrix() # Input of the graph index = T.lscalar() # index to minibatch MRG = RNG_MRG.MRG_RandomStreams(1) # PARAMETERS : weights list and bias list. # initialize a list of weights and biases based on layer_sizes weights_list = [ get_shared_weights( layer_sizes[i], layer_sizes[i + 1], numpy.sqrt(6. / (layer_sizes[i] + layer_sizes[i + 1])), 'W') for i in range(K) ] bias_list = [get_shared_bias(layer_sizes[i], 'b') for i in range(K + 1)] if state.test_model: # Load the parameters of the last epoch # maybe if the path is given, load these specific attributes param_files = filter(lambda x: 'params' in x, os.listdir('.')) max_epoch_idx = numpy.argmax( [int(x.split('_')[-1].split('.')[0]) for x in param_files]) params_to_load = param_files[max_epoch_idx] PARAMS = cPickle.load(open(params_to_load, 'r')) [ p.set_value(lp.get_value(borrow=False)) for lp, p in zip(PARAMS[:len(weights_list)], weights_list) ] [ p.set_value(lp.get_value(borrow=False)) for lp, p in zip(PARAMS[len(weights_list):], bias_list) ] # Util functions def dropout(IN, p=0.5): noise = MRG.binomial(p=p, n=1, size=IN.shape, dtype='float32') OUT = (IN * noise) / cast32(p) return OUT def add_gaussian_noise(IN, std=1): print 'GAUSSIAN NOISE : ', std noise = MRG.normal(avg=0, std=std, size=IN.shape, dtype='float32') OUT = IN + noise return OUT def corrupt_input(IN, p=0.5): # salt and pepper? masking? 
noise = MRG.binomial(p=p, n=1, size=IN.shape, dtype='float32') IN = IN * noise return IN def salt_and_pepper(IN, p=0.2): # salt and pepper noise print 'DAE uses salt and pepper noise' a = MRG.binomial(size=IN.shape, n=1, p=1 - p, dtype='float32') b = MRG.binomial(size=IN.shape, n=1, p=0.5, dtype='float32') c = T.eq(a, 0) * b return IN * a + c # Odd layer update function # just a loop over the odd layers def update_odd_layers(hiddens, noisy): for i in range(1, K + 1, 2): print i if noisy: simple_update_layer(hiddens, None, i) else: simple_update_layer(hiddens, None, i, add_noise=False) # Even layer update # p_X_chain is given to append the p(X|...) at each update (one update = odd update + even update) def update_even_layers(hiddens, p_X_chain, noisy): for i in range(0, K + 1, 2): print i if noisy: simple_update_layer(hiddens, p_X_chain, i) else: simple_update_layer(hiddens, p_X_chain, i, add_noise=False) # The layer update function # hiddens : list containing the symbolic theano variables [visible, hidden1, hidden2, ...] # layer_update will modify this list inplace # p_X_chain : list containing the successive p(X|...) 
at each update # update_layer will append to this list # add_noise : pre and post activation gaussian noise def simple_update_layer(hiddens, p_X_chain, i, add_noise=True): # Compute the dot product, whatever layer post_act_noise = 0 if i == 0: hiddens[i] = T.dot(hiddens[i + 1], weights_list[i].T) + bias_list[i] elif i == K: hiddens[i] = T.dot(hiddens[i - 1], weights_list[i - 1]) + bias_list[i] else: # next layer : layers[i+1], assigned weights : W_i # previous layer : layers[i-1], assigned weights : W_(i-1) hiddens[i] = T.dot(hiddens[i + 1], weights_list[i].T) + T.dot( hiddens[i - 1], weights_list[i - 1]) + bias_list[i] # Add pre-activation noise if NOT input layer if i == 1 and state.noiseless_h1: print '>>NO noise in first layer' add_noise = False # pre activation noise if i != 0 and add_noise: print 'Adding pre-activation gaussian noise' hiddens[i] = add_gaussian_noise(hiddens[i], state.hidden_add_noise_sigma) # ACTIVATION! if i == 0: print 'Sigmoid units' hiddens[i] = T.nnet.sigmoid(hiddens[i]) else: print 'Hidden units' hiddens[i] = hidden_activation(hiddens[i]) # post activation noise if i != 0 and add_noise: print 'Adding post-activation gaussian noise' hiddens[i] = add_gaussian_noise(hiddens[i], state.hidden_add_noise_sigma) # build the reconstruction chain if i == 0: # if input layer -> append p(X|...) p_X_chain.append(hiddens[i]) # sample from p(X|...) 
if state.input_sampling: print 'Sampling from input' sampled = MRG.binomial(p=hiddens[i], size=hiddens[i].shape, dtype='float32') else: print '>>NO input sampling' sampled = hiddens[i] # add noise sampled = salt_and_pepper(sampled, state.input_salt_and_pepper) # set input layer hiddens[i] = sampled def update_layers(hiddens, p_X_chain, noisy=True): print 'odd layer update' update_odd_layers(hiddens, noisy) print print 'even layer update' update_even_layers(hiddens, p_X_chain, noisy) ''' F PROP ''' #X = T.fmatrix() if state.act == 'sigmoid': print 'Using sigmoid activation' hidden_activation = T.nnet.sigmoid elif state.act == 'rectifier': print 'Using rectifier activation' hidden_activation = lambda x: T.maximum(cast32(0), x) elif state.act == 'tanh': hidden_activation = lambda x: T.tanh(x) ''' Corrupt X ''' X_corrupt = salt_and_pepper(X, state.input_salt_and_pepper) ''' hidden layer init ''' hiddens = [X_corrupt] p_X_chain = [] print "Hidden units initialization" for w, b in zip(weights_list, bias_list[1:]): # init with zeros print "Init hidden units at zero before creating the graph" hiddens.append(T.zeros_like(T.dot(hiddens[-1], w))) # The layer update scheme print "Building the graph :", N, "updates" for i in range(N): update_layers(hiddens, p_X_chain) # COST AND GRADIENTS print 'Cost w.r.t p(X|...) 
at every step in the graph' #COST = T.mean(T.nnet.binary_crossentropy(reconstruction, X)) COST = [T.mean(T.nnet.binary_crossentropy(rX, X)) for rX in p_X_chain] show_COST = COST[-1] COST = numpy.sum(COST) params = weights_list + bias_list gradient = T.grad(COST, params) gradient_buffer = [ theano.shared(numpy.zeros(x.get_value().shape, dtype='float32')) for x in params ] m_gradient = [ momentum * gb + (cast32(1) - momentum) * g for (gb, g) in zip(gradient_buffer, gradient) ] g_updates = [(p, p - learning_rate * mg) for (p, mg) in zip(params, m_gradient)] b_updates = zip(gradient_buffer, m_gradient) updates = OrderedDict(g_updates + b_updates) f_cost = theano.function(inputs=[X], outputs=show_COST) indexed_batch = train_X[index * state.batch_size:(index + 1) * state.batch_size] sampled_batch = MRG.binomial(p=indexed_batch, size=indexed_batch.shape, dtype='float32') f_learn = theano.function(inputs=[index], updates=updates, givens={X: indexed_batch}, outputs=show_COST) f_test = theano.function(inputs=[X], outputs=[X_corrupt] + hiddens[0] + p_X_chain, on_unused_input='warn') ############# # Denoise some numbers : show number, noisy number, reconstructed number ############# import random as R R.seed(1) random_idx = numpy.array(R.sample(range(len(test_X.get_value())), 100)) numbers = test_X.get_value()[random_idx] f_noise = theano.function(inputs=[X], outputs=salt_and_pepper( X, state.input_salt_and_pepper)) noisy_numbers = f_noise(test_X.get_value()[random_idx]) # Recompile the graph without noise for reconstruction function hiddens_R = [X] p_X_chain_R = [] for w, b in zip(weights_list, bias_list[1:]): # init with zeros hiddens_R.append(T.zeros_like(T.dot(hiddens_R[-1], w))) # The layer update scheme for i in range(N): update_layers(hiddens_R, p_X_chain_R, noisy=False) f_recon = theano.function(inputs=[X], outputs=p_X_chain_R[-1]) ############ # Sampling # ############ # the input to the sampling function network_state_input = [X] + [T.fmatrix() for i in range(K)] # 
"Output" state of the network (noisy) # initialized with input, then we apply updates #network_state_output = network_state_input network_state_output = [X] + network_state_input[1:] visible_pX_chain = [] # ONE update update_layers(network_state_output, visible_pX_chain, noisy=True) if K == 1: f_sample_simple = theano.function(inputs=[X], outputs=visible_pX_chain[-1]) # WHY IS THERE A WARNING???? # because the first odd layers are not used -> directly computed FROM THE EVEN layers # unused input = warn f_sample2 = theano.function(inputs=network_state_input, outputs=network_state_output + visible_pX_chain, on_unused_input='warn') def sample_some_numbers_single_layer(): x0 = test_X.get_value()[:1] samples = [x0] x = f_noise(x0) for i in range(399): x = f_sample_simple(x) samples.append(x) x = numpy.random.binomial(n=1, p=x, size=x.shape).astype('float32') x = f_noise(x) return numpy.vstack(samples) def sampling_wrapper(NSI): out = f_sample2(*NSI) NSO = out[:len(network_state_output)] vis_pX_chain = out[len(network_state_output):] return NSO, vis_pX_chain def sample_some_numbers(N=400): # The network's initial state init_vis = test_X.get_value()[:1] noisy_init_vis = f_noise(init_vis) network_state = [[noisy_init_vis] + [ numpy.zeros((1, len(b.get_value())), dtype='float32') for b in bias_list[1:] ]] visible_chain = [init_vis] noisy_h0_chain = [noisy_init_vis] for i in range(N - 1): # feed the last state into the network, compute new state, and obtain visible units expectation chain net_state_out, vis_pX_chain = sampling_wrapper(network_state[-1]) # append to the visible chain visible_chain += vis_pX_chain # append state output to the network state chain network_state.append(net_state_out) noisy_h0_chain.append(net_state_out[0]) return numpy.vstack(visible_chain), numpy.vstack(noisy_h0_chain) def plot_samples(epoch_number): to_sample = time.time() if K == 1: # one layer model V = sample_some_numbers_single_layer() else: V, H0 = sample_some_numbers() img_samples = 
PIL.Image.fromarray( tile_raster_images(V, (root_N_input, root_N_input), (20, 20))) fname = 'samples_epoch_' + str(epoch_number) + '.png' img_samples.save(fname) print 'Took ' + str(time.time() - to_sample) + ' to sample 400 numbers' ############## # Inpainting # ############## def inpainting(digit): # The network's initial state # NOISE INIT init_vis = cast32(numpy.random.uniform(size=digit.shape)) #noisy_init_vis = f_noise(init_vis) #noisy_init_vis = cast32(numpy.random.uniform(size=init_vis.shape)) # INDEXES FOR VISIBLE AND NOISY PART noise_idx = (numpy.arange(N_input) % root_N_input < (root_N_input / 2)) fixed_idx = (numpy.arange(N_input) % root_N_input > (root_N_input / 2)) # function to re-init the visible to the same noise # FUNCTION TO RESET HALF VISIBLE TO DIGIT def reset_vis(V): V[0][fixed_idx] = digit[0][fixed_idx] return V # INIT DIGIT : NOISE and RESET HALF TO DIGIT init_vis = reset_vis(init_vis) network_state = [[init_vis] + [ numpy.zeros((1, len(b.get_value())), dtype='float32') for b in bias_list[1:] ]] visible_chain = [init_vis] noisy_h0_chain = [init_vis] for i in range(49): # feed the last state into the network, compute new state, and obtain visible units expectation chain net_state_out, vis_pX_chain = sampling_wrapper(network_state[-1]) # reset half the digit net_state_out[0] = reset_vis(net_state_out[0]) vis_pX_chain[0] = reset_vis(vis_pX_chain[0]) # append to the visible chain visible_chain += vis_pX_chain # append state output to the network state chain network_state.append(net_state_out) noisy_h0_chain.append(net_state_out[0]) return numpy.vstack(visible_chain), numpy.vstack(noisy_h0_chain) def save_params(n, params): print 'saving parameters...' 
save_path = 'params_epoch_' + str(n) + '.pkl' f = open(save_path, 'wb') try: cPickle.dump(params, f, protocol=cPickle.HIGHEST_PROTOCOL) finally: f.close() # TRAINING n_epoch = state.n_epoch batch_size = state.batch_size STOP = False counter = 0 train_costs = [] valid_costs = [] test_costs = [] if state.vis_init: bias_list[0].set_value( logit(numpy.clip(0.9, 0.001, train_X.get_value().mean(axis=0)))) if state.test_model: # If testing, do not train and go directly to generating samples, parzen window estimation, and inpainting print 'Testing : skip training' STOP = True while not STOP: counter += 1 t = time.time() print counter, '\t', #train train_cost = [] for i in range(len(train_X.get_value(borrow=True)) / batch_size): #train_cost.append(f_learn(train_X[i * batch_size : (i+1) * batch_size])) #training_idx = numpy.array(range(i*batch_size, (i+1)*batch_size), dtype='int32') train_cost.append(f_learn(i)) train_cost = numpy.mean(train_cost) train_costs.append(train_cost) print 'Train : ', trunc(train_cost), '\t', #valid valid_cost = [] for i in range(len(valid_X.get_value(borrow=True)) / 100): valid_cost.append( f_cost(valid_X.get_value()[i * 100:(i + 1) * batch_size])) valid_cost = numpy.mean(valid_cost) #valid_cost = 123 valid_costs.append(valid_cost) print 'Valid : ', trunc(valid_cost), '\t', #test test_cost = [] for i in range(len(test_X.get_value(borrow=True)) / 100): test_cost.append( f_cost(test_X.get_value()[i * 100:(i + 1) * batch_size])) test_cost = numpy.mean(test_cost) test_costs.append(test_cost) print 'Test : ', trunc(test_cost), '\t', if counter >= n_epoch: STOP = True print 'time : ', trunc(time.time() - t), print 'MeanVisB : ', trunc(bias_list[0].get_value().mean()), print 'W : ', [ trunc(abs(w.get_value(borrow=True)).mean()) for w in weights_list ] if (counter % 5) == 0: # Checking reconstruction reconstructed = f_recon(noisy_numbers) # Concatenate stuff stacked = numpy.vstack([ numpy.vstack([ numbers[i * 10:(i + 1) * 10], noisy_numbers[i * 10:(i + 
1) * 10], reconstructed[i * 10:(i + 1) * 10] ]) for i in range(10) ]) number_reconstruction = PIL.Image.fromarray( tile_raster_images(stacked, (root_N_input, root_N_input), (10, 30))) #epoch_number = reduce(lambda x,y : x + y, ['_'] * (4-len(str(counter)))) + str(counter) number_reconstruction.save('number_reconstruction' + str(counter) + '.png') #sample_numbers(counter, 'seven') plot_samples(counter) #save params save_params(counter, params) # ANNEAL! new_lr = learning_rate.get_value() * annealing learning_rate.set_value(new_lr) # Save state.train_costs = train_costs state.valid_costs = valid_costs state.test_costs = test_costs # if test # 10k samples print 'Generating 10,000 samples' samples, _ = sample_some_numbers(N=10000) f_samples = 'samples.npy' numpy.save(f_samples, samples) print 'saved digits' # parzen print 'Evaluating parzen window' import likelihood_estimation_parzen likelihood_estimation_parzen.main(0.20, 'mnist') # Inpainting print 'Inpainting' test_X = test_X.get_value() numpy.random.seed(2) test_idx = numpy.arange(len(test_Y)) for Iter in range(10): numpy.random.shuffle(test_idx) test_X = test_X[test_idx] test_Y = test_Y[test_idx] digit_idx = [(test_Y == i).argmax() for i in range(10)] inpaint_list = [] for idx in digit_idx: DIGIT = test_X[idx:idx + 1] V_inpaint, H_inpaint = inpainting(DIGIT) inpaint_list.append(V_inpaint) INPAINTING = numpy.vstack(inpaint_list) plot_inpainting = PIL.Image.fromarray( tile_raster_images(INPAINTING, (root_N_input, root_N_input), (10, 50))) fname = 'inpainting_' + str(Iter) + '.png' #fname = os.path.join(state.model_path, fname) plot_inpainting.save(fname) if False and __name__ == "__main__": os.system('eog inpainting.png') if __name__ == '__main__': import ipdb ipdb.set_trace() return
def test_value_s(self):
    """Test that the KL divergence decreases with each update to s_i.

    Starting from random values for H and Mu1, every column ``i`` of Mu1
    is replaced in turn by the matching column of
    ``e_step.infer_S_hat``. The truncated KL divergence is evaluated
    before and after each replacement.

    Raises:
        Exception: if any element of the truncated KL grows by more than
            1e-3 after updating some s_i.
    """
    model = self.model
    e_step = self.e_step
    X = self.X

    assert X.shape[0] == self.m

    init_H = e_step.init_H_hat(V=X)
    init_Mu1 = e_step.init_S_hat(V=X)

    # Test values are switched off while the initial estimates are
    # evaluated. try/finally guarantees the global Theano setting is put
    # back even if compiling or running that function fails, so a failure
    # here cannot leak into other tests.
    prev_setting = config.compute_test_value
    config.compute_test_value = 'off'
    try:
        H, Mu1 = function([], outputs=[init_H, init_Mu1])()
    finally:
        config.compute_test_value = prev_setting

    H = broadcast(H, self.m)
    Mu1 = broadcast(Mu1, self.m)

    # Only the shapes of the initial estimates are used from here on: the
    # values themselves are overwritten with uniform random draws.
    H = np.cast[config.floatX](self.model.rng.uniform(0., 1., H.shape))
    Mu1 = np.cast[config.floatX](self.model.rng.uniform(-5., 5., Mu1.shape))

    H_var = T.matrix(name='H_var')
    H_var.tag.test_value = H
    Mu1_var = T.matrix(name='Mu1_var')
    Mu1_var.tag.test_value = Mu1
    idx = T.iscalar()
    idx.tag.test_value = 0

    # Compiled function returning column `idx` of the updated S estimate.
    S = e_step.infer_S_hat(V=X, H_hat=H_var, S_hat=Mu1_var)
    s_idx = S[:, idx]
    s_i_func = function([H_var, Mu1_var, idx], s_idx)

    sigma0 = 1. / model.alpha
    Sigma1 = e_step.infer_var_s1_hat()

    # by truncated KL, I mean that I am dropping terms that don't depend on
    # H and Mu1 (they don't affect the outcome of this test and some of
    # them are intractable)
    trunc_kl = - model.entropy_hs(H_hat=H_var,
                                  var_s0_hat=sigma0,
                                  var_s1_hat=Sigma1) + \
        model.expected_energy_vhs(V=X,
                                  H_hat=H_var,
                                  S_hat=Mu1_var,
                                  var_s0_hat=sigma0,
                                  var_s1_hat=Sigma1)
    trunc_kl_func = function([H_var, Mu1_var], trunc_kl)

    for i in xrange(self.N):
        prev_kl = trunc_kl_func(H, Mu1)

        # In-place update of a single column, so later iterations see it.
        Mu1[:, i] = s_i_func(H, Mu1, i)

        new_kl = trunc_kl_func(H, Mu1)

        increase = new_kl - prev_kl
        mx = increase.max()

        if mx > 1e-3:
            raise Exception('after mean field step in s, kl divergence should decrease, but some elements increased by as much as '+str(mx)+' after updating s_'+str(i))
def __init__(self, babi_train_raw, babi_test_raw, word2vec, word_vector_size, dim, mode, input_mask_mode, memory_hops, l2, normalize_attention, **kwargs): print "==> not used params in DMN class:", kwargs.keys() self.vocab = {} self.ivocab = {} self.word2vec = word2vec self.word_vector_size = word_vector_size self.dim = dim self.mode = mode self.input_mask_mode = input_mask_mode self.memory_hops = memory_hops #self.batch_size = 1 self.l2 = l2 self.normalize_attention = normalize_attention self.train_input, self.train_q, self.train_answer, self.train_choices, self.train_input_mask = self._process_input( babi_train_raw) self.test_input, self.test_q, self.test_answer, self.test_choices, self.test_input_mask = self._process_input( babi_test_raw) self.vocab_size = 4 # number of answer choices self.inp_var = T.matrix('input_var') self.q_var = T.matrix('question_var') self.ca_var = T.matrix('ca_var') self.cb_var = T.matrix('cb_var') self.cc_var = T.matrix('cc_var') self.cd_var = T.matrix('cd_var') self.ans_var = T.iscalar('answer_var') self.input_mask_var = T.ivector('input_mask_var') print "==> building input module" self.W_inp_res_in = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.word_vector_size)), borrow=True) self.W_inp_res_hid = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.dim)), borrow=True) self.b_inp_res = theano.shared(lasagne.init.Constant(0.0).sample( (self.dim, )), borrow=True) self.W_inp_upd_in = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.word_vector_size)), borrow=True) self.W_inp_upd_hid = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.dim)), borrow=True) self.b_inp_upd = theano.shared(lasagne.init.Constant(0.0).sample( (self.dim, )), borrow=True) self.W_inp_hid_in = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.word_vector_size)), borrow=True) self.W_inp_hid_hid = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.dim)), borrow=True) self.b_inp_hid = 
theano.shared(lasagne.init.Constant(0.0).sample( (self.dim, )), borrow=True) inp_c_history, _ = theano.scan(fn=self.input_gru_step, sequences=self.inp_var, outputs_info=T.zeros_like( self.b_inp_hid)) self.inp_c = inp_c_history.take(self.input_mask_var, axis=0) self.q_q, _ = theano.scan(fn=self.input_gru_step, sequences=self.q_var, outputs_info=T.zeros_like(self.b_inp_hid)) self.q_q = self.q_q[-1] self.c_vecs = [] for choice in [self.ca_var, self.cb_var, self.cc_var, self.cd_var]: history, _ = theano.scan(fn=self.input_gru_step, sequences=choice, outputs_info=T.zeros_like(self.b_inp_hid)) self.c_vecs.append(history[-1]) self.c_vecs = T.stack(self.c_vecs).transpose((1, 0)) # (dim, 4) self.inp_c = T.stack([self.inp_c] * 4).transpose( (1, 2, 0)) # (fact_cnt, dim, 4) self.q_q = T.stack([self.q_q] * 4).transpose((1, 0)) # (dim, 4) print "==> creating parameters for memory module" self.W_mem_res_in = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.dim)), borrow=True) self.W_mem_res_hid = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.dim)), borrow=True) self.b_mem_res = theano.shared(lasagne.init.Constant(0.0).sample( (self.dim, )), borrow=True) self.W_mem_upd_in = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.dim)), borrow=True) self.W_mem_upd_hid = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.dim)), borrow=True) self.b_mem_upd = theano.shared(lasagne.init.Constant(0.0).sample( (self.dim, )), borrow=True) self.W_mem_hid_in = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.dim)), borrow=True) self.W_mem_hid_hid = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.dim)), borrow=True) self.b_mem_hid = theano.shared(lasagne.init.Constant(0.0).sample( (self.dim, )), borrow=True) self.W_b = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.dim)), borrow=True) self.W_1 = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, 10 * self.dim + 3)), borrow=True) self.W_2 = 
theano.shared(lasagne.init.Normal(0.1).sample( (1, self.dim)), borrow=True) self.b_1 = theano.shared(lasagne.init.Constant(0.0).sample( (self.dim, )), borrow=True) self.b_2 = theano.shared(lasagne.init.Constant(0.0).sample((1, )), borrow=True) print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops memory = [self.q_q.copy()] # (dim, 4) for iter in range(1, self.memory_hops + 1): current_episode = self.new_episode(memory[iter - 1]) memory.append( self.GRU_update_batch(memory[iter - 1], current_episode, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid)) last_mem = memory[-1].flatten() print "==> building answer module" self.W_a = theano.shared(lasagne.init.Normal(0.1).sample( (self.vocab_size, 4 * self.dim)), borrow=True) self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem)) print "==> collecting all parameters" self.params = [ self.W_inp_res_in, self.W_inp_res_hid, self.b_inp_res, self.W_inp_upd_in, self.W_inp_upd_hid, self.b_inp_upd, self.W_inp_hid_in, self.W_inp_hid_hid, self.b_inp_hid, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, self.W_b, self.W_1, self.W_2, self.b_1, self.b_2, self.W_a ] print "==> building loss layer and computing updates" self.loss_ce = T.nnet.categorical_crossentropy( self.prediction.dimshuffle('x', 0), T.stack([self.ans_var]))[0] if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 updates = lasagne.updates.adadelta(self.loss, self.params) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function( inputs=[ self.inp_var, self.q_var, self.ans_var, self.ca_var, self.cb_var, self.cc_var, self.cd_var, self.input_mask_var ], outputs=[self.prediction, self.loss], 
updates=updates) print "==> compiling test_fn" self.test_fn = theano.function(inputs=[ self.inp_var, self.q_var, self.ans_var, self.ca_var, self.cb_var, self.cc_var, self.cd_var, self.input_mask_var ], outputs=[ self.prediction, self.loss, self.inp_c, self.q_q, last_mem ]) if self.mode == 'train': print "==> computing gradients (for debugging)" gradient = T.grad(self.loss, self.params) self.get_gradient_fn = theano.function(inputs=[ self.inp_var, self.q_var, self.ans_var, self.ca_var, self.cb_var, self.cc_var, self.cd_var, self.input_mask_var ], outputs=gradient)
def __init__(self, inpt, wts, centers, rand_gen=None, n_in=None,
             n_features=None, n_classes=None, kind='LOGIT',
             learn_centers=False, junk_dist=np.inf, reg=()):
    """Hidden layer whose features are scored against per-class centers.

    Args:
        inpt: input passed through to ``HiddenLayer.__init__``.
        wts: hidden-layer weights (n_in x n_features), or None.
        centers: class centers (n_classes x n_features). May be None
            (drawn from ``rand_gen``), a shared variable (used as is) or
            a plain array (wrapped in a new shared variable).
        rand_gen: random generator passed to ``HiddenLayer`` and used to
            draw the centers when ``centers`` is None.
        n_in: input size; required when ``wts`` is None.
        n_features: number of features; required when both ``wts`` and
            ``centers`` are None.
        n_classes: number of classes; required when ``centers`` is None.
        kind: key into ``activs``. ``'LOGIT'`` and ``'RBF'`` are the two
            kinds for which outputs are built here.
        learn_centers: if true the centers are appended to
            ``self.params``; only allowed with ``kind == 'RBF'``.
        junk_dist: constant distance of the extra "junk" column that is
            appended to the RBF distances.
        reg: passed through to ``HiddenLayer.__init__``.

    Raises:
        AssertionError: if the argument combination is inconsistent.
        ValueError: if ``centers`` is None and ``kind`` has no rule for
            drawing random centers.
    """
    # Presence is tested with `is not None`: taking the truth value of a
    # multi-element numpy array raises ValueError, which made the checks
    # below crash for array-valued `wts` / `centers`.
    has_wts = wts is not None
    has_centers = centers is not None

    assert kind in activs
    assert n_in or has_wts
    assert n_features or has_wts or has_centers
    assert n_classes or has_centers
    assert kind == 'RBF' or not learn_centers

    HiddenLayer.__init__(self, inpt, wts, rand_gen, n_in,
                         n_out=n_features,
                         actvn=activs[kind],
                         pdrop=0,
                         reg=reg)

    # Initialize centers
    if centers is None:
        if kind == 'LOGIT':
            centers_vals = rand_gen.binomial(n=1, p=.5,
                                             size=(n_classes, n_features))
        elif kind == 'RBF':
            centers_vals = rand_gen.uniform(low=0, high=1,
                                            size=(n_classes, n_features))
        else:
            # Previously this fell through to an unbound local variable.
            raise ValueError(
                'centers must be given explicitly for kind %r' % (kind,))
        centers = np.asarray(centers_vals, dtype=float_x)

    if is_shared_var(centers):
        self.centers = centers
    else:
        self.centers = th.shared(centers, name='centers', borrow=True)

    if learn_centers:
        self.params.append(self.centers)

    # Populate the n's that were not given from the actual parameters
    if not n_in or not n_features:
        n_in, n_features = borrow(self.w).shape
    if not n_features or not n_classes:
        n_classes, n_features = borrow(self.centers).shape

    # c = centers; v = output of hidden layer = calculated features
    self.features = self.output  # Refers to the output of HiddenLayer
    c = self.centers.dimshuffle('x', 0, 1)
    v = self.features.dimshuffle(0, 'x', 1)
    self.kind = kind
    self.junk_dist = junk_dist

    if kind == 'LOGIT':
        # BATCH_SZ x nClasses x nFeatures >> BATCH_SZ x nClasses >> BATCH_SZ
        # Squash v into [epsilon, 1 - epsilon] so the log below stays finite.
        epsilon = .001
        v = v * (1 - 2 * epsilon) + epsilon
        self.bitprob = c * v + (1 - c) * (1 - v)
        self.logprob = tt.sum(tt.log(self.bitprob), axis=2)
        self.y_preds = tt.argmax(self.logprob, axis=1)

    elif kind == 'RBF':
        dists = tt.sum((v - c)**2, axis=2)  # BATCH_SZ x nClasses
        # Column 0 is used only as a shape template for the junk column;
        # unlike column 1 it also exists when there is a single class.
        junk_col = junk_dist + tt.zeros_like(dists[:, 0]).dimshuffle(
            0, 'x')
        self.dists = tt.concatenate([dists, junk_col], axis=1)
        self.probs = tt.nnet.softmax(-self.dists)  # BATCH_SZ x nClasses+1
        self.logprob = tt.log(self.probs)
        self.y_preds = tt.argmax(self.probs, axis=1)

    self.representation = (
        'CenteredOut Kind:{} In:{:3d} Hidden:{:3d} '
        'Out:{:3d} learn_centers:{} junk_dist:{}'.format(
            kind, n_in, n_features, n_classes, learn_centers, junk_dist))
def __call__(self, x, y, qk=None, n_posterior_samples=10,
             pass_gradients=False, reweight=False, reweight_gen_only=False,
             sleep_phase=False):
    '''Build the cost, lower bound, log marginal and related quantities.

    Everything returned is part of one symbolic graph; callers can ignore
    the entries they do not need.

    Args:
        x: T.tensor, input to the recognition (posterior) network.
        y: T.tensor, observation scored by the conditional.
        qk: T.tensor (optional), approximate posterior parameters. If
            None, the recognition network output q0 is used.
        n_posterior_samples: int, number of samples used for the lower
            bound and log marginal estimates.
        pass_gradients: bool. If False and qk is given, qk is added to
            the constants; if True the posterior term of the cost is
            zero.
        reweight: bool. If True, the cost is the sample-reweighted sum of
            log p(y|h) + log p(h) + log q0(h), and the analytic KL terms
            are not used.
        reweight_gen_only: bool. Like reweight, but only
            log p(y|h) + log p(h) is reweighted and the log q0(h) term is
            added unweighted.
        sleep_phase: bool. If True, h is sampled from the prior, y is
            sampled from the conditional, and log q0 is evaluated on
            those samples. Takes precedence over reweight and
            reweight_gen_only when choosing the cost.

    Returns:
        results: OrderedDict of scalar quantities (cost, lower bound, ...).
        samples: OrderedDict with `py`, `batch_energies` and `w_tilde`.
        constants: list of tensors to be excluded from gradients.
        updates: an empty theano.OrderedUpdates.
    '''
    constants = []
    results = OrderedDict()
    # The closed-form KL terms are only used without any reweighting.
    analytic_kl_allowed = not (reweight or reweight_gen_only)

    q0 = self.posterior.feed(x)
    if qk is None:
        qk = q0
    elif not pass_gradients:
        constants.append(qk)

    # Add a leading sample axis so the parameters broadcast over samples.
    q0_b = q0[None, :, :]
    qk_b = qk[None, :, :]

    noise = self.init_inference_samples(
        (n_posterior_samples, y.shape[0], self.dim_h))
    h = self.posterior.distribution.step_sample(noise, qk_b)
    py_h = self.conditional.feed(h)

    log_py_h = -self.conditional.neg_log_prob(y[None, :, :], py_h)
    log_ph = -self.prior.neg_log_prob(h)
    log_qh0 = -self.posterior.neg_log_prob(h, q0_b)
    log_qhk = -self.posterior.neg_log_prob(h, qk_b)
    prior_entropy = self.prior.entropy()
    q_entropy = self.posterior.entropy(qk)

    log_joint = log_py_h + log_ph
    log_weights = log_joint - log_qhk

    # Log marginal
    log_p = (log_sum_exp(log_weights, axis=0)
             - T.log(n_posterior_samples))

    recon_term = -log_py_h

    # Some prior distributions have a tractable KL divergence.
    if self.prior.has_kl and analytic_kl_allowed:
        KL_term = self.prior.kl_divergence(qk)
        results['KL(q_k||p)'] = KL_term
    else:
        prior_energy = -log_ph
        results['-log p(h)'] = prior_energy.mean()
        KL_term = prior_energy - q_entropy

    # If we pass the gradients we don't want to include the KL(q_k||q_0)
    if pass_gradients:
        posterior_term = T.zeros_like(log_qh0)
    elif self.posterior.distribution.has_kl and analytic_kl_allowed:
        posterior_term = self.posterior.distribution.step_kl_divergence(
            qk, *self.posterior.distribution.split_prob(q0))
        results['KL(q_k||q_0)'] = posterior_term
    else:
        results['-log q(h)'] = -log_qh0.mean()
        posterior_term = -log_qh0

    lower_bound = -(recon_term + KL_term).mean()

    w_tilde = get_w_tilde(log_weights)
    results['log ESS'] = T.log(1. / (w_tilde**2).sum(0)).mean()

    if sleep_phase:
        prior_noise = self.init_inference_samples(
            (n_posterior_samples, y.shape[0], self.dim_h))
        h_s = self.prior.step_sample(
            prior_noise, self.prior.get_prob(*self.prior.get_params()))
        py_h_s = self.conditional.feed(h_s)
        y_s, _ = self.conditional.sample(py_h_s)
        constants.append(y_s)
        q0_s = self.posterior.feed(y_s[0])
        log_qh0_sleep = -self.posterior.neg_log_prob(h_s, q0_s)
        cost = -((w_tilde * log_joint).sum((0, 1))
                 + log_qh0_sleep.sum(1).mean(0))
        constants.append(w_tilde)
    elif reweight:
        cost = -(w_tilde * (log_joint + log_qh0)).sum((0, 1))
        constants.append(w_tilde)
    elif reweight_gen_only:
        cost = -((w_tilde * log_joint).sum((0, 1))
                 + log_qh0.sum(1).mean(0))
        constants.append(w_tilde)
    else:
        cost = (recon_term + KL_term + posterior_term).sum(1).mean(0)

    results.update(
        **{
            '-log p(x|h)': recon_term.mean(),
            '-log p(x)': -log_p.mean(0),
            'H(p)': prior_entropy,
            'H(q)': q_entropy.mean(0),
            'lower_bound': lower_bound,
            'cost': cost
        })

    samples = OrderedDict(py=py_h,
                          batch_energies=recon_term,
                          w_tilde=w_tilde)

    return results, samples, constants, theano.OrderedUpdates()
def __init__(self, size, a=0.1, b=0.2, c=-65.0, d=2.0):
    """Create the shared state and compiled update functions.

    The update equations have the form of the Izhikevich spiking-neuron
    model: ``v`` is integrated with a quadratic term, ``u`` is a recovery
    variable, and both are reset where a spike is flagged.

    Args:
        size: shape of the state arrays; also passed to ``Scheduler``.
        a: scale of the ``u`` update.
        b: coupling of ``v`` into the ``u`` update; ``u`` starts at
            ``b * c``.
        c: initial value of ``v`` and the value it is reset to on a spike.
        d: amount added to ``u`` on a spike.
    """
    self.scheduler = Scheduler(size)
    self.size = size

    def shared_state(values, label):
        # Named shared variable holding `values`.
        return theano.shared(values, name=label, borrow=True)

    spike_cutoff = 30.0  # v at or above this value counts as a spike
    step_scale = 0.5     # factor applied to both state derivatives

    potential = shared_state(np.full(size, c, dtype=floatX), "v")
    recovery = shared_state(np.full(size, b * c, dtype=floatX), "u")
    current = shared_state(np.zeros(size, dtype=floatX), "I")
    self.v = potential
    self.u = recovery
    self.I = current

    delta_v = step_scale * (0.04 * (potential * potential)
                            + (potential * 5.0) + 140.0
                            - recovery + current)
    delta_u = step_scale * (a * ((b * potential) - recovery))

    now = T.iscalar("now")
    direct_input = T.vector("DC")
    spike_flags = T.vector("spikes")
    scheduled_input = T.vector("schedule")

    # Accumulate incoming current; the function returns the old value of I.
    self.recv = theano.function(
        [direct_input, scheduled_input],
        current,
        updates=[(current, current + direct_input + scheduled_input)])
    # v and u are advanced by two separate compiled functions.
    self.tick_v = theano.function(
        [], potential, updates=[(potential, potential + delta_v)])
    self.tick_u = theano.function(
        [], recovery, updates=[(recovery, recovery + delta_u)])
    self.threshold = theano.function([], potential >= spike_cutoff)

    # Where a spike is flagged: v <- c and u <- u + d. I is cleared
    # everywhere.
    reset_updates = [
        (potential, T.switch(spike_flags, c, potential)),
        (recovery, T.switch(spike_flags, recovery + d, recovery)),
        (current, T.zeros_like(current)),
    ]
    self.reset = theano.function([spike_flags],
                                 [potential, recovery, current],
                                 updates=reset_updates)

    # Ring buffer with one row of spike flags per time step.
    window_size = 40
    # presumably converts a per-window count to a per-second rate,
    # assuming 1 ms steps -- TODO confirm
    rate_mul = 1000.0 / window_size

    spike_history = shared_state(
        np.zeros((window_size, size), dtype=floatX), "spike_counter")
    firing_rate = shared_state(np.zeros(size, dtype=floatX), "rate")
    self.spike_counter = spike_history
    self.rate = firing_rate

    self.count_spikes = theano.function(
        [now, spike_flags],
        spike_history,
        updates=[(spike_history,
                  T.set_subtensor(spike_history[now % window_size],
                                  spike_flags))],
        name="count_spikes")
    self.sum_rate = theano.function(
        [],
        firing_rate,
        updates=[(firing_rate, T.sum(spike_history, axis=0) * rate_mul)])