Example #1
File: rnn.py Project: ddofer/breze
def lstm_layer(hidden_inpt, hidden_to_hidden,
               ingate_peephole, outgate_peephole, forgetgate_peephole,
               f):
    n_hidden_out = hidden_to_hidden.shape[0]

    def lstm_step(x_t, s_tm1, h_tm1):
        x_t += T.dot(h_tm1, hidden_to_hidden)

        inpt = T.tanh(x_t[:, :n_hidden_out])
        gates = x_t[:, n_hidden_out:]
        inpeep = s_tm1 * ingate_peephole
        outpeep = s_tm1 * outgate_peephole
        forgetpeep = s_tm1 * forgetgate_peephole

        ingate = f(gates[:, :n_hidden_out] + inpeep)
        forgetgate = f(
            gates[:, n_hidden_out:2 * n_hidden_out] + forgetpeep)
        outgate = f(gates[:, 2 * n_hidden_out:] + outpeep)

        s_t = inpt * ingate + s_tm1 * forgetgate
        h_t = f(s_t) * outgate
        return [s_t, h_t]

    (states, hidden_rec), _ = theano.scan(
        lstm_step,
        sequences=hidden_inpt,
        outputs_info=[T.zeros_like(hidden_inpt[0, :, 0:n_hidden_out]),
                      T.zeros_like(hidden_inpt[0, :, 0:n_hidden_out])
                      ])

    return states, hidden_rec
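A minimal usage sketch (assumptions, not part of the original file: `hidden_inpt` is a 3-D tensor of shape (timesteps, batch, 4 * n_hidden), `hidden_to_hidden` maps n_hidden units to the 4 * n_hidden pre-activations, and the gate nonlinearity `f` is a sigmoid):

import numpy as np
import theano
import theano.tensor as T

n_hidden = 8  # hypothetical size, chosen only for illustration
hidden_inpt = T.tensor3('hidden_inpt')  # (timesteps, batch, 4 * n_hidden)
hidden_to_hidden = theano.shared(
    np.random.randn(n_hidden, 4 * n_hidden).astype(theano.config.floatX))
ingate_peep, outgate_peep, forgetgate_peep = [
    theano.shared(np.zeros(n_hidden, dtype=theano.config.floatX))
    for _ in range(3)]

states, hidden_rec = lstm_layer(hidden_inpt, hidden_to_hidden,
                                ingate_peep, outgate_peep, forgetgate_peep,
                                T.nnet.sigmoid)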
Example #2
File: solver.py Project: dfm/exoplanet
    def grad(self, inputs, gradients):
        M, e = inputs
        E, f = self(M, e)

        bM = tt.zeros_like(M)
        be = tt.zeros_like(M)
        ecosE = e * tt.cos(E)

        if not isinstance(gradients[0].type, theano.gradient.DisconnectedType):
            # Backpropagate E_bar
            bM = gradients[0] / (1 - ecosE)
            be = tt.sin(E) * bM

        if not isinstance(gradients[1].type, theano.gradient.DisconnectedType):
            # Backpropagate f_bar
            sinf2 = tt.sin(0.5*f)
            cosf2 = tt.cos(0.5*f)
            tanf2 = sinf2 / cosf2
            e2 = e**2
            ome2 = 1 - e2
            ome = 1 - e
            ope = 1 + e
            cosf22 = cosf2**2
            twoecosf22 = 2 * e * cosf22
            factor = tt.sqrt(ope/ome)
            inner = (twoecosf22+ome) * tt.as_tensor_variable(gradients[1])

            bM += factor*(ome*tanf2**2+ope)*inner*cosf22/(ope*ome2)
            be += -2*cosf22*tanf2/ome2**2*inner*(ecosE-2+e2)

        return [bM, be]
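For context (a restatement, not text from the file): the first branch follows from implicit differentiation of Kepler's equation M = E - e sin E,

    \frac{\partial E}{\partial M} = \frac{1}{1 - e \cos E}, \qquad
    \frac{\partial E}{\partial e} = \frac{\sin E}{1 - e \cos E},

which is why `bM = gradients[0] / (1 - ecosE)` and `be = tt.sin(E) * bM` above.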
Example #3
    def get_output(self,y,y_mask,init_state,train=False):
        X=self.get_input(train)  
        X_mask=self.previous.x_mask
        X = X.dimshuffle((1, 0, 2))
        X_mask = X_mask.dimshuffle((1, 0))  
        
        y=y.dimshuffle((1, 0, 2))
        y_mask=y_mask.dimshuffle((1, 0))  
        
        ### shift 1 sequence backward
        y_shifted=T.zeros_like(y)
        y_shifted=T.set_subtensor(y_shifted[1:],y[:-1])
        y=y_shifted 

        ### shift the mask backward by 1 step as well
        y_shifted=T.zeros_like(y_mask)
        y_shifted=T.set_subtensor(y_shifted[1:],y_mask[:-1])
        y_mask=y_shifted 

        y_z = T.dot(y, self.W_z) + self.b_z
        y_r = T.dot(y, self.W_r) + self.b_r
        y_h = T.dot(y, self.W_h) + self.b_h       
        
        
        
        [h,logit], _ = theano.scan(self._step, 
                                     sequences = [y,y_z,y_r,y_h,y_mask],
                                     outputs_info = [init_state,
                                                     None],
                                     non_sequences=[X,X_mask])

        return logit.dimshuffle((1, 0, 2))
Example #4
 def _construct_compute_ll_bound(self):
     """
     Construct a function for computing the variational likelihood bound.
     """
     # setup some symbolic variables for theano to deal with
     Xd = T.matrix()
     Xc = T.zeros_like(Xd)
     Xm = T.zeros_like(Xd)
     # get symbolic var for posterior KLds
     post_kld = self.IN.kld_cost
     # get symbolic var for log likelihoods
     if self.use_encoder:
         log_likelihood = self.GN.compute_log_prob(self.IN.Xd_encoded)
     else:
         log_likelihood = self.GN.compute_log_prob(self.IN.Xd)
     # construct a theano function for actually computing stuff
     outputs = [post_kld, log_likelihood]
     out_func = theano.function([Xd], outputs=outputs, \
             givens={ self.Xd: Xd, self.Xc: Xc, self.Xm: Xm })
     # construct a function for computing multi-sample averages
     def multi_sample_bound(X, sample_count=10):
         post_klds = np.zeros((X.shape[0], 1))
         log_likelihoods = np.zeros((X.shape[0], 1))
         max_lls = np.zeros((X.shape[0], 1)) - 1e8
         for i in range(sample_count):
             result = out_func(X)
             post_klds = post_klds + (1.0 * result[0])
             log_likelihoods = log_likelihoods + (1.0 * result[1])
             max_lls = np.maximum(max_lls, (1.0 * result[1]))
         post_klds = post_klds / sample_count
         log_likelihoods = log_likelihoods / sample_count
         ll_bounds = log_likelihoods - post_klds
         return [ll_bounds, post_klds, log_likelihoods, max_lls]
     return multi_sample_bound
Example #5
 def _construct_sample_from_prior(self):
     """
     Construct a function for drawing independent samples from the
     distribution generated by this MultiStageModel. This function returns
     the full sequence of "partially completed" examples.
     """
     z_sym = T.matrix()
     x_sym = T.matrix()
     irs = self.ir_steps
     oputs = [self.obs_transform(self.s0)]
     oputs.extend([self.obs_transform(self.si[i]) for i in range(irs)])
     _, hi_zmuv = self._construct_zmuv_samples(x_sym, 1)
     sample_func = theano.function(inputs=[z_sym, x_sym], outputs=oputs, \
             givens={ self.z: z_sym, \
                      self.x_in: T.zeros_like(x_sym), \
                      self.x_out: T.zeros_like(x_sym), \
                      self.hi_zmuv: hi_zmuv }, \
             updates=self.scan_updates)
     def prior_sampler(samp_count):
         x_samps = to_fX( np.zeros((samp_count, self.obs_dim)) )
         old_switch = self.train_switch.get_value(borrow=False)
         # set model to generation mode
         self.set_train_switch(switch_val=0.0)
         z_samps = to_fX( npr.randn(samp_count, self.z_dim) )
         model_samps = sample_func(z_samps, x_samps)
         # set model back to either training or generation mode
         self.set_train_switch(switch_val=old_switch)
         return model_samps
     return prior_sampler
Example #6
 def rnade_sym(self,x,W,V_alpha,b_alpha,V_mu,b_mu,V_sigma,b_sigma,activation_rescaling):
     """ x is a matrix of column datapoints (VxB) V = n_visible, B = batch size """
     def density_given_previous_a_and_x(x, w, V_alpha, b_alpha, V_mu, b_mu, V_sigma, b_sigma,activation_factor, p_prev, a_prev, x_prev,):
         a = a_prev + T.dot(T.shape_padright(x_prev, 1), T.shape_padleft(w, 1))
         h = self.nonlinearity(a * activation_factor)  # BxH
         #x = theano.printing.Print('x')(x)
         Alpha = T.nnet.softmax(T.dot(h, V_alpha) + T.shape_padleft(b_alpha))  # BxC
         Alpha = theano.printing.Print('Alphas')(Alpha)
         Mu = T.dot(h, V_mu) + T.shape_padleft(b_mu)  # BxC
         Mu = theano.printing.Print('Mu')(Mu)
         Sigma = T.exp((T.dot(h, V_sigma) + T.shape_padleft(b_sigma)))  # BxC
         Sigma = theano.printing.Print('Sigmas')(Sigma)
         arg = -constantX(0.5) * T.sqr((Mu - T.shape_padright(x, 1)) / Sigma) - T.log(Sigma) - constantX(0.5 * numpy.log(2 * numpy.pi)) + T.log(Alpha)
         arg = theano.printing.Print('printing argument of logsumexp')(arg)
         p_var = log_sum_exp(arg)
         p_var = theano.printing.Print('p_var')(p_var)
         p = p_prev + p_var
         #p = theano.printing.Print('p')(p)
         return (p, a, x)
     # First element is different (it is predicted from the bias only)
     a0 = T.zeros_like(T.dot(x.T, W))  # BxH
     p0 = T.zeros_like(x[0])
     x0 = T.ones_like(x[0])    
     ([ps, _as, _xs], updates) = theano.scan(density_given_previous_a_and_x,
                                             sequences=[x, W, V_alpha, b_alpha,V_mu,b_mu,V_sigma,b_sigma,activation_rescaling],
                                             outputs_info=[p0, a0, x0])
     return (ps[-1], updates)
Example #7
def test_gpujoin_gpualloc():
    a = T.fmatrix('a')
    a_val = numpy.asarray(numpy.random.rand(4, 5), dtype='float32')
    b = T.fmatrix('b')
    b_val = numpy.asarray(numpy.random.rand(3, 5), dtype='float32')

    f = theano.function([a, b], T.join(0, T.zeros_like(a),T.ones_like(b)) + 4,
                        mode=mode_without_gpu)
    f_gpu = theano.function([a, b], T.join(0, T.zeros_like(a), T.ones_like(b)),
                            mode=mode_with_gpu)
    f_gpu2 = theano.function([a, b], T.join(0, T.zeros_like(a),
                                           T.ones_like(b)) + 4,
                             mode=mode_with_gpu)

    assert sum([node.op == T.alloc for node in f.maker.env.toposort()]) == 2
    assert sum([node.op == T.join for node in f.maker.env.toposort()]) == 1
    assert sum([node.op == B.gpu_alloc
                for node in f_gpu.maker.env.toposort()]) == 2
    assert sum([node.op == B.gpu_join
                for node in f_gpu.maker.env.toposort()]) == 1
    assert sum([node.op == B.gpu_alloc
                for node in f_gpu2.maker.env.toposort()]) == 2
    assert sum([node.op == B.gpu_join
                for node in f_gpu2.maker.env.toposort()]) == 1
    assert numpy.allclose(f(a_val, b_val), f_gpu2(a_val, b_val))
Example #8
	def create_cost_fun (self):

		# create a cost function that
		# takes each prediction at every timestep
		# and guesses next timestep's value:
		what_to_predict = self.input_mat[:, 1:]
		# because some sentences are shorter, we
		# place masks where the sentences end:
		# (for_how_long is zero-indexed, e.g. an example spanning `[2,3)`
		# has this value set to 0, hence the subtraction of 1):
		for_how_long = self.for_how_long - 1
		# all sentences start at T=0:
		starting_when = T.zeros_like(self.for_how_long)
								 
		self.lstm_cost = masked_loss(self.lstm_predictions,
								what_to_predict,
								for_how_long,
								starting_when).sum()

		zero_entropy = T.zeros_like(self.entropy)
		real_entropy = T.switch(self.mask_matrix,self.entropy,zero_entropy)
		zero_key_entropy = T.zeros_like(self.key_entropy)
		real_key_entropy = T.switch(self.mask_matrix,self.key_entropy,zero_key_entropy)

		self.final_cost = masked_loss(self.final_predictions,
								what_to_predict,
								for_how_long,
								starting_when).sum()+self.entropy_reg*real_entropy.sum()+self.key_entropy_reg*real_key_entropy.sum()
Example #9
    def mf(self, V, Y = None, return_history = False, niter = None, block_grad = None):

        drop_mask = T.zeros_like(V)

        if Y is not None:
            drop_mask_Y = T.zeros_like(Y)
        else:
            batch_size = V.shape[0]
            num_classes = self.dbm.hidden_layers[-1].n_classes
            assert isinstance(num_classes, int)
            Y = T.alloc(1., V.shape[0], num_classes)
            drop_mask_Y = T.alloc(1., V.shape[0])

        history = self.do_inpainting(X=V,
            Y=Y,
            return_history=True,
            drop_mask=drop_mask,
            drop_mask_Y=drop_mask_Y,
            noise=False,
            niter=niter,
            block_grad=block_grad)

        if return_history:
            return [elem['H_hat'] for elem in history]

        return history[-1]['H_hat']
Example #10
def T_subspacel1_slow_shrinkage(a,L,lam_sparse,lam_slow,small_value=.001):
    amp = T.sqrt(a[::2,:]**2 + a[1::2,:]**2 + small_value)
    #damp = amp[:,1:] - amp[:,:-1]

    # compose slow shrinkage with subspace l1 shrinkage

    # slow shrinkage
    div = T.zeros_like(amp)
    d1 = amp[:,1:] - amp[:,:-1]
    d2 = d1[:,1:] - d1[:,:-1]
    div = T.set_subtensor(div[:,1:-1],-d2)
    div = T.set_subtensor(div[:,0], -d1[:,0])
    div = T.set_subtensor(div[:,-1], d1[:,-1])
    slow_amp_shrinkage = 1 - (lam_slow/L)*(div/amp)
    slow_amp_value = T.switch(T.gt(slow_amp_shrinkage,0),slow_amp_shrinkage,0)
    slow_shrinkage_prox_a = slow_amp_value*a[::2,:]
    slow_shrinkage_prox_b = slow_amp_value*a[1::2,:]

    # subspace l1 shrinkage
    amp_slow_shrinkage_prox = T.sqrt(slow_shrinkage_prox_a**2 + slow_shrinkage_prox_b**2)
    #amp_shrinkage = 1. - (lam_slow*lam_sparse/L)*amp_slow_shrinkage_prox
    amp_shrinkage = 1. - (lam_sparse/L)/amp_slow_shrinkage_prox
    amp_value = T.switch(T.gt(amp_shrinkage,0.),amp_shrinkage,0.)
    subspacel1_prox = T.zeros_like(a)
    subspacel1_prox = T.set_subtensor(subspacel1_prox[ ::2,:],amp_value*slow_shrinkage_prox_a)
    subspacel1_prox = T.set_subtensor(subspacel1_prox[1::2,:],amp_value*slow_shrinkage_prox_b)
    return subspacel1_prox
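A small NumPy analogue (illustrative only, not code from the project) of how `div` is assembled: the interior columns hold minus the second difference of `amp` along time, and the two boundary columns fall back to first differences.

import numpy as np

amp = np.random.rand(4, 6)      # (subspace pairs, time)
d1 = amp[:, 1:] - amp[:, :-1]   # first difference along time
d2 = d1[:, 1:] - d1[:, :-1]     # second difference
div = np.zeros_like(amp)
div[:, 1:-1] = -d2              # interior: negative discrete Laplacian
div[:, 0] = -d1[:, 0]           # boundary columns
div[:, -1] = d1[:, -1]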
Example #11
File: lds.py Project: ddofer/breze
def filter_and_prob(inpt, transition, emission,
           visible_noise_mean, visible_noise_cov,
           hidden_noise_mean, hidden_noise_cov,
           initial_hidden, initial_hidden_cov):
    step = forward_step(
        transition, emission,
        visible_noise_mean, visible_noise_cov,
        hidden_noise_mean, hidden_noise_cov)

    hidden_mean_0 = T.zeros_like(hidden_noise_mean).dimshuffle('x', 0)
    hidden_cov_0 = T.zeros_like(hidden_noise_cov).dimshuffle('x', 0, 1)
    f0, F0, ll0 = step(inpt[0], hidden_mean_0, hidden_cov_0)
    replace = {hidden_noise_mean: initial_hidden, 
               hidden_noise_cov: initial_hidden_cov}
    f0 = theano.clone(f0, replace)
    F0 = theano.clone(F0, replace)
    ll0 = theano.clone(ll0, replace)

    (f, F, ll), _ = theano.scan(
        step,
        sequences=inpt[1:],
        outputs_info=[f0, F0, None])

    ll = ll.sum(axis=0)

    f = T.concatenate([T.shape_padleft(f0), f])
    F = T.concatenate([T.shape_padleft(F0), F])
    ll += ll0

    return f, F, ll
Example #12
    def recur(self, ms_j, mt_jm1, mscut_j, mtcut_jm1,
            ssrcpos_js, vsrcpos_js, starpos_js, vtarpos_js ):
        
         # cnn encoding
        ngms_j,  uttms_j   = self.sCNN.encode(ms_j,  mscut_j)
        ngmt_jm1,uttmt_jm1 = self.tCNN.encode(mt_jm1,mtcut_jm1)
        
        # padding dummy vector
        ngms_j   = T.concatenate([ngms_j,T.zeros_like(ngms_j[-1:,:])],axis=0)
        ngmt_jm1 = T.concatenate([ngmt_jm1,T.zeros_like(ngmt_jm1[-1:,:])],axis=0)

        # source features
        ssrcemb_js = T.sum(ngms_j[ssrcpos_js,:],axis=0)
        vsrcemb_js = T.sum(ngms_j[vsrcpos_js,:],axis=0)
        src_js = T.concatenate([ssrcemb_js,vsrcemb_js,uttms_j],axis=0)
        
        # target features
        staremb_js = T.sum(ngmt_jm1[starpos_js,:],axis=0)
        vtaremb_js = T.sum(ngmt_jm1[vtarpos_js,:],axis=0)
        tar_js = T.concatenate([staremb_js,vtaremb_js,uttmt_jm1],axis=0)
       
        # update g_j
        g_j   = T.dot( self.Whb, T.nnet.sigmoid( 
                T.dot(src_js,self.Wfbs) + 
                T.dot(tar_js,self.Wfbt) +
                self.B0)).dimshuffle('x')
        # update b_j
        g_j = T.concatenate([g_j,self.B],axis=0)
        b_j = T.nnet.softmax( g_j )[0,:]
        
        return b_j
Example #13
def T_subspacel1_slow_shrinkage_conv(a, L, lam_sparse, lam_slow, imshp,kshp,featshp,stride=(1,1),small_value=.001):
    featshp = (imshp[0],kshp[0],featshp[2],featshp[3]) # num images, features, szy, szx
    features = T.reshape(T.transpose(a),featshp,ndim=4)

    amp = T.sqrt(features[:,::2,:,:]**2 + features[:,1::2,:,:]**2 + small_value)
    #damp = amp[:,1:] - amp[:,:-1]

    # compose slow shrinkage with subspace l1 shrinkage

    # slow shrinkage
    div = T.zeros_like(amp)
    d1 = amp[1:,:,:,:] - amp[:-1,:,:,:]
    d2 = d1[1:,:,:,:] - d1[:-1,:,:,:]
    div = T.set_subtensor(div[1:-1,:,:,:], -d2)
    div = T.set_subtensor(div[0,:,:,:], -d1[0,:,:,:])
    div = T.set_subtensor(div[-1,:,:,:], d1[-1,:,:,:])
    slow_amp_shrinkage = 1 - (lam_slow / L) * (div / amp)
    slow_amp_value = T.switch(T.gt(slow_amp_shrinkage, 0), slow_amp_shrinkage, 0)
    slow_shrinkage_prox_a = slow_amp_value * features[:, ::2, :,:]
    slow_shrinkage_prox_b = slow_amp_value * features[:,1::2, :,:]

    # subspace l1 shrinkage
    amp_slow_shrinkage_prox = T.sqrt(slow_shrinkage_prox_a ** 2 + slow_shrinkage_prox_b ** 2)
    #amp_shrinkage = 1. - (lam_slow*lam_sparse/L)*amp_slow_shrinkage_prox
    amp_shrinkage = 1. - (lam_sparse / L) / amp_slow_shrinkage_prox
    amp_value = T.switch(T.gt(amp_shrinkage, 0.), amp_shrinkage, 0.)
    subspacel1_prox = T.zeros_like(features)
    subspacel1_prox = T.set_subtensor(subspacel1_prox[:, ::2, :,:], amp_value * slow_shrinkage_prox_a)
    subspacel1_prox = T.set_subtensor(subspacel1_prox[:,1::2, :,:], amp_value * slow_shrinkage_prox_b)

    reshape_subspacel1_prox = T.transpose(T.reshape(subspacel1_prox,(featshp[0],featshp[1]*featshp[2]*featshp[3]),ndim=2))
    return reshape_subspacel1_prox
Example #14
    def get_aggregator(self):
        initialized = shared_like(0.)
        numerator_acc = shared_like(self.numerator)
        denominator_acc = shared_like(self.denominator)

        # Dummy default expression to use as the previously-aggregated
        # value, that has the same shape as the new result
        numerator_zeros = tensor.as_tensor(self.numerator).zeros_like()
        denominator_zeros = tensor.as_tensor(self.denominator).zeros_like()

        conditional_update_num = self.numerator + ifelse(initialized,
                                                         numerator_acc,
                                                         numerator_zeros)
        conditional_update_den = self.denominator + ifelse(initialized,
                                                           denominator_acc,
                                                           denominator_zeros)

        initialization_updates = [(numerator_acc,
                                   tensor.zeros_like(numerator_acc)),
                                  (denominator_acc,
                                   tensor.zeros_like(denominator_acc)),
                                  (initialized, 0.)]
        accumulation_updates = [(numerator_acc,
                                 conditional_update_num),
                                (denominator_acc,
                                 conditional_update_den),
                                (initialized, 1.)]
        aggregator = Aggregator(aggregation_scheme=self,
                                initialization_updates=initialization_updates,
                                accumulation_updates=accumulation_updates,
                                readout_variable=(numerator_acc /
                                                  denominator_acc))
        return aggregator
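A stripped-down sketch of the same accumulation idea (assuming a scalar numerator/denominator and plain shared variables instead of the Blocks `Aggregator`/`ifelse` machinery; the `initialized` flag that guards the first minibatch is omitted):

import numpy as np
import theano
import theano.tensor as T

num = T.dscalar('num')
den = T.dscalar('den')
num_acc = theano.shared(np.float64(0.0))
den_acc = theano.shared(np.float64(0.0))

reset = theano.function([], updates=[(num_acc, T.zeros_like(num_acc)),
                                     (den_acc, T.zeros_like(den_acc))])
accumulate = theano.function([num, den],
                             updates=[(num_acc, num_acc + num),
                                      (den_acc, den_acc + den)])
readout = theano.function([], num_acc / den_acc)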
Example #15
File: iq.py Project: zenna/Arrows.jl
def castray(ro, rd, shape_params, nprims, width, height):
    tmin = 1.0
    tmax = 20.0
    precis = 0.002
    m = -1.0
    # There is a sequence of distances d1, d2, ..., dn,
    # and then the accumulated distances d1, d1+d2, d1+d2+d3, ...
    # What we actually want in the output is, for each ray, the distance to the surface,
    # so something like 0, 20, 25, 27, 28, 28, 28, 28, 28.

    max_num_steps = 25

    # distcolors = map(ro + rd * 0, width, height) #FIXME, reshape instead of mul by 0
    distcolors = mapedit(ro + rd * 0, shape_params, nprims, width, height)
    dists = distcolors
    steps = T.switch(dists < precis, T.zeros_like(dists), T.ones_like(dists))
    accum_dists = T.reshape(dists, (width, height, 1))

    for i in range(max_num_steps - 1):
        # distcolors = map(ro + rd * accum_dists, width, height) #FIXME, reshape instead of mul by 0
        distcolors = mapedit(ro + rd * accum_dists, shape_params, nprims, width, height) #FIXME, reshape instead of mul by 0
        dists = distcolors
        steps = steps + T.switch(dists < precis, T.zeros_like(dists), T.ones_like(dists))
        accum_dists = accum_dists + T.reshape(dists, (width, height, 1))

    last_depth = T.reshape(accum_dists, (width, height))
    depthmap = T.switch(last_depth < tmax, last_depth / tmax, T.zeros_like(last_depth))
    color = 1.0 - steps / float(max_num_steps)
    # Distance marched along ray and delta between last two steps
    return depthmap
Example #16
File: terms.py Project: dfm/exoplanet
    def get_celerite_matrices(self, x, diag):
        x = tt.as_tensor_variable(x)
        diag = tt.as_tensor_variable(diag)
        ar, cr, ac, bc, cc, dc = self.coefficients
        a = diag + tt.sum(ar) + tt.sum(ac)
        U = tt.concatenate((
            ar[None, :] + tt.zeros_like(x)[:, None],
            ac[None, :] * tt.cos(dc[None, :] * x[:, None])
            + bc[None, :] * tt.sin(dc[None, :] * x[:, None]),
            ac[None, :] * tt.sin(dc[None, :] * x[:, None])
            - bc[None, :] * tt.cos(dc[None, :] * x[:, None]),
        ), axis=1)

        V = tt.concatenate((
            tt.zeros_like(ar)[None, :] + tt.ones_like(x)[:, None],
            tt.cos(dc[None, :] * x[:, None]),
            tt.sin(dc[None, :] * x[:, None]),
        ), axis=1)

        dx = x[1:] - x[:-1]
        P = tt.concatenate((
            tt.exp(-cr[None, :] * dx[:, None]),
            tt.exp(-cc[None, :] * dx[:, None]),
            tt.exp(-cc[None, :] * dx[:, None]),
        ), axis=1)

        return a, U, V, P
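The `ar[None, :] + tt.zeros_like(x)[:, None]` term is just a broadcasting trick that tiles the J real coefficients across the N input points; a NumPy analogue (illustrative values, not from the project):

import numpy as np

ar = np.array([1.0, 2.0])            # J real coefficients
x = np.linspace(0.0, 1.0, 5)         # N input points
U_real = ar[None, :] + np.zeros_like(x)[:, None]   # shape (N, J), each row equals ar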
Example #17
    def __call__(self, input_, *xs):
        '''
        Maybe unclear: input_ is the variable to be scaled, xs are the
        actual inputs.
        '''
        updates = theano.OrderedUpdates()

        if len(xs) != len(self.dims_in):
            raise ValueError('Number of (external) inputs for baseline must'
                             ' match parameters')

        ws = []
        for i in xrange(len(xs)):
            # Maybe not the most pythonic way...
            ws.append(self.__dict__['w%d' % i])

        ids = T.sum([x.dot(W) for x, W in zip(xs, ws)], axis=0).T
        ids_c = T.zeros_like(ids) + ids
        input_scaled = input_ / ids_c
        input_ = T.zeros_like(input_) + input_

        outs = OrderedDict(
            x_c=input_,
            x_scaled=input_scaled,
            ids=ids,
            ids_c=ids_c
        )

        return outs, updates
Example #18
    def get_output_for(self,net_input,**kwargs):
        if 'unary' in kwargs and kwargs['unary']==True:
            return net_input

        logger.info('Initializing the messages')
        Wp=self.W
        unary_sequence = net_input.dimshuffle(1,0,2)    #Reshuffling the batched unary potential shape so that it can be used for word level iterations in theano.scan

        def forward_scan1(unary_sequence,forward_sm,Wp):
            forward_sm=forward_sm+unary_sequence
            forward_sm=theano_logsumexp(forward_sm.dimshuffle(0,1,'x')+Wp,1)
            return forward_sm

        def backward_scan1(unary_sequence,forward_sm,Wp):
            forward_sm=forward_sm+unary_sequence
            forward_sm=theano_logsumexp(forward_sm.dimshuffle(0,1,'x')+Wp.T,1)
            return forward_sm


        forward_results,_=theano.scan(fn=forward_scan1,sequences=[unary_sequence],outputs_info=T.zeros_like(unary_sequence[0]),non_sequences=[Wp],n_steps=unary_sequence.shape[0]-1)
        backward_results,_=theano.scan(fn=backward_scan1,sequences=[unary_sequence[::-1]],outputs_info=T.zeros_like(unary_sequence[0]),non_sequences=[Wp],n_steps=unary_sequence.shape[0]-1)

        backward_results=T.concatenate([backward_results[::-1],T.zeros_like(backward_results[:1])],axis=0)
        forward_results=T.concatenate([T.zeros_like(forward_results[:1]),forward_results],axis=0)

        unnormalized_prob = forward_results+unary_sequence+backward_results
        marginal_results = theano_logsumexp(unnormalized_prob,axis=2)
        normalized_prob = unnormalized_prob - marginal_results.dimshuffle(0,1,'x')
        # provided for debugging purposes.
        #marginal_all = theano.function([l_in.input_var,l_mask.input_var],marginal_results)
        #probs=theano.function([l_in.input_var,l_mask.input_var],normalized_prob.dimshuffle(1,0,2))
        if 'normalized' in kwargs and kwargs['normalized']==True:
            return normalized_prob.dimshuffle(1,0,2)
        else:
            return unnormalized_prob.dimshuffle(1,0,2)
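`theano_logsumexp` is not included in the snippet; a common numerically stable definition (an assumption about the helper, not the project's verbatim code) is:

import theano.tensor as T

def theano_logsumexp(x, axis=None):
    # log(sum(exp(x))) along `axis`, shifted by the max for stability
    x_max = T.max(x, axis=axis, keepdims=True)
    return T.log(T.sum(T.exp(x - x_max), axis=axis)) + T.max(x, axis=axis)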
Example #19
    def get_aggregator(self):
        initialized = shared_like(0.)
        total_acc = shared_like(self.variable)

        total_zeros = tensor.as_tensor(self.variable).zeros_like()

        conditional_update_num = self.variable + ifelse(initialized,
                                                         total_acc,
                                                         total_zeros)

        initialization_updates = [(total_acc,
                                   tensor.zeros_like(total_acc)),
                                  (initialized,
                                   tensor.zeros_like(initialized))]

        accumulation_updates = [(total_acc,
                                 conditional_update_num),
                                (initialized, tensor.ones_like(initialized))]

        aggregator = Aggregator(aggregation_scheme=self,
                                initialization_updates=initialization_updates,
                                accumulation_updates=accumulation_updates,
                                readout_variable=(total_acc))

        return aggregator
Example #20
    def grad(self, inputs, out_grads):
        batch_mean, rolling_mean, rolling_grad, alpha = inputs
        out_grad, = out_grads

        if self.update_averages:
            assert treeano.utils.is_shared_variable(rolling_mean)
            assert treeano.utils.is_shared_variable(rolling_grad)
            # HACK this is super hacky and won't work for certain
            # computation graphs
            # TODO make assertion again
            if (hasattr(rolling_mean, "default_update") or
                    hasattr(rolling_grad, "default_update")):
                warnings.warn("rolling mean/grad already has updates - "
                              "overwritting. this can be caused by calculating "
                              "the gradient of backprop to the future mean "
                              "multiple times")

            rolling_mean.default_update = (alpha * rolling_mean +
                                           (1 - alpha) * batch_mean)
            rolling_grad.default_update = (alpha * rolling_grad +
                                           (1 - alpha) * out_grad)
        else:
            # HACK remove default_update
            if hasattr(rolling_mean, "default_update"):
                delattr(rolling_mean, "default_update")
            if hasattr(rolling_grad, "default_update"):
                delattr(rolling_grad, "default_update")

        return [rolling_grad,
                T.zeros_like(rolling_mean),
                T.zeros_like(rolling_grad),
                T.zeros_like(alpha)]
Example #21
def generic_compute_Lx_batches(samples, weights, biases, bs, cbs):
    tsamples = [x.reshape((bs//cbs, cbs, x.shape[1])) for x in samples]
    final_ws = [T.unbroadcast(T.shape_padleft(T.zeros_like(x)),0)
                for x in weights]
    final_bs = [T.unbroadcast(T.shape_padleft(T.zeros_like(x)),0)
                for x in biases]
    n_samples = len(samples)
    n_weights = len(weights)
    n_biases = len(biases)
    def comp_step(*args):
        lsamples = args[:n_samples]
        terms1 = generic_compute_Lx_term1(lsamples, weights, biases)
        rval = []
        for (term1, acc) in zip(terms1, args[n_samples:]):
            rval += [acc + term1]
        return rval

    rvals,_ = theano.sandbox.scan.scan(
        comp_step,
        sequences=tsamples,
        states=final_ws + final_bs,
        n_steps=bs // cbs,
        profile=0,
        mode=theano.Mode(linker='cvm_nogc'),
        flags=['no_optimization'] )
    accs1 = [x[0]/numpy.float32(bs//cbs) for x in rvals]
    accs2 = generic_compute_Lx_term2(samples,weights,biases)
    return [x - y for x, y in zip(accs1, accs2)]
Example #22
def compute_Lx_batches(v, g, h, xw_mat, xv_mat, xa, xb, xc, bs, cbs):
    xw = xw_mat.flatten()
    xv = xv_mat.flatten()
    tv = v.reshape((bs // cbs, cbs, v.shape[1]))
    tg = g.reshape((bs // cbs, cbs, g.shape[1]))
    th = h.reshape((bs // cbs, cbs, h.shape[1]))

    final_w1 = T.unbroadcast(T.shape_padleft(T.zeros_like(xw_mat)),0)
    final_v1 = T.unbroadcast(T.shape_padleft(T.zeros_like(xv_mat)),0)
    final_a1 = T.unbroadcast(T.shape_padleft(T.zeros_like(xa)),0)
    final_b1 = T.unbroadcast(T.shape_padleft(T.zeros_like(xb)),0)
    final_c1 = T.unbroadcast(T.shape_padleft(T.zeros_like(xc)),0)
    def comp_step(lv, lg, lh,
                  acc_w1, acc_v1, acc_a1, acc_b1, acc_c1):
        terms1 = compute_Lx_term1(lv, lg, lh, xw, xv, xa, xb, xc)
        accs1 = [acc_w1, acc_v1, acc_a1, acc_b1, acc_c1]
        rval = []

        for (term1, acc) in zip(terms1,accs1):
            rval += [acc + term1]
        return rval
    rvals,_ = theano.sandbox.scan.scan(
        comp_step,
        sequences=[tv,tg,th],
        states=[
            final_w1, final_v1, final_a1, final_b1, final_c1],
        n_steps=bs // cbs,
        profile=0,
        mode=theano.Mode(linker='cvm_nogc'),
        flags=['no_optimization'] )
    accs1 = [x[0]/numpy.float32(bs//cbs) for x in rvals]
    accs2 = compute_Lx_term2(v,g,h,xw,xv,xa,xb,xc)
    return [x - y for x, y in zip(accs1, accs2)]
Example #23
File: VAEB.py Project: budzianowski/VAEB
 def reconstruct(self, x, n_samples) :
     mu, log_sigma = self.encoder(x)
     if n_samples <= 0 :
         y = self.decoder(mu)
     else :
         #sample from posterior
         if self.continuous :
             #hack to find out size of variables
             (y_mu, y_log_sigma) = self.decoder(mu)
             (y_mu, y_log_sigma) = (T.zeros_like(y_mu), T.zeros_like(y_log_sigma))
         else :
             y = T.zeros(x.shape)
         for i in range(n_samples) :
             z = reparam_trick(mu, log_sigma, self.srng)
             if self.continuous :
                 (new_y_mu, new_y_log_sigma) = self.decoder(z)
                 y_mu = y_mu + new_y_mu
                 y_log_sigma = y_log_sigma + new_y_log_sigma
             else :
                 y = y + self.decoder(z)
         if self.continuous :
             y_mu = y_mu / n_samples
             y_log_sigma = y_log_sigma / n_samples
             y = (y_mu, y_log_sigma)
         else :
             y = (y / n_samples)
     if self.continuous :
         (y_mu, y_log_sigma) = y
         I = T.eye(y_mu.shape[0])
         cov = (T.pow(T.exp(y_log_sigma), 2)) * I
         y = np.random.multivariate_normal(y_mu.eval(), cov.eval())
     else :
         y = y.eval()
     return y
Example #24
    def get_aggregator(self):
        initialized = shared_like(0.)
        numerator_acc = shared_like(self.numerator)
        denominator_acc = shared_like(self.denominator)

        conditional_update_num = ifelse(initialized,
                                        self.numerator + numerator_acc,
                                        self.numerator)
        conditional_update_den = ifelse(initialized,
                                        self.denominator + denominator_acc,
                                        self.denominator)

        initialization_updates = [(numerator_acc,
                                   tensor.zeros_like(numerator_acc)),
                                  (denominator_acc,
                                   tensor.zeros_like(denominator_acc)),
                                  (initialized, 0.)]
        accumulation_updates = [(numerator_acc,
                                 conditional_update_num),
                                (denominator_acc,
                                 conditional_update_den),
                                (initialized, 1.)]
        aggregator = Aggregator(aggregation_scheme=self,
                                initialization_updates=initialization_updates,
                                accumulation_updates=accumulation_updates,
                                readout_variable=(numerator_acc /
                                                  denominator_acc))
        return aggregator
Example #25
File: ctc.py Project: choko/ctc
def compute_cost_log_in_parallel(original_rnn_outputs, labels, func, x_ends, y_ends):
	mask = T.log(1 - T.or_(T.eq(labels, T.zeros_like(labels)), T.eq(labels, shift_matrix(labels, 2))))

	initial_state = T.log(T.zeros_like(labels))
	initial_state = T.set_subtensor(initial_state[:,0], 0)

	def select_probabilities(rnn_outputs, label):
		return rnn_outputs[:,label]	

	rnn_outputs, _ = theano.map(select_probabilities, [original_rnn_outputs, labels])
	rnn_outputs = T.log(rnn_outputs.dimshuffle((1,0,2)))

	def forward_step(probabilities, last_probabilities):
		all_forward_probabilities = T.stack(
			last_probabilities + probabilities,
			log_shift_matrix(last_probabilities, 1) + probabilities,
			log_shift_matrix(last_probabilities, 2) + probabilities + mask,
		)

		result = func(all_forward_probabilities, 0)
		return result

	forward_probabilities, _ = theano.scan(fn = forward_step, sequences = rnn_outputs, outputs_info = initial_state)
	forward_probabilities = forward_probabilities.dimshuffle((1,0,2))

	def compute_cost(forward_probabilities, x_end, y_end):
		return -func(forward_probabilities[x_end-1,y_end-2:y_end])

	return theano.map(compute_cost, [forward_probabilities, x_ends, y_ends])[0]
Example #26
def lstm(mask, state_in, t_params, n_dim_in, n_dim_out, prefix, one_step=False, init_h=None):
    '''
    Long Short-Term Memory (LSTM) layer
    '''
    def _step(_mask, _state_in, _prev_h, _prev_c):
        _pre_act = tensor.dot(_prev_h, t_params[_concat(prefix, 'U')]) + _state_in

        _gate_i = tensor.nnet.sigmoid(_slice(_pre_act, 0, n_dim_out))
        _gate_f = tensor.nnet.sigmoid(_slice(_pre_act, 1, n_dim_out))
        _gate_o = tensor.nnet.sigmoid(_slice(_pre_act, 2, n_dim_out))

        _next_c = _gate_f * _prev_c + _gate_i * tensor.tanh(_slice(_pre_act, 3, n_dim_out))
        _next_c = _mask[:, None] * _next_c + (1. - _mask)[:, None] * _prev_c
        _next_h = _gate_o * tensor.tanh(_next_c)
        _next_h = _mask[:, None] * _next_h + (1. - _mask)[:, None] * _prev_h

        return _next_h, _next_c

    params = OrderedDict()
    params[_concat(prefix, 'W')] = numpy.concatenate([ortho_weight(n_dim_in, n_dim_out), ortho_weight(n_dim_in, n_dim_out), ortho_weight(n_dim_in, n_dim_out), ortho_weight(n_dim_in, n_dim_out)], 1)
    params[_concat(prefix, 'U')] = numpy.concatenate([ortho_weight(n_dim_out, n_dim_out), ortho_weight(n_dim_out, n_dim_out), ortho_weight(n_dim_out, n_dim_out), ortho_weight(n_dim_out, n_dim_out)], 1)
    params[_concat(prefix, 'b')] = numpy.zeros((4 * n_dim_out,), config.floatX)
    init_t_params(params, t_params)

    state_in = (tensor.dot(state_in, t_params[_concat(prefix, 'W')]) + t_params[_concat(prefix, 'b')])
    if init_h is None:
        init_h = tensor.alloc(to_floatX(0.), state_in.shape[-2], n_dim_out)
    if one_step:
        state_out, _ = _step(mask, state_in, init_h, tensor.zeros_like(init_h))
        return state_out
    else:
        [state_out, _], _ = theano.scan(_step, [mask, state_in], [init_h, tensor.zeros_like(init_h)])
        return state_out
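The helpers `_concat` and `_slice` are not shown above; the conventional definitions from Theano-tutorial-style LSTM code (assumed here, since the snippet omits them) are:

def _concat(prefix, name):
    # parameter naming convention, e.g. ('lstm', 'W') -> 'lstm_W'
    return '%s_%s' % (prefix, name)

def _slice(x, n, dim):
    # pick the n-th block of `dim` units from the concatenated gate pre-activations
    if x.ndim == 3:
        return x[:, :, n * dim:(n + 1) * dim]
    return x[:, n * dim:(n + 1) * dim]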
Example #27
 def _construct_compute_fe_terms(self):
     """
     Construct theano function to compute the log-likelihood and posterior
     KL-divergence terms for the variational free-energy.
     """
     # setup some symbolic variables for theano to deal with
     Xd = T.matrix()
     Xc = T.zeros_like(Xd)
     Xm = T.zeros_like(Xd)
     # construct values to output
     if self.x_type == 'bernoulli':
         ll_term = log_prob_bernoulli(self.x, self.xg)
     else:
         ll_term = log_prob_gaussian2(self.x, self.xg, \
                 log_vars=self.bounded_logvar)
     all_klds = gaussian_kld(self.q_z_given_x.output_mean, \
             self.q_z_given_x.output_logvar, \
             self.prior_mean, self.prior_logvar)
     kld_term = T.sum(all_klds, axis=1)
     # compile theano function for a one-sample free-energy estimate
     fe_term_sample = theano.function(inputs=[Xd], \
             outputs=[ll_term, kld_term], \
             givens={self.Xd: Xd, self.Xc: Xc, self.Xm: Xm})
     # construct a wrapper function for multi-sample free-energy estimate
     def fe_term_estimator(X, sample_count):
         ll_sum = np.zeros((X.shape[0],))
         kld_sum = np.zeros((X.shape[0],))
         for i in range(sample_count):
             result = fe_term_sample(X)
             ll_sum = ll_sum + result[0].ravel()
             kld_sum = kld_sum + result[1].ravel()
         mean_nll = -ll_sum / float(sample_count)
         mean_kld = kld_sum / float(sample_count)
         return [mean_nll, mean_kld]
     return fe_term_estimator
Example #28
File: OpBLSTM.py Project: atuxhe/returnn
  def grad(self, inputs, output_grads):
    Z_f, Z_b, V_f, V_b, c_f, c_b, i_f, i_b = inputs
    DY_f, DY_b, DH_f, DH_b, Dd_f, Dd_b = output_grads

    Z_f_raw = Z_f.owner.inputs[0].owner.inputs[0]
    Z_b_raw = Z_b.owner.inputs[0].owner.inputs[0]
    #TODO!!!
    V_f_raw = V_f.owner.inputs[0]
    V_b_raw = V_b.owner.inputs[0]
    c_f_raw = c_f.owner.inputs[0].owner.inputs[0]
    c_b_raw = c_b.owner.inputs[0].owner.inputs[0]
    i_f_raw = i_f.owner.inputs[0].owner.inputs[0]
    i_b_raw = i_b.owner.inputs[0].owner.inputs[0]
    #we have to make sure that this is only computed once!
    #for this we have to extract the raw variables before conversion to continuous gpu array
    #so that theano can merge the nodes
    Y_f, Y_b, H_f, H_b, d_f, d_b = BLSTMOpInstance(Z_f_raw, Z_b_raw, V_f_raw, V_b_raw, c_f_raw, c_b_raw, i_f_raw, i_b_raw)
    if isinstance(DY_f.type, theano.gradient.DisconnectedType):
      DY_f = T.zeros_like(Z_f)
    if isinstance(DY_b.type, theano.gradient.DisconnectedType):
      DY_b = T.zeros_like(Z_b)
    if isinstance(Dd_f.type, theano.gradient.DisconnectedType):
      Dd_f = T.zeros_like(c_f)
    if isinstance(Dd_b.type, theano.gradient.DisconnectedType):
      Dd_b = T.zeros_like(c_b)
    DZ_f, DZ_b, DV_f, DV_b, Dc_f, Dc_b = BLSTMOpGradNoInplaceInstance(V_f, V_b, c_f, c_b, i_f, i_b, Dd_f, Dd_b, DY_f, DY_b, Y_f, Y_b, H_f, H_b)
    Di_f = theano.gradient.grad_undefined(self, 5, inputs[5], 'cannot diff w.r.t. index')
    Di_b = theano.gradient.grad_undefined(self, 6, inputs[6], 'cannot diff w.r.t. index')

    return [DZ_f, DZ_b, DV_f, DV_b, Dc_f, Dc_b, Di_f, Di_b]
Example #29
    def build_gsn(self, add_noise, hiddens=None, reverse=False):
        p_X_chain = []
        # Whether or not to corrupt the visible input X
        if add_noise:
            X_init = self.input_noise(self.X)
        else:
            X_init = self.X

        # if no input hiddens were provided, initialize with zeros
        if hiddens is None:
            # init hiddens with zeros
            hiddens = [X_init]
            if self.tied_weights:
                for w in self.weights_list:
                    hiddens.append(T.zeros_like(T.dot(hiddens[-1], w)))
            else:
                for w in self.weights_list[:self.layers]:
                    hiddens.append(T.zeros_like(T.dot(hiddens[-1], w)))

        # The layer update scheme
        log.info("Building the GSN graph : %s updates", str(self.walkbacks))
        for i in range(self.walkbacks):
            log.debug("GSN Walkback %s/%s", str(i + 1), str(self.walkbacks))
            self.update_layers(hiddens, p_X_chain, add_noise, reverse=reverse)
        return p_X_chain, hiddens
Example #30
def group_div(X, W, H, beta, params):
    """Compute beta divergence D(X|WH), intra-class distance
    and intra-session distance for a particular
    (class, session) couple [1]_.


    Parameters
    ----------
    X : Theano tensor
        data
    W : Theano tensor
        Bases
    H : Theano tensor
        activation matrix
    beta : Theano scalar
    params : Theano tensor
        Matrix of parameter related to class/session.
            :params[0][0]: index for the (class, session) couple
            :params[1][0]: number of vector basis related to class
            :params[1][1]: number of vector basis related to session
            :params[2]: weight on the class/session similarity constraints
            :params[3]: sessions in which class c appears
            :params[4]: classes present in session s



    Returns
    -------
    cost : Theano scalar
        total cost
    div : Theano scalar
        beta divergence D(X|WH)
    sum_cls : Theano scalar
        intra-class distance
    sum_ses : Theano scalar
        intra-session distance"""
    ind = params[0][0]
    k_cls = params[1][0]
    k_ses = params[1][1]
    lambdas = params[2]
    Sc = params[3]
    Cs = params[4]
    res_ses, up = theano.scan(
        fn=lambda Cs, prior_result: prior_result
        + eucl_dist(W[ind, :, k_cls : k_cls + k_ses], W[Cs, :, k_cls : k_cls + k_ses]),
        outputs_info=T.zeros_like(beta),
        sequences=Cs,
    )
    sum_ses = ifelse(T.gt(Cs[0], 0), res_ses[-1], T.zeros_like(beta))
    res_cls, up = theano.scan(
        fn=lambda Sc, prior_result: prior_result + eucl_dist(W[ind, :, 0:k_cls], W[Sc, :, 0:k_cls]),
        outputs_info=T.zeros_like(beta),
        sequences=Sc,
    )
    sum_cls = ifelse(T.gt(Sc[0], 0), res_cls[-1], T.zeros_like(beta))
    betaDiv = beta_div(X, W[ind].T, H, beta)

    cost = lambdas[0] * sum_cls + lambdas[1] * sum_ses + betaDiv
    return cost, betaDiv, sum_cls, sum_ses
Example #31
    def sym_gradients_new(self, X):
        non_linearity_name = self.parameters["nonlinearity"].get_name()
        assert (non_linearity_name == "sigmoid" or non_linearity_name == "RLU")
        # First element is different (it is predicted from the bias only)
        init_a = T.zeros_like(T.dot(X.T, self.W))  # BxH
        init_x = T.ones_like(X[0])

        def a_i_given_a_im1(x, w, a_prev, x_prev):
            a = a_prev + T.dot(T.shape_padright(x_prev, 1),
                               T.shape_padleft(w, 1))
            return (a, x)

        ([As, _], updates) = theano.scan(a_i_given_a_im1,
                                         sequences=[X, self.W],
                                         outputs_info=[init_a, init_x])
        top_activations = As[-1]
        Xs_m1 = T.set_subtensor(X[1:, :], X[0:-1, :])
        Xs_m1 = T.set_subtensor(Xs_m1[0, :], 1)

        # Reconstruct the previous activations and calculate (for that visible
        # dimension) the density and all the gradients
        def density_and_gradients(x_i, x_im1, w_i, V_alpha, b_alpha, V_mu,
                                  b_mu, V_sigma, b_sigma, activation_factor,
                                  a_i, lp_accum, dP_da_ip1):
            B = T.cast(x_i.shape[0], theano.config.floatX)
            pot = a_i * activation_factor
            h = self.nonlinearity(pot)  # BxH

            z_alpha = T.dot(h, V_alpha) + T.shape_padleft(b_alpha)
            z_mu = T.dot(h, V_mu) + T.shape_padleft(b_mu)
            z_sigma = T.dot(h, V_sigma) + T.shape_padleft(b_sigma)

            Alpha = T.nnet.softmax(z_alpha)  # BxC
            Mu = z_mu  # BxC
            Sigma = T.exp(z_sigma)  # BxC

            Phi = -T.log(
                2 * Sigma) - T.abs_(Mu - T.shape_padright(x_i, 1)) / Sigma
            wPhi = T.maximum(Phi + T.log(Alpha), constantX(-100.0))

            lp_current = log_sum_exp(wPhi)
            # lp_current_sum = T.sum(lp_current)

            Pi = T.exp(wPhi - T.shape_padright(lp_current, 1))  # #
            dp_dz_alpha = Pi - Alpha  # BxC
            # dp_dz_alpha = T.grad(lp_current_sum, z_alpha)
            gb_alpha = dp_dz_alpha.mean(0, dtype=theano.config.floatX)  # C
            gV_alpha = T.dot(h.T, dp_dz_alpha) / B  # HxC

            # dp_dz_mu = T.grad(lp_current_sum, z_mu)
            dp_dz_mu = Pi * T.sgn(T.shape_padright(x_i, 1) - Mu) / Sigma
            # dp_dz_mu = dp_dz_mu * Sigma
            gb_mu = dp_dz_mu.mean(0, dtype=theano.config.floatX)
            gV_mu = T.dot(h.T, dp_dz_mu) / B

            # dp_dz_sigma = T.grad(lp_current_sum, z_sigma)
            dp_dz_sigma = Pi * (T.abs_(T.shape_padright(x_i, 1) - Mu) / Sigma -
                                1)
            gb_sigma = dp_dz_sigma.mean(0, dtype=theano.config.floatX)
            gV_sigma = T.dot(h.T, dp_dz_sigma) / B

            dp_dh = T.dot(dp_dz_alpha, V_alpha.T) + T.dot(
                dp_dz_mu, V_mu.T) + T.dot(dp_dz_sigma, V_sigma.T)  # BxH
            if non_linearity_name == "sigmoid":
                dp_dpot = dp_dh * h * (1 - h)
            elif non_linearity_name == "RLU":
                dp_dpot = dp_dh * (pot > 0)

            gfact = (dp_dpot * a_i).sum(1).mean(
                0, dtype=theano.config.floatX)  # 1

            dP_da_i = dP_da_ip1 + dp_dpot * activation_factor  # BxH
            gW = T.dot(T.shape_padleft(x_im1, 1), dP_da_i).flatten() / B

            return (a_i -
                    T.dot(T.shape_padright(x_im1, 1), T.shape_padleft(w_i, 1)),
                    lp_accum + lp_current, dP_da_i, gW, gb_alpha, gV_alpha,
                    gb_mu, gV_mu, gb_sigma, gV_sigma, gfact)

        p_accum = T.zeros_like(X[0])
        dP_da_ip1 = T.zeros_like(top_activations)
        ([
            _, ps, _, gW, gb_alpha, gV_alpha, gb_mu, gV_mu, gb_sigma, gV_sigma,
            gfact
        ], updates2) = theano.scan(density_and_gradients,
                                   go_backwards=True,
                                   sequences=[
                                       X, Xs_m1, self.W, self.V_alpha,
                                       self.b_alpha, self.V_mu, self.b_mu,
                                       self.V_sigma, self.b_sigma,
                                       self.activation_rescaling
                                   ],
                                   outputs_info=[
                                       top_activations, p_accum, dP_da_ip1,
                                       None, None, None, None, None, None,
                                       None, None
                                   ])
        # scan with go_backwards returns the matrices in the order they were
        # created, so we have to reverse the order of the rows
        gW = gW[::-1, :]
        gb_alpha = gb_alpha[::-1, :]
        gV_alpha = gV_alpha[::-1, :, :]
        gb_mu = gb_mu[::-1, :]
        gV_mu = gV_mu[::-1, :, :]
        gb_sigma = gb_sigma[::-1, :]
        gV_sigma = gV_sigma[::-1, :, :]
        gfact = gfact[::-1]

        updates.update(updates2)  # Returns None
        return (ps[-1], gW, gb_alpha, gV_alpha, gb_mu, gV_mu, gb_sigma,
                gV_sigma, gfact, updates)
Example #32
    def test_kl_equivalence(self):

        "tests that the kl divergence for the two models is the same "

        """ This is a tricky task. The full KL-divergence is not tractable,
        but this is the quantity that's known to be the same for the two models
        (since the PDDBM should have 0 KL-divergence from g, since its weights
        are fixed to 0). The quantity we monitor inside the models is the
        "truncated KL divergence", the portion that depends on the variational
        parameters. In this case (S3C / PD-DBM with DBM weights fixed to 0) the
        partition function is also tractable, so we can include the terms that
        depend on the partition function. Fortunately this is enough of the
        KL divergence to guarantee that the quantity is the same for both models.
        There's another term that depends on P(v) which is still intractable but
        g has no effect on P(v) in this case since the DBM weights are fixed to
        0. """


        """

        Let Z represent all latent vars, V all visible vars

        KL(Q(Z)||P(Z|v)) = \sum_z Q(z) log Q(z) / P(z | v)
                         = \sum_z Q(z) log Q(z) - \sum_z Q(z) log P(z | v)
                         = - H_Q(Z) - \sum_z Q(z) log P(z,v) + \sum_z Q(z) log P(v)
                         = - H_Q(Z) - \sum_z Q(z) log exp(-E(z,v))/Z + log P(v)
                         = - H_Q(Z) - \sum_z Q(z) log exp(-E(z,v))
                                    + \sum_z Q(z) log Z + log P(v)
                         = - H_Q(Z) + \sum_z Q(z) E(z,v) + log Z + log P(v)
                         = - H_Q(Z) + E_{z \sim Q}[E(z,v)] + log Z + log P(v)

        """


        model = self.model
        ip = self.inference_procedure
        e_step = self.e_step
        X = self.X

        assert X.shape[0] == self.m

        H = np.cast[config.floatX](self.model.rng.uniform(0.,1.,(self.m, self.N)))
        S = np.cast[config.floatX](self.model.rng.uniform(-5.,5.,(self.m, self.N)))
        G = np.cast[config.floatX](
                broadcast(
                    sigmoid(self.model.dbm.rbms[0].bias_hid.get_value()), self.m))

        H_var = T.matrix(name='H_var')
        H_var.tag.test_value = H
        S_var = T.matrix(name='S_var')
        S_var.tag.test_value = S
        G_var = T.matrix(name='G_var')
        G_var.tag.test_value = G


        dbm_sigma0 = ip.infer_var_s0_hat()
        dbm_Sigma1 = ip.infer_var_s1_hat()

        dbm_trunc_kl = ip.truncated_KL( V = X, obs = { 'H_hat' : H_var,
                                                 'S_hat' : S_var,
                                                 'var_s0_hat' : dbm_sigma0,
                                                 'var_s1_hat' : dbm_Sigma1,
                                                 'G_hat' : ( G_var, ) } ).mean()

        #just the part related to G (check that it all comes out to 0)
        #dbm_trunc_kl = - entropy_binary_vector( G_var ).mean() - T.dot(G_var.mean(axis=0),self.model.dbm.rbms[0].bias_hid)

        assert len(dbm_trunc_kl.type.broadcastable) == 0


        s3c_sigma0 = e_step.infer_var_s0_hat()
        s3c_Sigma1 = e_step.infer_var_s1_hat()
        s3c_mu0 = T.zeros_like(self.s3c.mu)

        s3c_trunc_kl = e_step.truncated_KL( V = X, obs = { 'H_hat' : H_var,
            'S_hat' : S_var,
            'var_s0_hat' : s3c_sigma0,
            'var_s1_hat' : s3c_Sigma1 } )


        dbm_log_partition_function = self.model.s3c.log_partition_function() \
                + T.nnet.softplus(self.model.dbm.rbms[0].bias_hid).sum()

        #just the part related to G (check that it all comes out to 0)
        #dbm_log_partition_function = T.nnet.softplus(self.model.dbm.rbms[0].bias_hid).sum()

        s3c_log_partition_function = self.s3c.log_partition_function()

        s3c_partial_kl = s3c_trunc_kl.mean() + s3c_log_partition_function
        assert len(s3c_partial_kl.type.broadcastable) == 0
        dbm_partial_kl = dbm_trunc_kl + dbm_log_partition_function

        s3c_partial_kl, dbm_partial_kl = function([H_var,S_var,G_var],
                (s3c_partial_kl, dbm_partial_kl))(H,S,G)


        print s3c_partial_kl
        print dbm_partial_kl

        assert np.allclose(s3c_partial_kl, dbm_partial_kl)
Example #33
def SEIR(
    lambda_t_log,
    pr_beta_I_begin=100,
    pr_beta_new_E_begin=50,
    pr_median_mu=1 / 8,
    pr_mean_median_incubation=4,
    pr_sigma_median_incubation=1,
    sigma_incubation=0.4,
    pr_sigma_mu=0.2,
    model=None,
    return_all=False,
    save_all=False,
    name_median_incubation="median_incubation",
):
    r"""
        Implements a model similar to the susceptible-exposed-infected-recovered model. Instead of an exponentially decaying
        incubation period, the length of the period is lognormally distributed. The complete equation is:

         .. math::

            E_{\text{new}}(t) &= \lambda_t I(t-1) \frac{S(t)}{N}   \\
            S(t) &= S(t-1) - E_{\text{new}}(t)  \\
            I_\text{new}(t) &= \sum_{k=1}^{10} \beta(k) E_{\text{new}}(t-k)   \\
            I(t) &= I(t-1) + I_{\text{new}}(t) - \mu  I(t) \\
            \beta(k) &= P(k) \sim LogNormal(\log(d_{\text{incubation}}), \text{sigma\_incubation})

        The recovery rate :math:`\mu` and the incubation period are the same for all regions and follow, respectively:

        .. math::

             P(\mu) &\sim LogNormal(\log(\text{pr\_median\_mu}), \text{pr\_sigma\_mu}) \\
             P(d_{\text{incubation}}) &\sim Normal(\text{pr\_mean\_median\_incubation, pr\_sigma\_median\_incubation})

        The initial numbers of infected and newly exposed people differ for each region and follow prior
        :class:`~pymc3.distributions.continuous.HalfCauchy` distributions:

        .. math::

             E(t)  &\sim HalfCauchy(\text{pr\_beta\_new\_E\_begin}) \:\: \text{ for} \: t \in \{-9, -8, ..., 0\}\\
             I(0)  &\sim HalfCauchy(\text{pr\_beta\_I\_begin}).


        Parameters
        ----------
        lambda_t_log : :class:`~theano.tensor.TensorVariable`
            time series of the logarithm of the spreading rate, 1 or 2-dimensional. If 2-dimensional, the first
            dimension is time.
        pr_beta_I_begin : float or array_like
            Prior beta of the :class:`~pymc3.distributions.continuous.HalfCauchy` distribution of :math:`I(0)`.
        pr_beta_new_E_begin : float or array_like
            Prior beta of the :class:`~pymc3.distributions.continuous.HalfCauchy` distribution of :math:`E(0)`.
        pr_median_mu : float or array_like
            Prior for the median of the :class:`~pymc3.distributions.continuous.Lognormal` distribution of the recovery rate :math:`\mu`.
        pr_mean_median_incubation :
            Prior mean of the :class:`~pymc3.distributions.continuous.Normal` distribution of the median incubation delay  :math:`d_{\text{incubation}}`.
            Defaults to 4 days [Nishiura2020]_, which is the median serial interval (the important measure here is not exactly
            the incubation period, but the delay until a person becomes infectious which seems to be about
            1 day earlier than showing symptoms).
        pr_sigma_median_incubation :
            Prior sigma of the :class:`~pymc3.distributions.continuous.Normal` distribution of the median incubation delay  :math:`d_{\text{incubation}}`.
            Default is 1 day.
        sigma_incubation :
            Scale parameter of the :class:`~pymc3.distributions.continuous.Lognormal` distribution of the incubation time/
            delay until infectiousness. The default is set to 0.4, which is about the scale found in [Nishiura2020]_, [Lauer2020]_.
        pr_sigma_mu : float or array_like
            Prior for the sigma of the lognormal distribution of recovery rate :math:`\mu`.
        model : :class:`Cov19Model`
          if None, it is retrieved from the context
        return_all : bool
            if True, returns ``new_I_t``, ``new_E_t``, ``I_t``, ``S_t``, otherwise returns only ``new_I_t``
        save_all : bool
            if True, saves ``new_I_t``, ``new_E_t``, ``I_t``, ``S_t`` in the trace, otherwise it saves only ``new_I_t``
        name_median_incubation : str
            The name under which the median incubation time is saved in the trace

        Returns
        -------

        new_I_t : :class:`~theano.tensor.TensorVariable`
            time series of the number of daily newly infected persons.
        new_E_t : :class:`~theano.tensor.TensorVariable`
            time series of the number of daily newly exposed persons. (if return_all set to True)
        I_t : :class:`~theano.tensor.TensorVariable`
            time series of the infected (if return_all set to True)
        S_t : :class:`~theano.tensor.TensorVariable`
            time series of the susceptible (if return_all set to True)

        References
        ----------

        .. [Nishiura2020] Nishiura, H.; Linton, N. M.; Akhmetzhanov, A. R.
            Serial Interval of Novel Coronavirus (COVID-19) Infections.
            Int. J. Infect. Dis. 2020, 93, 284–286. https://doi.org/10.1016/j.ijid.2020.02.060.
        .. [Lauer2020] Lauer, S. A.; Grantz, K. H.; Bi, Q.; Jones, F. K.; Zheng, Q.; Meredith, H. R.; Azman, A. S.; Reich, N. G.; Lessler, J.
            The Incubation Period of Coronavirus Disease 2019 (COVID-19) From Publicly Reported Confirmed Cases: Estimation and Application.
            Ann Intern Med 2020. https://doi.org/10.7326/M20-0504.


    """
    model = modelcontext(model)

    # Build prior distributions:
    # --------------------------

    # Prior distribution of recovery rate mu
    mu = pm.Lognormal(
        name="mu",
        mu=np.log(pr_median_mu),
        sigma=pr_sigma_mu,
    )

    # Total number of people in population
    N = model.N_population

    # Number of regions as tuple of int
    num_regions = () if model.sim_ndim == 1 else model.sim_shape[1]

    # Prior distributions of starting populations (exposed, infectious, susceptibles)
    # We choose to consider the transitions of newly exposed people of the last 10 days.
    if num_regions == ():
        new_E_begin = pm.HalfCauchy(name="new_E_begin",
                                    beta=pr_beta_new_E_begin,
                                    shape=11)
    else:
        new_E_begin = pm.HalfCauchy(name="new_E_begin",
                                    beta=pr_beta_new_E_begin,
                                    shape=(11, num_regions))
    I_begin = pm.HalfCauchy(name="I_begin",
                            beta=pr_beta_I_begin,
                            shape=num_regions)
    S_begin = N - I_begin - pm.math.sum(new_E_begin, axis=0)

    lambda_t = tt.exp(lambda_t_log)
    new_I_0 = tt.zeros_like(I_begin)

    median_incubation = pm.Normal(
        name_median_incubation,
        mu=pr_mean_median_incubation,
        sigma=pr_sigma_median_incubation,
    )

    # Choose transition rates (E to I) according to incubation period distribution
    if not num_regions:
        x = np.arange(1, 11)
    else:
        x = np.arange(1, 11)[:, None]

    beta = mh.tt_lognormal(x, tt.log(median_incubation), sigma_incubation)

    # Runs SEIR model:
    def next_day(
        lambda_t,
        S_t,
        nE1,
        nE2,
        nE3,
        nE4,
        nE5,
        nE6,
        nE7,
        nE8,
        nE9,
        nE10,
        I_t,
        _,
        mu,
        beta,
        N,
    ):
        new_E_t = lambda_t / N * I_t * S_t
        S_t = S_t - new_E_t
        new_I_t = (beta[0] * nE1 + beta[1] * nE2 + beta[2] * nE3 +
                   beta[3] * nE4 + beta[4] * nE5 + beta[5] * nE6 +
                   beta[6] * nE7 + beta[7] * nE8 + beta[8] * nE9 +
                   beta[9] * nE10)
        I_t = I_t + new_I_t - mu * I_t
        I_t = tt.clip(I_t, 0, N)  # for stability
        S_t = tt.clip(S_t, 0, N)
        return S_t, new_E_t, I_t, new_I_t

    # theano scan returns two tuples, first one containing a time series of
    # what we give in outputs_info : S, E's, I, new_I
    outputs, _ = theano.scan(
        fn=next_day,
        sequences=[lambda_t],
        outputs_info=[
            S_begin,
            dict(initial=new_E_begin,
                 taps=[-1, -2, -3, -4, -5, -6, -7, -8, -9, -10]),
            I_begin,
            new_I_0,
        ],
        non_sequences=[mu, beta, N],
    )
    S_t, new_E_t, I_t, new_I_t = outputs
    pm.Deterministic("new_I_t", new_I_t)
    if save_all:
        pm.Deterministic("S_t", S_t)
        pm.Deterministic("I_t", I_t)
        pm.Deterministic("new_E_t", new_E_t)

    if return_all:
        return new_I_t, new_E_t, I_t, S_t
    else:
        return new_I_t
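
As a reading aid, here is a minimal NumPy sketch of the same discrete update that ``next_day`` performs inside ``theano.scan``: new exposures are drawn from the current infectious pool, and the newly infectious are the exposures of the last ten days weighted by the incubation kernel ``beta``. The function name ``seir_sketch`` and all numerical values are made up for illustration and are not part of the model code above.

import numpy as np

def seir_sketch(lambda_t, beta, new_E_begin, I_begin, N, mu):
    # new_E_begin: new exposures of the previous days, oldest first
    S_t = N - I_begin - np.sum(new_E_begin)
    I_t = I_begin
    new_E = list(new_E_begin)            # new_E[-1] is yesterday's exposures
    new_I_all = []
    for lam in lambda_t:
        new_E_t = lam / N * I_t * S_t
        S_t = min(max(S_t - new_E_t, 0.0), N)
        # newly infectious today: last 10 days of exposures weighted by beta,
        # most recent first, so beta[0] corresponds to exposures of 1 day ago
        recent = new_E[::-1][:10]
        new_I_t = sum(b * e for b, e in zip(beta, recent))
        I_t = min(max(I_t + new_I_t - mu * I_t, 0.0), N)
        new_E.append(new_E_t)
        new_I_all.append(new_I_t)
    return np.array(new_I_all)

# hypothetical toy values, chosen only to make the sketch runnable
print(seir_sketch(lambda_t=np.full(30, 0.3), beta=np.full(10, 0.1),
                  new_E_begin=np.full(10, 50.0), I_begin=100.0, N=1e6, mu=1 / 8))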
Example #34
0
    def _recog_exprs(self, inpt):
        """Return the exprssions of the recognition model."""
        P = self.parameters.recog

        n_layers = len(self.n_hiddens_recog)
        hidden_to_hiddens = [
            getattr(P, 'hidden_to_hidden_%i' % i) for i in range(n_layers - 1)
        ]
        hidden_biases = [
            getattr(P, 'hidden_bias_%i' % i) for i in range(n_layers)
        ]
        initial_hidden_means_fwd = [
            getattr(P, 'initial_hidden_means_fwd_%i' % i)
            for i in range(n_layers)
        ]
        initial_hidden_vars_fwd = [
            getattr(P, 'initial_hidden_vars_fwd_%i' % i)**2 + 1e-4
            for i in range(n_layers)
        ]
        initial_hidden_means_bwd = [
            getattr(P, 'initial_hidden_means_bwd_%i' % i)
            for i in range(n_layers)
        ]
        initial_hidden_vars_bwd = [
            getattr(P, 'initial_hidden_vars_bwd_%i' % i)**2 + 1e-4
            for i in range(n_layers)
        ]
        recurrents_fwd = [
            getattr(P, 'recurrent_fwd_%i' % i) for i in range(n_layers)
        ]
        recurrents_bwd = [
            getattr(P, 'recurrent_bwd_%i' % i) for i in range(n_layers)
        ]

        p_dropouts = ([P.p_dropout.inpt] + P.p_dropout.hiddens +
                      [P.p_dropout.hidden_to_out])

        # Reparametrize to ensure the rates lie in (0.025, 1 - 0.025).
        p_dropouts = [T.nnet.sigmoid(i) * 0.95 + 0.025 for i in p_dropouts]

        exprs = vpbrnn.exprs(inpt,
                             T.zeros_like(inpt),
                             P.in_to_hidden,
                             hidden_to_hiddens,
                             P.hidden_to_out,
                             hidden_biases, [1 for _ in hidden_biases],
                             initial_hidden_means_fwd,
                             initial_hidden_vars_fwd,
                             initial_hidden_means_bwd,
                             initial_hidden_vars_bwd,
                             recurrents_fwd,
                             recurrents_bwd,
                             P.out_bias,
                             1,
                             self.recog_transfers,
                             self.assumptions.statify_latent,
                             p_dropouts=p_dropouts)
        exprs['inpt'] = inpt

        #to_shortcut = self.exprs['inpt']
        to_shortcut = self.exprs['inpt']

        shortcut = T.concatenate(
            [T.zeros_like(to_shortcut[:1]), to_shortcut[:-1]])

        # Hic sunt dracones.
        # If we do not keep this line, Theano will die with a segfault.
        shortcut_empty = T.set_subtensor(
            T.zeros_like(shortcut)[:, :, :], shortcut)

        exprs['shortcut'] = shortcut_empty

        return exprs
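
The ``T.set_subtensor(T.zeros_like(shortcut)[:, :, :], shortcut)`` idiom above writes a tensor into a fresh zero tensor of the same shape, which amounts to forcing a copy; the authors keep it only to work around a Theano segfault. Below is a self-contained sketch of the pattern, assuming a working Theano installation; the variable names are illustrative, not taken from the code above.

import numpy as np
import theano
import theano.tensor as T

x = T.tensor3('x')
# writing x into a zero tensor of the same shape yields a fresh copy of x
x_copy = T.set_subtensor(T.zeros_like(x)[:, :, :], x)
f = theano.function([x], x_copy)

a = np.arange(24, dtype=theano.config.floatX).reshape(2, 3, 4)
assert np.allclose(f(a), a)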
Example #35
0
def zeros_like(x):
    return T.zeros_like(x)
Example #36
0
 def statify_visible(self, X, var=None):
     if var is not None:
         return sigmoid(X, var)
     else:
         return sigmoid(X, T.zeros_like(X))
Example #37
0
 def nll_prior(self, X):
     X_flat = X.flatten()
     nll = -normal_logpdf(X_flat, T.zeros_like(X_flat), T.ones_like(X_flat))
     return nll.reshape(X.shape)
Example #38
0
 def logp(self, value):
     return T.zeros_like(value)
Example #39
0
def jobman(_options, channel=None):

    ################### PARSE INPUT ARGUMENTS #######################
    o = parse_input_arguments(_options,
                              'RNN_theano/rnn_sinsum001/RNN_sumsin.ini')
    ####################### DEFINE THE TASK #########################

    mode = Mode(linker='cvm_nogc', optimizer='fast_run')
    rng = numpy.random.RandomState(o['seed'])
    train_set = sumsin(T=o['task_T'],
                       steps=o['task_steps'],
                       batches=o['task_train_batches'],
                       batch_size=o['task_train_batchsize'],
                       noise=o['task_noise'],
                       rng=rng)

    valid_set = sumsin(T=o['task_T'],
                       steps=o['task_steps'],
                       batches=o['task_valid_batches'],
                       batch_size=o['task_valid_batchsize'],
                       rng=rng)

    test_set = sumsin(T=o['task_T'],
                      steps=o['task_steps'],
                      batches=o['task_test_batches'],
                      batch_size=o['task_test_batchsize'],
                      rng=rng)
    if o['wout_pinv']:
        wout_set = sumsin(T=o['task_T'],
                          steps=o['task_steps'],
                          batches=o['task_wout_batches'],
                          batch_size=o['task_wout_batchsize'],
                          noise=o['task_wout_noise'],
                          rng=rng)

    ###################### DEFINE THE MODEL #########################

    def recurrent_fn(u_t, h_tm1, W_hh, W_ux, W_hy, b):
        x_t = TT.dot(W_ux, u_t)
        h_t = TT.tanh(TT.dot(W_hh, h_tm1) + x_t + b)
        #y_t = TT.dot(W_hy, h_t)
        return h_t  #, y_t

    u = TT.matrix('u')
    t = TT.matrix('t')
    h0 = TT.vector('h0')
    b = shared_shape(
        floatX(
            numpy.random.uniform(size=(o['nhid'], ),
                                 low=-o['Wux_properties']['scale'],
                                 high=o['Wux_properties']['scale'])))
    alpha = TT.scalar('alpha')
    lr = TT.scalar('lr')

    W_hh = init(o['nhid'], o['nhid'], 'W_hh', o['Whh_style'],
                o['Whh_properties'], rng)

    W_ux_mask = numpy.ones((o['nhid'], train_set.n_ins),
                           dtype=theano.config.floatX)
    if o['Wux_mask_limit'] > 0:
        W_ux_mask[:o['Wux_mask_limit']] = 0.
    W_ux = init(o['nhid'],
                train_set.n_ins,
                'W_ux',
                o['Wux_style'],
                o['Wux_properties'],
                rng,
                mask=W_ux_mask)

    W_hy = init(train_set.n_outs, o['nhid'], 'W_hy', o['Why_style'],
                o['Why_properties'], rng)
    h, _ = theano.scan(recurrent_fn,
                       sequences=u,
                       outputs_info=h0,
                       non_sequences=[W_hh, W_ux, W_hy, b],
                       name='recurrent_fn',
                       mode=mode)
    y = TT.dot(W_hy, h.T)
    init_h = h.owner.inputs[0].owner.inputs[2]

    #h = theano.printing.Print('h',attrs=('shape',))(h)
    if o['error_over_all']:
        out_err = TT.mean((y - t)**2, axis=1)
        err = out_err.mean()
    else:
        out_err = ((y[-1] - t)**2).mean(axis=1)
        err = out_err.mean()
    # Regularization term
    if o['reg_projection'] == 'h[-1]':
        cost = h[-1].sum()
    elif o['reg_projection'] == 'err':
        cost = err
    elif o['reg_projection'] == 'random':
        trng = TT.shared_randomstreams.RandomStreams(rng.randint(1e6))
        proj = trng.uniform(size=h[-1].shape)
        if o['sum_h2'] > 0:
            proj = TT.join(0, proj[:o['sum_h2']],
                           TT.zeros_like(proj[o['sum_h2']:]))
        cost = TT.sum(proj * h[-1])

    z, gh = TT.grad(cost, [init_h, h])
    z.name = '__z__'
    #import GPUscan.ipdb; GPUscan.ipdb.set_trace()
    #z = z
    zsec = z[:-1] - gh
    if o['sum_h'] > 0:
        z2_1 = TT.sum(z[:, :o['sum_h']]**2, axis=1)
        z2_2 = TT.sum(zsec[:, :o['sum_h']]**2, axis=1)
    else:
        z2_1 = TT.sum(z**2, axis=1)
        z2_2 = TT.sum(zsec**2, axis=1)
    v1 = z2_2
    v2 = z2_1[1:]
    ## ## v2 = theano.printing.Print('v2')(v2)
    # floatX(1e-14)
    ratios = TT.switch(TT.ge(v2, 1e-12), TT.sqrt(v1 / v2), floatX(1))
    norm_0 = TT.ones_like(ratios[0])
    norm_t, _ = theano.scan(lambda x, y: x * y,
                            sequences=ratios,
                            outputs_info=norm_0,
                            name='jacobian_products',
                            mode=mode)
    norm_term = TT.sum(norm_t)
    if o['reg_cost'] == 'product':
        r = abs(TT.log(norm_t)).sum()
    elif o['reg_cost'] == 'each':
        part1 = abs(TT.log(ratios))
        part2 = TT.switch(TT.ge(v2, 1e-12), part1, 1 - v2)
        r = part2.sum()
    elif o['reg_cost'] == 'product2':
        ratios2 = TT.switch(TT.ge(z2[-1], 1e-12), TT.sqrt(z2 / z2[-1]),
                            floatX(1))
        r = abs(TT.log(ratios2)).sum()

    ratios = TT.switch(TT.ge(v2, 1e-12), TT.sqrt(v1 / v2), floatX(1e-12))[::-1]
    norm_0 = TT.ones_like(ratios[0])
    norm_t, _ = theano.scan(lambda x, y: x * y,
                            sequences=ratios,
                            outputs_info=norm_0,
                            name='jacobian_products',
                            mode=mode)
    norm_term = floatX(0.1) + TT.sum(norm_t)
    gu = TT.grad(y[-1].sum(), u)

    if o['opt_alg'] == 'sgd':
        get_updates = lambda p, e, up: (
            sgd(p, e,
                lr=lr,
                scale=TT.maximum(my1 / norm_term, floatX(0.01)),
                updates=up)[0],
            [[], [], [TT.constant(0) for x in p]])
    elif o['opt_alg'] == 'sgd_qn':
        get_updates = lambda p, e, up: sgd_qn(
            p,
            e,
            mylambda=floatX(o['mylambda']),
            t0=floatX(o['t0']),
            skip=floatX(o['skip']),
            scale=TT.maximum(my1 / norm_term, floatX(0.01)),
            lazy=o['lazy'],
            updates=up)

    if o['win_reg']:
        updates, why_extra = get_updates([W_hy], err, {})
        cost = err + alpha * r
        W_ux.name = 'W_ux'
        W_hh.name = 'W_hh'
        b.name = 'b'
        updates, extras = get_updates([W_ux, W_hh, b], cost, updates)
        updates[W_ux] = updates[W_ux] * W_ux_mask
        b_Why = why_extra[2][0]
        b_Wux = extras[2][0]
        b_Whh = extras[2][1]
        b_b = extras[2][2]
    else:
        updates, extras1 = get_updates([W_hy, W_ux], err, {})
        updates[W_ux] = updates[W_ux] * W_ux_mask
        cost = err + alpha * r
        updates, extras2 = get_updates([W_hh, b], cost, updates)
        b_Why = extras1[2][0]
        b_Wux = extras1[2][1]
        b_Whh = extras2[2][0]
        b_b = extras2[2][1]

    nhid = o['nhid']
    train_batchsize = o['task_train_batchsize']
    valid_batchsize = o['task_valid_batchsize']
    test_batchsize = o['task_test_batchsize']
    wout_batchsize = o['task_wout_batchsize']

    train_h0 = shared_shape(floatX(numpy.zeros((nhid, ))))
    valid_h0 = shared_shape(floatX(numpy.zeros((nhid, ))))
    test_h0 = shared_shape(floatX(numpy.zeros((nhid, ))))
    wout_h0 = shared_shape(floatX(numpy.zeros((nhid, ))))
    idx = TT.iscalar('idx')
    train_u, train_t = train_set(idx)
    u.tag.shape = copy.copy(train_u.tag.shape)
    t.tag.shape = copy.copy(train_t.tag.shape)
    train = theano.function([u, t, lr, alpha], [out_err, r, norm_term],
                            updates=updates,
                            mode=mode,
                            givens={h0: train_h0})

    valid_u, valid_t = valid_set(idx)
    u.tag.shape = copy.copy(valid_u.tag.shape)
    t.tag.shape = copy.copy(valid_t.tag.shape)
    valid = theano.function([u, t], [out_err, r, norm_term],
                            mode=mode,
                            givens={h0: valid_h0})

    test_u, test_t = test_set(idx)
    u.tag.shape = copy.copy(test_u.tag.shape)
    t.tag.shape = copy.copy(test_t.tag.shape)
    test = theano.function([u, t], [
        out_err, r, norm_term, W_hh, W_ux, W_hy, b, z, y, h, u, gu, t, b_Whh,
        b_Wux, b_Why, b_b, zsec, gh
    ],
                           mode=mode,
                           givens={h0: test_h0})
    if o['wout_pinv']:
        wout_u, wout_t = wout_set.get_whole_tensors()

        def wiener_hopf_fn(u_t, t_t, H_tm1, Y_tm1, W_hh, W_ux, b, h0):
            def recurrent_fn(u_t, h_tm1, W_hh, W_ux, b):
                x_t = TT.dot(W_ux, u_t)
                h_t = TT.tanh(TT.dot(W_hh, h_tm1) + x_t + b)
                return h_t

            h_t, _ = theano.scan(recurrent_fn,
                                 sequences=u_t,
                                 outputs_info=h0,
                                 non_sequences=[W_hh, W_ux, b],
                                 name='recurrent_fn',
                                 mode=mode)
            H_t = H_tm1 + TT.dot(h_t[-1], h_t[-1].T)
            Y_t = Y_tm1 + TT.dot(h_t[-1], t_t.T)
            return H_t, Y_t

        H_0 = shared_shape(numpy.zeros((o['nhid'], o['nhid']),
                                       dtype=theano.config.floatX),
                           name='H0')
        Y_0 = shared_shape(numpy.zeros((o['nhid'], 1),
                                       dtype=theano.config.floatX),
                           name='Y0')
        all_u = TT.tensor4('whole_u')
        all_t = TT.tensor3('whole_t')
        [H, Y], _ = theano.scan(
            wiener_hopf_fn,
            sequences=[all_u, all_t],
            outputs_info=[H_0, Y_0],
            non_sequences=[W_hh, W_ux, TT.shape_padright(b), h0],
            name='wiener_hopf_fn',
            mode=mode)
        length = TT.cast(all_u.shape[0] * all_u.shape[3],
                         dtype=theano.config.floatX)
        H = H[-1] / length
        Y = Y[-1] / length
        H = H + floatX(o['wiener_lambda']) * TT.eye(o['nhid'])
        W_hy_solve = theano_linalg.solve(H, Y).T
        wout = theano.function([idx], [],
                               mode=mode,
                               updates={W_hy: W_hy_solve},
                               givens={
                                   all_u: wout_u,
                                   all_t: wout_t,
                                   h0: wout_h0
                               })
    '''
    theano.printing.pydotprint(train, 'train.png', high_contrast=True,
                               with_ids= True)
    for idx,node in enumerate(train.maker.env.toposort()):
        if node.op.__class__.__name__ == 'Scan':
            theano.printing.pydotprint(node.op.fn,
                                       ('train%d_'%idx)+node.op.name,
                                       high_contrast = True,
                                       with_ids = True)

    theano.printing.pydotprint(train, 'valid.png', high_contrast=True,
                              with_ids = True)
    for idx,node in enumerate(train.maker.env.toposort()):
        if node.op.__class__.__name__ == 'Scan':
            theano.printing.pydotprint(node.op.fn,
                                       ('valid%d_'%idx)+node.op.name,
                                       high_contrast = True,
                                      with_ids = True)
    theano.printing.pydotprint(train, 'test.png', high_contrast=True,
                              with_ids = True)
    for idx,node in enumerate(train.maker.env.toposort()):
        if node.op.__class__.__name__ == 'Scan':
            theano.printing.pydotprint(node.op.fn,
                                       ('test%d_'%idx)+node.op.name,
                                       high_contrast = True,
                                      with_ids = True)
    if o['wout_pinv']:
        theano.printing.pydotprint(train, 'wout.png', high_contrast=True,
                                  with_ids = True)
        for idx,node in enumerate(train.maker.env.toposort()):
            if node.op.__class__.__name__ == 'Scan':
                theano.printing.pydotprint(node.op.fn,
                                       ('wout%d_'%idx)+node.op.name,
                                       high_contrast = True,
                                          with_ids= True)

    '''

    #import GPUscan.ipdb; GPUscan.ipdb.set_trace()
    #rval = valid(valid_set.data_u[0],valid_set.data_t[0])

    #################### DEFINE THE MAIN LOOP #######################

    data = {}
    fix_len = o['max_storage_numpy']  #int(o['NN']/o['small_step'])
    avg_train_err = numpy.zeros((o['small_step'], train_set.n_outs))
    avg_train_reg = numpy.zeros((o['small_step'], ))
    avg_train_norm = numpy.zeros((o['small_step'], ))
    avg_valid_err = numpy.zeros((o['small_step'], train_set.n_outs))
    avg_valid_reg = numpy.zeros((o['small_step'], ))
    avg_valid_norm = numpy.zeros((o['small_step'], ))

    data['options'] = o
    data['train_err'] = -1 * numpy.ones((fix_len, train_set.n_outs))
    data['valid_err'] = -1 * numpy.ones((fix_len, train_set.n_outs))
    data['train_reg'] = -1 * numpy.ones((fix_len, ))
    data['valid_reg'] = -1 * numpy.ones((fix_len, ))
    data['train_norm'] = numpy.zeros((fix_len, ))
    data['valid_norm'] = numpy.zeros((fix_len, ))

    data['test_err'] = [None] * o['max_storage']
    data['test_idx'] = [None] * o['max_storage']
    data['test_reg'] = [None] * o['max_storage']
    data['test_norm'] = [None] * o['max_storage']
    data['y'] = [None] * o['max_storage']
    data['z'] = [None] * o['max_storage']
    data['t'] = [None] * o['max_storage']
    data['h'] = [None] * o['max_storage']
    data['u'] = [None] * o['max_storage']
    data['gu'] = [None] * o['max_storage']
    data['W_hh'] = [None] * o['max_storage']
    data['W_ux'] = [None] * o['max_storage']
    data['W_hy'] = [None] * o['max_storage']
    data['b'] = [None] * o['max_storage']
    data['b_ux'] = [None] * o['max_storage']
    data['b_hy'] = [None] * o['max_storage']
    data['b_hh'] = [None] * o['max_storage']
    data['b_b'] = [None] * o['max_storage']
    data['stuff'] = []
    storage_exceeded = False
    stop = False

    old_rval = numpy.inf
    patience = o['patience']
    n_train = o['task_train_batches']
    n_valid = o['task_valid_batches']
    n_test = o['task_test_batches']
    n_test_runs = 0
    test_pos = 0

    valid_set.refresh()
    test_set.refresh()
    kdx = 0
    lr_v = floatX(o['lr'])
    alpha_v = floatX(o['alpha'])
    lr_f = 1
    if o['lr_scheme']:
        lr_f = o['lr_scheme'][1] / (o['NN'] - o['lr_scheme'][0])
    alpha_r = 1
    if o['alpha_scheme']:
        alpha_r = float(o['alpha_scheme'][1] - o['alpha_scheme'][0])

    st = time.time()
    if channel:
        try:
            channel.save()
        except:
            pass
    for idx in xrange(int(o['NN'])):
        if o['lr_scheme'] and idx > o['lr_scheme'][0]:
            lr_v = floatX(o['lr'] * 1. / (1. +
                                          (idx - o['lr_scheme'][0]) * lr_f))
        if o['alpha_scheme']:
            if idx < o['alpha_scheme'][0]:
                alpha_v = floatX(0)
            elif idx < o['alpha_scheme'][1]:
                pos = 2. * (idx - o['alpha_scheme'][0]) / alpha_r - 1.
                alpha_v = floatX(numpy.exp(-pos**2 / 0.2) * o['alpha'])
            else:
                alpha_v = floatX(0)

        jdx = idx % o['small_step']
        avg_train_err[jdx, :] = 0
        avg_train_reg[jdx] = 0
        avg_train_norm[jdx] = 0

        avg_valid_err[jdx, :] = 0
        avg_valid_reg[jdx] = 0
        avg_valid_norm[jdx] = 0

        if o['wout_pinv'] and (idx % o['test_step'] == 0):
            wout_set.refresh()
            print(
                '* Re-computing W_hy using closed-form '
                'regularized wiener hopf formula')
            st_wout = time.time()
            wout(0)
            ed_wout = time.time()
            print '** It took ', ed_wout - st_wout, 'secs'
            print '** Average weight', abs(W_hy.get_value(borrow=True)).mean()

        for k in xrange(o['task_train_batches']):
            s, t = train_set.get_slice()
            rval = train(s, t, lr_v, alpha_v)
            print '[',idx,'/',patience,'][',k,'/',n_train,'][train]', rval[0].mean(), \
                    rval[1], rval[2], numpy.max([(1./rval[2]), 0.01])*lr_v, alpha_v
            avg_train_err[jdx, :] += rval[0]
            avg_train_reg[jdx] += rval[1]
            avg_train_norm[jdx] += rval[2]
        print '**Epoch took', time.time() - st, 'secs'
        avg_train_err[jdx] /= n_train
        avg_train_reg[jdx] /= n_train
        avg_train_norm[jdx] /= n_train
        st = time.time()

        for k in xrange(n_valid):
            rval = valid(*valid_set.get_slice())
            print '[',idx,'/',patience,'][',k,'/',n_valid,'][valid]', rval[0].mean(), \
                    rval[1], rval[2]
            avg_valid_err[jdx] += rval[0]
            avg_valid_reg[jdx] += rval[1]
            avg_valid_norm[jdx] += rval[2]

        avg_valid_err[jdx] /= n_valid
        avg_valid_reg[jdx] /= n_valid
        avg_valid_norm[jdx] /= n_valid
        if idx >= o['small_step'] and idx % o['small_step'] == 0:
            kdx += 1
            if kdx >= o['max_storage_numpy']:
                kdx = o['max_storage_numpy'] // 3
                storage_exceeded = True

            data['steps'] = idx
            data['kdx'] = kdx
            data['storage_exceeded'] = storage_exceeded
            data['train_err'][kdx] = avg_train_err.mean()
            data['valid_err'][kdx] = avg_valid_err.mean()
            data['train_reg'][kdx] = avg_train_reg.mean()
            data['valid_reg'][kdx] = avg_valid_reg.mean()
            data['train_norm'][kdx] = avg_train_norm.mean()
            data['valid_norm'][kdx] = avg_valid_norm.mean()
            if channel:
                try:
                    _options['trainerr'] = data['train_err'][kdx].mean()
                    _options['maxtrainerr'] = data['train_err'][kdx].max()
                    _options['trainreg'] = data['train_reg'][kdx]
                    _options['trainnorm'] = data['train_norm'][kdx]
                    _options['validerr'] = data['valid_err'][kdx].mean()
                    _options['maxvaliderr'] = data['valid_err'][kdx].max()
                    _options['validreg'] = data['valid_reg'][kdx]
                    _options['validnorm'] = data['valid_norm'][kdx]
                    _options['steps'] = idx
                    _options['patience'] = patience
                    channel.save()
                except:
                    pass

                test_err = []
                test_reg = []
                test_norm = []

                for k in xrange(n_test):
                    rval = test(*test_set.get_slice())
                    print '[',idx,'][',k,'/',n_test,'][test]',rval[0].mean()\
                        , rval[1], rval[2]
                    test_err += [rval[0]]
                    test_reg += [rval[1]]
                    test_norm += [rval[2]]
                    test_z = rval[7][:, :]
                    test_y = rval[8][:, :]
                    test_h = rval[9][:, :]
                    test_u = rval[10][:, :]
                    test_gu = rval[11][:, :]
                    test_t = rval[12][:, :]
                data['test_idx'][test_pos] = idx
                data['test_pos'] = test_pos
                data['y'][test_pos] = test_y
                data['z'][test_pos] = test_z
                data['t'][test_pos] = test_t
                data['h'][test_pos] = test_h
                data['u'][test_pos] = test_u
                data['gu'][test_pos] = test_gu
                data['test_err'][test_pos] = test_err
                data['test_reg'][test_pos] = test_reg
                data['test_norm'][test_pos] = test_norm
                data['W_hh'][test_pos] = rval[3]
                data['W_ux'][test_pos] = rval[4]
                data['W_hy'][test_pos] = rval[5]
                data['b'][test_pos] = rval[6]
                data['b_hh'][test_pos] = rval[13]
                data['b_ux'][test_pos] = rval[14]
                data['b_hy'][test_pos] = rval[15]
                data['b_b'][test_pos] = rval[16]
                data['stuff'] += [(rval[17], rval[18])]
            cPickle.dump(
                data,
                open(
                    os.path.join(configs.results_folder(), o['path'],
                                 '%s_backup.pkl' % o['name']), 'wb'))

        print '** ', avg_valid_err[jdx].mean(), ' < ', old_rval, ' ? '
        if avg_valid_err[jdx].mean() < old_rval:

            patience += o['patience_incr']
            if avg_valid_err[jdx].mean() < old_rval:

                test_err = []
                test_reg = []
                test_norm = []

                for k in xrange(n_test):
                    rval = test(*test_set.get_slice())
                    print '[',idx,'][',k,'/',n_test,'][test]',rval[0].mean()\
                        , rval[1], rval[2]
                    test_err += [rval[0]]
                    test_reg += [rval[1]]
                    test_norm += [rval[2]]
                    test_z = rval[7][:, :]
                    test_y = rval[8][:, :]
                    test_h = rval[9][:, :]
                    test_u = rval[10][:, :]
                    test_gu = rval[11][:, :]
                    test_t = rval[12][:, :]
                data['test_idx'][test_pos] = idx
                data['test_pos'] = test_pos
                data['y'][test_pos] = test_y
                data['z'][test_pos] = test_z
                data['t'][test_pos] = test_t
                data['h'][test_pos] = test_h
                data['u'][test_pos] = test_u
                data['gu'][test_pos] = test_gu
                data['test_err'][test_pos] = test_err
                data['test_reg'][test_pos] = test_reg
                data['test_norm'][test_pos] = test_norm
                data['W_hh'][test_pos] = rval[3]
                data['W_ux'][test_pos] = rval[4]
                data['W_hy'][test_pos] = rval[5]
                data['b'][test_pos] = rval[6]
                data['b_hh'][test_pos] = rval[13]
                data['b_ux'][test_pos] = rval[14]
                data['b_hy'][test_pos] = rval[15]
                data['b_b'][test_pos] = rval[16]
                data['stuff'] += [(rval[17], rval[18])]

                cPickle.dump(
                    data,
                    open(
                        os.path.join(configs.results_folder(), o['path'],
                                     '%s.pkl' % o['name']), 'wb'))
                n_test_runs += 1
                test_pos += 1
                if test_pos >= o['max_storage']:
                    test_pos = test_pos - o['go_back']
                if numpy.mean(test_err) < 5e-5:
                    patience = idx - 5
                    break

            old_rval = avg_valid_err[jdx].mean()
        if idx > patience:
            break
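
The ``wout`` function above recomputes the readout weights in closed form: it accumulates ``H = mean(h h^T)`` and ``Y = mean(h t^T)`` over the data, adds the ridge term ``wiener_lambda * I``, and sets ``W_hy = solve(H, Y).T``. A small NumPy sketch of that regularized least-squares readout follows; ``ridge_readout`` and the toy shapes are hypothetical and only illustrate the linear algebra, not the exact batching of the code above.

import numpy as np

def ridge_readout(states, targets, ridge_lambda):
    # states: (n_samples, n_hidden) hidden states, targets: (n_samples, n_out)
    n = states.shape[0]
    H = states.T @ states / n + ridge_lambda * np.eye(states.shape[1])
    Y = states.T @ targets / n
    return np.linalg.solve(H, Y).T          # shape (n_out, n_hidden), like W_hy

# hypothetical toy data
rng = np.random.default_rng(0)
states = rng.standard_normal((500, 50))
targets = rng.standard_normal((500, 1))
W_hy = ridge_readout(states, targets, ridge_lambda=1e-2)
print(W_hy.shape)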
Example #40
0
def train(random_seed=1234,
          dim_word=256, # word vector dimensionality
          ctx_dim=-1, # context vector dimensionality, auto set
          dim=1000, # the number of LSTM units
          n_layers_out=1,
          n_layers_init=1,
          encoder='none',
          encoder_dim=100,
          prev2out=False,
          ctx2out=False,
          patience=10,
          max_epochs=5000,
          dispFreq=100,
          decay_c=0.,
          alpha_c=0.,
          alpha_entropy_r=0.,
          lrate=0.01,
          selector=False,
          n_words=100000,
          maxlen=100, # maximum length of the description
          optimizer='adadelta',
          clip_c=2.,
          batch_size = 64,
          valid_batch_size = 64,
          save_model_dir='/data/lisatmp3/yaoli/exp/capgen_vid/attention/test/',
          validFreq=10,
          saveFreq=10, # save the parameters after every saveFreq updates
          sampleFreq=10, # generate some samples after every sampleFreq updates
          metric='blue',
          dataset='youtube2text',
          video_feature='googlenet',
          use_dropout=False,
          reload_=False,
          from_dir=None,
          K1=28,
          K2=10,
          OutOf=240,
          verbose=True,
          debug=True
          ):
    rng_numpy, rng_theano = utils.get_two_rngs()

    model_options = locals().copy()
    if 'self' in model_options:
        del model_options['self']
    with open('%smodel_options.pkl'%save_model_dir, 'wb') as f:
        pkl.dump(model_options, f)

    # instance model
    layers = Layers()
    model = Model()

    print 'Loading data'
    engine = data_engine.Movie2Caption('attention', dataset,
                                       video_feature,
                                       batch_size, valid_batch_size,
                                       maxlen, n_words,
                                       K1, K2, OutOf)
    model_options['ctx_dim'] = engine.ctx_dim
    model_options['n_words'] = engine.n_words
    print 'n_words:', model_options['n_words']

    # set test values, for debugging
    idx = engine.kf_train[0]
    [x_tv, mask_tv,
     ctx_tv, ctx_mask_tv,
     ctx_tv_c, ctx_mask_tv_c] = data_engine.prepare_data(
        engine, [engine.train[index] for index in idx])

    print 'init params'
    t0 = time.time()
    params = model.init_params(model_options)

    # reloading
    if reload_:
        model_saved = from_dir+'/model_best_so_far.npz'
        assert os.path.isfile(model_saved)
        print "Reloading model params..."
        params = utils.load_params(model_saved, params)

    tparams = utils.init_tparams(params)

    trng, use_noise, \
          x, mask, ctx, mask_ctx, ctx_c, mask_ctx_c, \
          cost, extra = \
          model.build_model(tparams, model_options)
    print 'build model done!'
    alphas = extra[1]
    alphas_c = extra[2]
    betas = extra[3]
    betas_c = extra[4]
    print 'building sampler'
    f_init, f_next = model.build_sampler(tparams, model_options, use_noise, trng)
    # before any regularizer
    print 'building f_log_probs'
    f_log_probs = theano.function([x, mask, ctx, mask_ctx, ctx_c, mask_ctx_c], -cost,
                                  profile=False, on_unused_input='ignore')

    cost = cost.mean()
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    if alpha_c > 0.:
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * ((1. - alphas.sum(0)) ** 2).sum(-1).mean()
        cost += alpha_reg
        alpha_reg_c = alpha_c * ((1. - alphas_c.sum(0)) ** 2).sum(-1).mean()
        cost += alpha_reg_c

    if alpha_entropy_r > 0:
        alpha_entropy_r = theano.shared(numpy.float32(alpha_entropy_r),
                                        name='alpha_entropy_r')
        alpha_reg_2 = alpha_entropy_r * (-tensor.sum(alphas *
                    tensor.log(alphas+1e-8),axis=-1)).sum(-1).mean()
        cost += alpha_reg_2
    else:
        alpha_reg_2 = tensor.zeros_like(cost)
    print 'building f_alpha'
    f_alpha = theano.function([x, mask, ctx, ctx_c, mask_ctx, mask_ctx_c],
                              [alphas, alphas_c, betas, betas_c],
                              name='f_alpha',
                              on_unused_input='ignore')

    print 'compute grad'
    grads = tensor.grad(cost, wrt=utils.itemlist(tparams))
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(tensor.switch(g2 > (clip_c**2),
                                           g / tensor.sqrt(g2) * clip_c,
                                           g))
        grads = new_grads

    lr = tensor.scalar(name='lr')
    print 'build train fns'
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads,
                                              [x, mask, ctx, ctx_c, mask_ctx, mask_ctx_c], cost,
                                              extra + grads)

    print 'compilation took %.4f sec'%(time.time()-t0)
    print 'Optimization'

    history_errs = []
    # reload history
    if reload_:
        print 'loading history error...'
        history_errs = numpy.load(
            from_dir+'model_best_so_far.npz')['history_errs'].tolist()

    bad_counter = 0

    processes = None
    queue = None
    rqueue = None
    shared_params = None

    uidx = 0
    uidx_best_blue = 0
    uidx_best_valid_err = 0
    estop = False
    best_p = utils.unzip(tparams)
    best_blue_valid = 0
    best_valid_err = 999
    alphas_ratio = []
    for eidx in xrange(max_epochs):
        n_samples = 0
        train_costs = []
        grads_record = []
        print 'Epoch ', eidx
        for idx in engine.kf_train:
            tags = [engine.train[index] for index in idx]
            n_samples += len(tags)
            uidx += 1
            use_noise.set_value(1.)

            pd_start = time.time()
            x, mask, ctx, ctx_mask, ctx_c, ctx_mask_c = data_engine.prepare_data(
                engine, tags)
            pd_duration = time.time() - pd_start
            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                continue

            ud_start = time.time()
            rvals = f_grad_shared(x, mask, ctx, ctx_c, ctx_mask, ctx_mask_c)
            cost = rvals[0]
            probs = rvals[1]
            alphas = rvals[2]
            alphas_c = rvals[3]
            betas = rvals[4]
            betas_c = rvals[5]
            grads = rvals[6:]
            grads, NaN_keys = utils.grad_nan_report(grads, tparams)
            if len(grads_record) >= 5:
                del grads_record[0]
            grads_record.append(grads)
            if NaN_keys != []:
                print 'grads contain NaN'
                import pdb; pdb.set_trace()
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected in cost'
                import pdb; pdb.set_trace()
            # update params
            f_update(lrate)
            ud_duration = time.time() - ud_start

            if eidx == 0:
                train_error = cost
            else:
                train_error = train_error * 0.95 + cost * 0.05
            train_costs.append(cost)

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Train cost mean so far', \
                  train_error, 'fetching data time spent (sec)', pd_duration, \
                  'update time spent (sec)', ud_duration, 'save_dir', save_model_dir
                alphas, alphas_c, betas, betas_c = f_alpha(x, mask, ctx, ctx_c, ctx_mask, ctx_mask_c)
                counts = mask.sum(0)
                betas_mean = (betas * mask).sum(0) / counts
                betas_mean = betas_mean.mean()
                betas_mean_c = (betas_c * mask).sum(0) / counts
                betas_mean_c = betas_mean_c.mean()
                print 'alpha ratio %.3f, betas mean %.3f'%(
                    alphas.min(-1).mean() / (alphas.max(-1)).mean(), betas_mean)
                l = 0
                for vv in x[:, 0]:
                    if vv == 0:
                        break
                    if vv in engine.word_idict:
                        print '(', numpy.round(betas[l, 0], 3), ')', engine.word_idict[vv],
                    else:
                        print '(', numpy.round(betas[l, 0], 3), ')', 'UNK',
                    l += 1
                print '(', numpy.round(betas[l, 0], 3), ')'

            if numpy.mod(uidx, saveFreq) == 0:
                pass

            if numpy.mod(uidx, sampleFreq) == 0:
                use_noise.set_value(0.)
                print '------------- sampling from train ----------'
                x_s = x
                mask_s = mask
                ctx_s = ctx
                ctx_s_c = ctx_c
                ctx_mask_s = ctx_mask
                ctx_mask_s_c = ctx_mask_c
                model.sample_execute(engine, model_options, tparams,
                                          f_init, f_next, x_s, ctx_s, ctx_s_c, ctx_mask_s, ctx_mask_s_c, trng)
                print '------------- sampling from valid ----------'
                idx = engine.kf_valid[numpy.random.randint(1, len(engine.kf_valid) - 1)]
                tags = [engine.valid[index] for index in idx]
                x_s, mask_s, ctx_s, mask_ctx_s, ctx_s_c, mask_ctx_s_c = data_engine.prepare_data(engine, tags)
                model.sample_execute(engine, model_options, tparams,
                                          f_init, f_next, x_s, ctx_s, ctx_s_c, mask_ctx_s, mask_ctx_s_c, trng)

            if validFreq != -1 and numpy.mod(uidx, validFreq) == 0:
                t0_valid = time.time()
                alphas, alphas_c, _, _ = f_alpha(x, mask, ctx, ctx_c, ctx_mask, ctx_mask_c)
                ratio = alphas.min(-1).mean()/(alphas.max(-1)).mean()
                alphas_ratio.append(ratio)
                numpy.savetxt(save_model_dir+'alpha_ratio.txt',alphas_ratio)

                current_params = utils.unzip(tparams)
                numpy.savez(
                         save_model_dir+'model_current.npz',
                         history_errs=history_errs, **current_params)

                use_noise.set_value(0.)
                train_err = -1
                train_perp = -1
                valid_err = -1
                valid_perp = -1
                test_err = -1
                test_perp = -1
                if not debug:
                    # first compute train cost
                    if 0:
                        print 'computing cost on trainset'
                        train_err, train_perp = model.pred_probs(
                                engine, 'train', f_log_probs,
                                verbose=model_options['verbose'])
                    else:
                        train_err = 0.
                        train_perp = 0.
                    if 1:
                        print 'validating...'
                        valid_err, valid_perp = model.pred_probs(
                            engine, 'valid', f_log_probs,
                            verbose=model_options['verbose'],
                            )
                    else:
                        valid_err = 0.
                        valid_perp = 0.
                    if 1:
                        print 'testing...'
                        test_err, test_perp = model.pred_probs(
                            engine, 'test', f_log_probs,
                            verbose=model_options['verbose']
                            )
                    else:
                        test_err = 0.
                        test_perp = 0.

                mean_ranking = 0
                blue_t0 = time.time()
                scores, processes, queue, rqueue, shared_params = \
                    metrics.compute_score(
                    model_type='attention',
                    model_archive=current_params,
                    options=model_options,
                    engine=engine,
                    save_dir=save_model_dir,
                    beam=5, n_process=5,
                    whichset='both',
                    on_cpu=False,
                    processes=processes, queue=queue, rqueue=rqueue,
                    shared_params=shared_params, metric=metric,
                    one_time=False,
                    f_init=f_init, f_next=f_next, model=model
                    )
                '''
                 {'blue': {'test': [-1], 'valid': [77.7, 60.5, 48.7, 38.5, 38.3]},
                 'alternative_valid': {'Bleu_3': 0.40702270203174923,
                 'Bleu_4': 0.29276570520368456,
                 'CIDEr': 0.25247168210607884,
                 'Bleu_2': 0.529069629270047,
                 'Bleu_1': 0.6804308797115253,
                 'ROUGE_L': 0.51083584331688392},
                 'meteor': {'test': [-1], 'valid': [0.282787550236724]}}
                '''

                valid_B1 = scores['valid']['Bleu_1']
                valid_B2 = scores['valid']['Bleu_2']
                valid_B3 = scores['valid']['Bleu_3']
                valid_B4 = scores['valid']['Bleu_4']
                valid_Rouge = scores['valid']['ROUGE_L']
                valid_Cider = scores['valid']['CIDEr']
                valid_meteor = scores['valid']['METEOR']
                test_B1 = scores['test']['Bleu_1']
                test_B2 = scores['test']['Bleu_2']
                test_B3 = scores['test']['Bleu_3']
                test_B4 = scores['test']['Bleu_4']
                test_Rouge = scores['test']['ROUGE_L']
                test_Cider = scores['test']['CIDEr']
                test_meteor = scores['test']['METEOR']
                print 'computing meteor/blue score used %.4f sec, '\
                  'blue score: %.1f, meteor score: %.1f'%(
                time.time()-blue_t0, valid_B4, valid_meteor)
                history_errs.append([eidx, uidx, train_err, train_perp,
                                     valid_perp, test_perp,
                                     valid_err, test_err,
                                     valid_B1, valid_B2, valid_B3,
                                     valid_B4, valid_meteor, valid_Rouge, valid_Cider,
                                     test_B1, test_B2, test_B3,
                                     test_B4, test_meteor, test_Rouge, test_Cider])
                numpy.savetxt(save_model_dir+'train_valid_test.txt',
                              history_errs, fmt='%.3f')
                print 'save validation results to %s'%save_model_dir
                # save best model according to the best blue or meteor
                if len(history_errs) > 1 and \
                  valid_B4 > numpy.array(history_errs)[:-1,11].max():
                    print 'Saving to %s...'%save_model_dir,
                    numpy.savez(
                        save_model_dir+'model_best_blue_or_meteor.npz',
                        history_errs=history_errs, **best_p)
                if len(history_errs) > 1 and \
                  valid_err < numpy.array(history_errs)[:-1,6].min():
                    best_p = utils.unzip(tparams)
                    bad_counter = 0
                    best_valid_err = valid_err
                    uidx_best_valid_err = uidx

                    print 'Saving to %s...'%save_model_dir,
                    numpy.savez(
                        save_model_dir+'model_best_so_far.npz',
                        history_errs=history_errs, **best_p)
                    with open('%smodel_options.pkl'%save_model_dir, 'wb') as f:
                        pkl.dump(model_options, f)
                    print 'Done'
                elif len(history_errs) > 1 and \
                    valid_err >= numpy.array(history_errs)[:-1,6].min():
                    bad_counter += 1
                    print 'history best ',numpy.array(history_errs)[:,6].min()
                    print 'bad_counter ',bad_counter
                    print 'patience ',patience
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                if test_B4>0.52 and test_meteor>0.32:
                    print 'Saving to %s...'%save_model_dir,
                    numpy.savez(
                        save_model_dir+'model_'+str(uidx)+'.npz',
                        history_errs=history_errs, **current_params)

                print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err, \
                  'best valid err so far',best_valid_err
                #print 'valid took %.2f sec'%(time.time() - t0_valid)
                # end of validation
            if debug:
                break
        if estop:
            break
        if debug:
            break

        # end for loop over minibatches
        print 'This epoch has seen %d samples, train cost %.2f'%(
            n_samples, numpy.mean(train_costs))
    # end for loop over epochs
    print 'Optimization ended.'
    if best_p is not None:
        utils.zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid_err = 0
    test_err = 0
    if not debug:
        #if valid:
        valid_err, valid_perp = model.pred_probs(
            engine, 'valid', f_log_probs,
            verbose=model_options['verbose'])
        #if test:
        #test_err, test_perp = self.pred_probs(
        #    'test', f_log_probs,
        #    verbose=model_options['verbose'])


    print 'stopped at epoch %d, minibatch %d, '\
      'current Train %.2f, current Valid %.2f, current Test %.2f '%(
          eidx,uidx,numpy.mean(train_err),numpy.mean(valid_err),numpy.mean(test_err))
    params = copy.copy(best_p)
    numpy.savez(save_model_dir+'model_best.npz',
                train_err=train_err,
                valid_err=valid_err, test_err=test_err, history_errs=history_errs,
                **params)

    if history_errs != []:
        history = numpy.asarray(history_errs)
        best_valid_idx = history[:,6].argmin()
        numpy.savetxt(save_model_dir+'train_valid_test.txt', history, fmt='%.4f')
        print 'final best exp ', history[best_valid_idx]

    return train_err, valid_err, test_err
Example #41
0
def SIR(
    lambda_t_log,
    pr_I_begin=100,
    pr_median_mu=1 / 8,
    pr_sigma_mu=0.2,
    model=None,
    return_all=False,
    save_all=False,
):
    r"""
        Implements the susceptible-infected-recovered model.

        .. math::

            I_{new}(t) &= \lambda_t I(t-1)  \frac{S(t-1)}{N}   \\
            S(t) &= S(t-1) - I_{new}(t)  \\
            I(t) &= I(t-1) + I_{new}(t) - \mu  I(t-1)

        The prior distribution of the recovery rate :math:`\mu` is set to
        :math:`LogNormal(\log(\text{pr\_median\_mu}), \text{pr\_sigma\_mu})`, and the prior distribution of
        :math:`I(0)` to :math:`HalfCauchy(\text{pr\_I\_begin})`.

        Parameters
        ----------
        lambda_t_log : :class:`~theano.tensor.TensorVariable`
            time series of the logarithm of the spreading rate, 1- or 2-dimensional. If 2-dimensional,
            the first dimension is time.

        pr_I_begin : float or array_like or :class:`~theano.tensor.TensorVariable`
            Prior beta of the Half-Cauchy distribution of :math:`I(0)`. If a
            :class:`~theano.tensor.TensorVariable` is passed, it is used directly as :math:`I(0)`.

        pr_median_mu : float or array_like
            Prior for the median of the lognormal distribution of the recovery rate :math:`\mu`.

        pr_sigma_mu : float or array_like
            Prior for the sigma of the lognormal distribution of recovery rate :math:`\mu`.

        model : :class:`Cov19Model`
            if None, it is retrieved from the context

        return_all : bool
            if True, returns ``new_I_t``, ``I_t``, ``S_t``; otherwise returns only ``new_I_t``
        save_all : bool
            if True, saves ``new_I_t``, ``I_t``, ``S_t`` in the trace; otherwise it saves only ``new_I_t``

        Returns
        -------

        new_I_t : :class:`~theano.tensor.TensorVariable`
            time series of the number of daily newly infected persons.
        I_t : :class:`~theano.tensor.TensorVariable`
            time series of the infected (if return_all is set to True).
        S_t : :class:`~theano.tensor.TensorVariable`
            time series of the susceptible (if return_all is set to True).

    """
    model = modelcontext(model)

    # Build prior distributions:
    mu = pm.Lognormal(name="mu", mu=np.log(pr_median_mu), sigma=pr_sigma_mu)

    # Total number of people in population
    N = model.N_population

    # Number of regions as tuple of int
    num_regions = () if model.sim_ndim == 1 else model.sim_shape[1]

    # Prior distributions of starting populations (infectious, susceptibles)
    if isinstance(pr_I_begin, tt.TensorVariable):
        I_begin = pr_I_begin
    else:
        I_begin = pm.HalfCauchy(name="I_begin",
                                beta=pr_I_begin,
                                shape=num_regions)

    S_begin = N - I_begin

    lambda_t = tt.exp(lambda_t_log)
    new_I_0 = tt.zeros_like(I_begin)

    # Runs SIR model:
    def next_day(lambda_t, S_t, I_t, _, mu, N):
        new_I_t = lambda_t / N * I_t * S_t
        S_t = S_t - new_I_t
        I_t = I_t + new_I_t - mu * I_t
        I_t = tt.clip(I_t, -1, N)  # for stability
        S_t = tt.clip(S_t, 0, N)
        return S_t, I_t, new_I_t

    # theano scan returns two tuples, first one containing a time series of
    # what we give in outputs_info : S, I, new_I
    outputs, _ = theano.scan(
        fn=next_day,
        sequences=[lambda_t],
        outputs_info=[S_begin, I_begin, new_I_0],
        non_sequences=[mu, N],
    )
    S_t, I_t, new_I_t = outputs
    pm.Deterministic("new_I_t", new_I_t)
    if save_all:
        pm.Deterministic("S_t", S_t)
        pm.Deterministic("I_t", I_t)

    if return_all:
        return new_I_t, I_t, S_t
    else:
        return new_I_t
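
For comparison, a minimal NumPy version of the SIR recursion that ``next_day`` iterates above. The toy values for ``lambda_t``, ``mu``, ``N`` and ``I_begin`` are made up for illustration, and both compartments are clipped to ``[0, N]`` for simplicity.

import numpy as np

def sir_sketch(lambda_t, mu, N, I_begin):
    S_t, I_t = N - I_begin, I_begin
    new_I = []
    for lam in lambda_t:
        new_I_t = lam / N * I_t * S_t
        S_t = np.clip(S_t - new_I_t, 0, N)
        I_t = np.clip(I_t + new_I_t - mu * I_t, 0, N)
        new_I.append(new_I_t)
    return np.array(new_I)

# hypothetical toy values
print(sir_sketch(lambda_t=np.full(50, 0.4), mu=1 / 8, N=1e6, I_begin=100.0))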
Example #42
0
 def grad(self, inputs, output_grads):
     return [tensor.zeros_like(ii, dtype=theano.config.floatX) for ii in inputs]
Example #43
0
 def make_Q(i, j, tps, Q, reward, v):
     Q_template = T.zeros_like(Q)
     tp = transition_probabilities[i, j, :]
     return T.set_subtensor(Q_template[i, j],
                            tp.dot(reward + discount * v)), {}
Example #44
0
    def __init__(self,
                 voca_size,
                 hidden_size,
                 lstm_layers_num,
                 learning_rate=0.2):
        self.voca_size = voca_size
        self.hidden_size = hidden_size
        self.lstm_layers_num = lstm_layers_num
        self.learning_rate = learning_rate
        self._train = None
        self._utter = None
        self.params = []
        self.encoder_lstm_layers = []
        self.decoder_lstm_layers = []
        self.hos = []
        self.Cos = []

        encoderInputs, encoderMask = tensor.imatrices(2)
        decoderInputs, decoderMask, decoderTarget = tensor.imatrices(3)

        self.lookuptable = theano.shared(name="Encoder LookUpTable",
                                         value=utils.init_norm(
                                             self.voca_size, self.hidden_size),
                                         borrow=True)
        self.linear = theano.shared(name="Linear",
                                    value=utils.init_norm(
                                        self.hidden_size, self.voca_size),
                                    borrow=True)
        self.params += [self.lookuptable, self.linear]  #concatenate

        #(max_sent_size, batch_size, hidden_size)
        state_below = self.lookuptable[encoderInputs.flatten()].reshape(
            (encoderInputs.shape[0], encoderInputs.shape[1], self.hidden_size))
        for _ in range(self.lstm_layers_num):
            enclstm = LSTM(self.hidden_size)
            self.encoder_lstm_layers += enclstm,  #append
            self.params += enclstm.params  #concatenate
            hs, Cs = enclstm.forward(state_below, encoderMask)
            self.hos += hs[-1],
            self.Cos += Cs[-1],
            state_below = hs

        state_below = self.lookuptable[decoderInputs.flatten()].reshape(
            (decoderInputs.shape[0], decoderInputs.shape[1], self.hidden_size))
        for i in range(self.lstm_layers_num):
            declstm = LSTM(self.hidden_size)
            self.decoder_lstm_layers += declstm,  #append
            self.params += declstm.params  #concatenate
            ho, Co = self.hos[i], self.Cos[i]
            state_below, Cs = declstm.forward(state_below, decoderMask, ho, Co)
        decoder_lstm_outputs = state_below

        ei, em, di, dm, dt = tensor.imatrices(5)  #place holders
        #####################################################
        #####################################################
        linear_outputs = tensor.dot(decoder_lstm_outputs, self.linear)
        softmax_outputs, updates = theano.scan(
            fn=lambda x: tensor.nnet.softmax(x),
            sequences=[linear_outputs],
        )

        def _NLL(pred, y, m):
            return -m * tensor.log(pred[tensor.arange(decoderInputs.shape[1]),
                                        y])

        costs, updates = theano.scan(
            fn=_NLL, sequences=[softmax_outputs, decoderTarget, decoderMask])
        loss = costs.sum() / decoderMask.sum()

        gparams = [tensor.grad(loss, param) for param in self.params]
        updates = [(param, param - self.learning_rate * gparam)
                   for param, gparam in zip(self.params, gparams)]

        self._train = theano.function(inputs=[ei, em, di, dm, dt],
                                      outputs=[loss, costs],
                                      updates=updates,
                                      givens={
                                          encoderInputs: ei,
                                          encoderMask: em,
                                          decoderInputs: di,
                                          decoderMask: dm,
                                          decoderTarget: dt
                                      })
        #####################################################
        #####################################################
        hs0, Cs0 = tensor.as_tensor_variable(
            self.hos, name="hs0"), tensor.as_tensor_variable(self.Cos,
                                                             name="Cs0")
        token_idxs = tensor.fill(
            tensor.zeros_like(decoderInputs, dtype="int32"), utils.idx_start)
        msk = tensor.fill((tensor.zeros_like(decoderInputs, dtype="int32")), 1)

        def _step(token_idxs, hs_, Cs_):
            hs, Cs = [], []
            state_below = self.lookuptable[token_idxs].reshape(
                (decoderInputs.shape[0], decoderInputs.shape[1],
                 self.hidden_size))
            for i, lstm in enumerate(self.decoder_lstm_layers):
                h, C = lstm.forward(state_below, msk, hs_[i],
                                    Cs_[i])  #mind msk
                hs += h[-1],
                Cs += C[-1],
                state_below = h
            hs, Cs = tensor.as_tensor_variable(hs), tensor.as_tensor_variable(
                Cs)
            next_token_idx = tensor.cast(
                tensor.dot(state_below, self.linear).argmax(axis=-1), "int32")
            return next_token_idx, hs, Cs

        outputs, updates = theano.scan(fn=_step,
                                       outputs_info=[token_idxs, hs0, Cs0],
                                       n_steps=utils.max_sent_size)
        listof_token_idx = outputs[0]
        self._utter = theano.function(
            inputs=[ei, em, di],
            outputs=listof_token_idx,
            givens={
                encoderInputs: ei,
                encoderMask: em,
                decoderInputs: di
            }
            #givens={encoderInputs:ei, encoderMask:em}
        )
Example #45
0
    def __init__(self,
                 num_actions,
                 phi_length,
                 width,
                 height,
                 discount,
                 learning_rate,
                 decay,
                 momentum=0,
                 batch_size=32,
                 approximator='none'):
        self._batch_size = batch_size
        self._num_input_features = phi_length
        self._phi_length = phi_length
        self._img_width = width
        self._img_height = height
        self._discount = discount
        self.num_actions = num_actions
        self.learning_rate = learning_rate
        self.decay = decay
        self.momentum = momentum
        self.scale_input_by = 255.0

        # CONSTRUCT THE LAYERS
        self.q_layers = []
        self.q_layers.append(
            layers.Input2DLayer(self._batch_size, self._num_input_features,
                                self._img_height, self._img_width,
                                self.scale_input_by))

        if approximator == 'cuda_conv':
            self.q_layers.append(
                cc_layers.ShuffleBC01ToC01BLayer(self.q_layers[-1]))
            self.q_layers.append(
                cc_layers.CudaConvnetConv2DLayer(self.q_layers[-1],
                                                 n_filters=16,
                                                 filter_size=8,
                                                 stride=4,
                                                 weights_std=.01,
                                                 init_bias_value=0.1))
            self.q_layers.append(
                cc_layers.CudaConvnetConv2DLayer(self.q_layers[-1],
                                                 n_filters=32,
                                                 filter_size=4,
                                                 stride=2,
                                                 weights_std=.01,
                                                 init_bias_value=0.1))
            self.q_layers.append(
                cc_layers.ShuffleC01BToBC01Layer(self.q_layers[-1]))

        elif approximator == 'conv':
            self.q_layers.append(
                layers.StridedConv2DLayer(self.q_layers[-1],
                                          n_filters=16,
                                          filter_width=8,
                                          filter_height=8,
                                          stride_x=4,
                                          stride_y=4,
                                          weights_std=.01,
                                          init_bias_value=0.01))

            self.q_layers.append(
                layers.StridedConv2DLayer(self.q_layers[-1],
                                          n_filters=32,
                                          filter_width=4,
                                          filter_height=4,
                                          stride_x=2,
                                          stride_y=2,
                                          weights_std=.01,
                                          init_bias_value=0.01))
        if approximator == 'cuda_conv' or approximator == 'conv':

            self.q_layers.append(
                layers.DenseLayer(self.q_layers[-1],
                                  n_outputs=256,
                                  weights_std=0.01,
                                  init_bias_value=0.1,
                                  dropout=0,
                                  nonlinearity=layers.rectify))

            self.q_layers.append(
                layers.DenseLayer(self.q_layers[-1],
                                  n_outputs=num_actions,
                                  weights_std=0.01,
                                  init_bias_value=0.1,
                                  dropout=0,
                                  nonlinearity=layers.identity))

        if approximator == 'none':
            self.q_layers.append(\
                layers.DenseLayerNoBias(self.q_layers[-1],
                                        n_outputs=num_actions,
                                        weights_std=0.00,
                                        dropout=0,
                                        nonlinearity=layers.identity))

        self.q_layers.append(layers.OutputLayer(self.q_layers[-1]))

        for i in range(len(self.q_layers) - 1):
            print self.q_layers[i].get_output_shape()

        # Now create a network (using the same weights)
        # for next state q values
        self.next_layers = copy_layers(self.q_layers)
        self.next_layers[0] = layers.Input2DLayer(self._batch_size,
                                                  self._num_input_features,
                                                  self._img_height,
                                                  self._img_width,
                                                  self.scale_input_by)
        self.next_layers[1].input_layer = self.next_layers[0]

        self.rewards = T.col()
        self.actions = T.icol()

        # Build the loss function ...
        q_vals = self.q_layers[-1].predictions()
        next_q_vals = self.next_layers[-1].predictions()
        next_maxes = T.max(next_q_vals, axis=1, keepdims=True)
        target = self.rewards + discount * next_maxes
        target = theano.gradient.consider_constant(target)
        diff = target - q_vals
        # Zero out all entries for actions that were not chosen...
        mask = build_mask(T.zeros_like(diff), self.actions, 1.0)
        diff_masked = diff * mask
        error = T.mean(diff_masked**2)
        self._loss = error * diff_masked.shape[1]  # undo the mean over actions: batch-mean of the chosen action's squared TD error

        self._parameters = layers.all_parameters(self.q_layers[-1])

        self._idx = T.lscalar('idx')

        # CREATE VARIABLES FOR INPUT AND OUTPUT
        self.states_shared = theano.shared(
            np.zeros((1, 1, 1, 1), dtype=theano.config.floatX))
        self.states_shared_next = theano.shared(
            np.zeros((1, 1, 1, 1), dtype=theano.config.floatX))
        self.rewards_shared = theano.shared(np.zeros(
            (1, 1), dtype=theano.config.floatX),
                                            broadcastable=(False, True))
        self.actions_shared = theano.shared(np.zeros((1, 1), dtype='int32'),
                                            broadcastable=(False, True))

        self._givens = \
            {self.q_layers[0].input_var:
             self.states_shared[self._idx*self._batch_size:
                                (self._idx+1)*self._batch_size, :, :, :],
             self.next_layers[0].input_var:
             self.states_shared_next[self._idx*self._batch_size:
                                     (self._idx+1)*self._batch_size, :, :, :],

             self.rewards:
             self.rewards_shared[self._idx*self._batch_size:
                                 (self._idx+1)*self._batch_size, :],
             self.actions:
             self.actions_shared[self._idx*self._batch_size:
                                 (self._idx+1)*self._batch_size, :]
             }

        if self.momentum != 0:
            self._updates = layers.gen_updates_rmsprop_and_nesterov_momentum(\
                self._loss, self._parameters, learning_rate=self.learning_rate,
                rho=self.decay, momentum=self.momentum, epsilon=1e-6)
        else:
            self._updates = layers.gen_updates_rmsprop(
                self._loss,
                self._parameters,
                learning_rate=self.learning_rate,
                rho=self.decay,
                epsilon=1e-6)

        self._train = theano.function([self._idx],
                                      self._loss,
                                      givens=self._givens,
                                      updates=self._updates)
        self._compute_loss = theano.function([self._idx],
                                             self._loss,
                                             givens=self._givens)
        self._compute_q_vals = \
            theano.function([self.q_layers[0].input_var],
                            self.q_layers[-1].predictions(),
                            on_unused_input='ignore')
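The loss built above is one-step Q-learning: the bootstrapped target r + discount * max_a' Q(s', a') is frozen with consider_constant, the TD error is computed for every action, and the mask keeps only the action that was actually taken. A rough NumPy sketch of that masked squared error, with made-up batch values (build_mask itself is not reproduced here):

import numpy as np

discount = 0.95
q_vals      = np.array([[1.0, 2.0, 0.5],      # Q(s, .) for a batch of 2
                        [0.2, 0.1, 0.3]])
next_q_vals = np.array([[1.5, 0.0, 0.4],
                        [0.6, 0.9, 0.2]])
rewards = np.array([[1.0], [0.0]])
actions = np.array([[2], [0]])                # chosen action per sample

target = rewards + discount * next_q_vals.max(axis=1, keepdims=True)
diff = target - q_vals                        # TD error for every action

mask = np.zeros_like(diff)                    # keep only the chosen action
mask[np.arange(len(actions)), actions[:, 0]] = 1.0
diff_masked = diff * mask

loss = (diff_masked ** 2).mean() * diff.shape[1]
print(loss)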
Example #46
def shift_right(x):
    return TT.concatenate([TT.shape_padleft(TT.zeros_like(x[0])), x[:-1]])
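A quick way to see what shift_right does, assuming TT is theano.tensor as in the snippet: it prepends a row of zeros and drops the last row, so at every timestep the network sees the previous element instead of the current one. Illustrative check:

import numpy as np
import theano
import theano.tensor as TT

x = TT.matrix('x')
shifted = TT.concatenate([TT.shape_padleft(TT.zeros_like(x[0])), x[:-1]])
f = theano.function([x], shifted)

print(f(np.arange(6).reshape(3, 2).astype(theano.config.floatX)))
# [[0. 0.]
#  [0. 1.]
#  [2. 3.]]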
Example #47
def build_model(tparams, options):
    opt_ret = dict()

    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # description string: #words x #samples
    x = tensor.matrix('x', dtype='int64')
    x_mask = tensor.matrix('x_mask', dtype='float32')
    y = tensor.matrix('y', dtype='int64')
    y_mask = tensor.matrix('y_mask', dtype='float32')

    # for the backward rnn, we just need to invert x and x_mask
    xr = x[::-1]
    xr_mask = x_mask[::-1]

    n_timesteps = x.shape[0]
    n_timesteps_trg = y.shape[0]
    n_samples = x.shape[1]

    # word embedding for forward rnn (source)
    emb = tparams['Wemb'][x.flatten()]
    emb = emb.reshape([n_timesteps, n_samples, options['dim_word']])
    proj = get_layer(options['encoder'])[1](tparams,
                                            emb,
                                            options,
                                            prefix='encoder',
                                            mask=x_mask)
    # word embedding for backward rnn (source)
    embr = tparams['Wemb'][xr.flatten()]
    embr = embr.reshape([n_timesteps, n_samples, options['dim_word']])
    projr = get_layer(options['encoder'])[1](tparams,
                                             embr,
                                             options,
                                             prefix='encoder_r',
                                             mask=xr_mask)

    # context will be the concatenation of forward and backward rnns
    ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim - 1)

    # mean of the context (across time) will be used to initialize decoder rnn
    ctx_mean = (ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None]

    # or you can use the last state of forward + backward encoder rnns
    # ctx_mean = concatenate([proj[0][-1], projr[0][-1]], axis=proj[0].ndim-2)

    # initial decoder state
    init_state = get_layer('ff')[1](tparams,
                                    ctx_mean,
                                    options,
                                    prefix='ff_state',
                                    activ='tanh')

    # word embedding (target), we will shift the target sequence one time step
    # to the right. This is done because of the bi-gram connections in the
    # readout and decoder rnn. The first target will be all zeros and we will
    # not condition on the last output.
    emb = tparams['Wemb_dec'][y.flatten()]
    emb = emb.reshape([n_timesteps_trg, n_samples, options['dim_word']])
    emb_shifted = tensor.zeros_like(emb)
    emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1])
    emb = emb_shifted

    # decoder - pass through the decoder conditional gru with attention
    proj = get_layer(options['decoder'])[1](tparams,
                                            emb,
                                            options,
                                            prefix='decoder',
                                            mask=y_mask,
                                            context=ctx,
                                            context_mask=x_mask,
                                            one_step=False,
                                            init_state=init_state)
    # hidden states of the decoder gru
    proj_h = proj[0]

    # weighted averages of context, generated by attention module
    ctxs = proj[1]

    # weights (alignment matrix)
    opt_ret['dec_alphas'] = proj[2]

    # compute word probabilities
    logit_lstm = get_layer('ff')[1](tparams,
                                    proj_h,
                                    options,
                                    prefix='ff_logit_lstm',
                                    activ='linear')
    logit_prev = get_layer('ff')[1](tparams,
                                    emb,
                                    options,
                                    prefix='ff_logit_prev',
                                    activ='linear')
    logit_ctx = get_layer('ff')[1](tparams,
                                   ctxs,
                                   options,
                                   prefix='ff_logit_ctx',
                                   activ='linear')
    logit = tensor.tanh(logit_lstm + logit_prev + logit_ctx)
    if options['use_dropout']:
        logit = dropout_layer(logit, use_noise, trng)
    logit = get_layer('ff')[1](tparams,
                               logit,
                               options,
                               prefix='ff_logit',
                               activ='linear')
    logit_shp = logit.shape
    probs = tensor.nnet.softmax(
        logit.reshape([logit_shp[0] * logit_shp[1], logit_shp[2]]))

    # cost
    y_flat = y.flatten()
    y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words'] + y_flat
    cost = -tensor.log(probs.flatten()[y_flat_idx])
    cost = cost.reshape([y.shape[0], y.shape[1]])
    cost = (cost * y_mask).sum(0)

    return trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost
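The decoder's initial state above comes from a masked mean of the encoder context over time, so padded positions do not dilute the average. A hedged NumPy sketch of that masked mean, with invented toy shapes:

import numpy as np

n_timesteps, n_samples, ctx_dim = 4, 2, 3
ctx = np.random.rand(n_timesteps, n_samples, ctx_dim)
x_mask = np.array([[1., 1.],
                   [1., 1.],
                   [1., 0.],          # second sentence is shorter
                   [0., 0.]])

# sum over real timesteps only, then divide by each sentence's length
ctx_mean = (ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None]
print(ctx_mean.shape)                 # (n_samples, ctx_dim)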
Example #48
    def __init__(self, babi_train_raw, babi_test_raw, word2vec,
                 word_vector_size, memory_hops, dim, mode, input_mask_mode, l2,
                 batch_norm, dropout, **kwargs):

        print "==> not used params in DMN class:", kwargs.keys()
        self.vocab = {}
        self.ivocab = {}

        self.word2vec = word2vec
        self.word_vector_size = word_vector_size
        self.dim = dim
        self.mode = mode
        self.input_mask_mode = input_mask_mode
        self.l2 = l2
        self.batch_norm = batch_norm
        self.dropout = dropout
        self.memory_hops = memory_hops

        self.train_input, self.train_q, self.train_answer, self.train_input_mask, self.train_gates = self._process_input(
            babi_train_raw)
        self.test_input, self.test_q, self.test_answer, self.test_input_mask, self.test_gates = self._process_input(
            babi_test_raw)
        self.vocab_size = len(self.vocab)

        print "Train size: ", len(self.train_input)
        print "Test size: ", len(self.test_input)
        print "Vocab size: ", self.vocab_size

        self.input_var = T.matrix('input_var')
        self.q_var = T.matrix('question_var')
        self.answer_var = T.iscalar('answer_var')
        self.input_mask_var = T.ivector('input_mask_var')
        self.gates_var = T.ivector(
            'gates_var')  # attention gate (including end_reading)

        self.attentions = []

        print "==> building input module"
        self.W_inp_res_in = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.word_vector_size))
        self.W_inp_res_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_inp_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_inp_upd_in = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.word_vector_size))
        self.W_inp_upd_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_inp_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_inp_hid_in = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.word_vector_size))
        self.W_inp_hid_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_inp_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        inp_c_history, _ = theano.scan(fn=self.input_gru_step,
                                       sequences=self.input_var,
                                       outputs_info=T.zeros_like(
                                           self.b_inp_hid))
        self.end_reading = nn_utils.constant_param(value=0.0,
                                                   shape=(1, self.dim))
        inp_c_tag = T.concatenate([inp_c_history, self.end_reading], axis=0)

        self.inp_c = inp_c_tag.take(self.input_mask_var,
                                    axis=0)  #(facts_len,dim)

        self.q_q, _ = theano.scan(fn=self.input_gru_step,
                                  sequences=self.q_var,
                                  outputs_info=T.zeros_like(self.b_inp_hid))
        self.q_q = self.q_q[-1]  #(1,dim)

        print "==> creating parameters for memory module"
        self.W_mem_res_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_res_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_mem_upd_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_upd_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_mem_hid_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_hid_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        self.W_1 = nn_utils.normal_param(std=0.1,
                                         shape=(self.dim, 7 * self.dim + 2))
        self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim))
        self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
        self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, ))

        print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops
        memory = [self.q_q.copy()]
        for iter in range(0, self.memory_hops):
            current_episode, g = self.new_episode(memory[iter])
            self.attentions.append(g)
            memory.append(
                self.GRU_update(memory[iter], current_episode,
                                self.W_mem_res_in, self.W_mem_res_hid,
                                self.b_mem_res, self.W_mem_upd_in,
                                self.W_mem_upd_hid, self.b_mem_upd,
                                self.W_mem_hid_in, self.W_mem_hid_hid,
                                self.b_mem_hid))

        last_mem_raw = memory[-1].dimshuffle(('x', 0))

        net = layers.InputLayer(shape=(1, self.dim), input_var=last_mem_raw)
        if self.dropout > 0 and self.mode == 'train':
            net = layers.DropoutLayer(net, p=self.dropout)
        last_mem = layers.get_output(net)[0]

        self.attentions = T.stack(self.attentions)  #(memory_hops, fact_cnt)

        print "==> building answer module"
        self.W_a = nn_utils.normal_param(std=0.1,
                                         shape=(self.vocab_size, self.dim))
        self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem))

        print "==> collecting all parameters"
        self.params = [
            self.W_inp_res_in, self.W_inp_res_hid, self.b_inp_res,
            self.W_inp_upd_in, self.W_inp_upd_hid, self.b_inp_upd,
            self.W_inp_hid_in, self.W_inp_hid_hid, self.b_inp_hid,
            self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res,
            self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd,
            self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, self.W_b,
            self.W_1, self.W_2, self.b_1, self.b_2, self.W_a
        ]

        print "==> building loss layer and computing updates"
        self.loss_ce = T.nnet.categorical_crossentropy(
            self.prediction.dimshuffle('x', 0), T.stack([self.answer_var]))[0]

        if self.l2 > 0:
            self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params)
        else:
            self.loss_l2 = 0

        self.loss_gate = T.nnet.categorical_crossentropy(
            self.attentions, self.gates_var).mean()

        self.loss = self.loss_ce + self.loss_l2 + self.loss_gate

        updates = lasagne.updates.adam(self.loss, self.params)
        #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.0003)

        if self.mode == 'train':
            print "==> compiling train_fn"
            self.train_fn = theano.function(
                inputs=[
                    self.input_var, self.q_var, self.answer_var,
                    self.input_mask_var, self.gates_var
                ],
                allow_input_downcast=True,
                outputs=[self.prediction, self.loss, self.attentions],
                updates=updates)

        print "==> compiling test_fn"
        self.test_fn = theano.function(
            inputs=[
                self.input_var, self.q_var, self.answer_var,
                self.input_mask_var, self.gates_var
            ],
            allow_input_downcast=True,
            outputs=[self.prediction, self.loss, self.attentions])

        if self.mode == 'train':
            print "==> computing gradients (for debugging)"
            gradient = T.grad(self.loss, self.params)
            self.get_gradient_fn = theano.function(inputs=[
                self.input_var, self.q_var, self.answer_var,
                self.input_mask_var, self.gates_var
            ],
                                                   allow_input_downcast=True,
                                                   outputs=gradient)
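Each memory hop above feeds the attended episode into a GRU-style update of the memory, starting from the question encoding, for a fixed number of hops. A stripped-down NumPy sketch of that outer loop; gru_update below is an invented stand-in for GRU_update, included only to show the data flow:

import numpy as np

dim, memory_hops = 4, 3
rng = np.random.RandomState(0)

def gru_update(mem, episode):
    # invented stand-in: a gated convex mix instead of a full GRU cell
    z = 1.0 / (1.0 + np.exp(-(mem + episode)))
    return (1.0 - z) * mem + z * np.tanh(episode)

q = rng.randn(dim)                        # plays the role of q_q
episodes = rng.randn(memory_hops, dim)    # one attended episode per hop

memory = [q.copy()]
for hop in range(memory_hops):
    memory.append(gru_update(memory[hop], episodes[hop]))
print(memory[-1])                         # final memory fed to the answer module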
Example #49
def get_elementwise_objective(
    policy,
    actions,
    rewards,
    is_alive="always",
    baseline="zeros",
    gamma_or_gammas=0.99,
    crop_last=True,
    treat_policy_as_logpolicy=False,
):
    """
    Compute and return the elementwise policy gradient loss, evaluated as

        L_policy = - log(policy) * (V_reference - baseline)
        L_V = (V - Vreference)^2


    :param policy: [batch,tick,action_id] - predicted action probabilities
        either for all actions, shape [batch,tick,action]
        or for chosen actions, shape [batch,tick]

    :param actions: [batch,tick] - committed actions
    :param rewards: [batch,tick] - immediate rewards for taking actions at given time ticks
    :param is_alive: [batch,tick] - binary matrix whether given session is active at given tick. Defaults to all ones.
    :param baseline: [batch,tick] - REINFORCE  baselines tensor for each batch/tick. Uses no baseline by default.
    :param gamma_or_gammas: a single value or array[batch,tick](can broadcast dimensions) of delayed reward discounts
    :param crop_last: if True, zeros-out loss at final tick
    :param treat_policy_as_logpolicy: if True, policy is used as log(pi(a|s)). You may want to do this for numerical stability reasons.
    :return: elementwise policy gradient loss [batch,tick]

    """

    if is_alive == "always":
        is_alive = T.ones_like(actions, dtype=theano.config.floatX)
    if baseline == "zeros":
        baseline = T.zeros_like(rewards, dtype=theano.config.floatX)
    # check dimensions
    assert policy.ndim in (2,3),"policy must have shape either [batch,tick,action], for all actions," \
                                " or [batch,tick], for chosen actions"

    assert actions.ndim == rewards.ndim == is_alive.ndim == 2, "actions, rewards and is_alive must have shape [batch,time]"

    #logprobas for all actions
    logpolicy = T.log(policy) if not treat_policy_as_logpolicy else policy

    #logprobas for actions taken
    given_action_probas = (logpolicy.ndim == 2)
    action_logprobas = logpolicy if given_action_probas else get_values_for_actions(
        logpolicy, actions)

    #estimate n-step advantage. Note that we use current state values here (and not e.g. state_values_target)
    observed_state_values = get_n_step_value_reference(
        state_values=T.zeros_like(rewards, dtype=theano.config.floatX),
        rewards=rewards,
        is_alive=is_alive,
        n_steps=None,
        gamma_or_gammas=gamma_or_gammas,
        end_at_tmax=True,
        crop_last=crop_last,
    )

    advantage = consider_constant(observed_state_values - baseline)

    loss_elwise = -action_logprobas * advantage * is_alive

    return loss_elwise
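The returned quantity is the classic REINFORCE term: minus the log-probability of the action actually taken, weighted by the advantage (discounted return minus baseline) treated as a constant, and zeroed on dead ticks. A toy NumPy sketch of the same elementwise loss; the simple return-to-go loop below is a crude stand-in for get_n_step_value_reference, and all values are invented:

import numpy as np

gamma = 0.99
rewards    = np.array([[1., 0., 2.],
                       [0., 1., 0.]])           # [batch, tick]
is_alive   = np.array([[1., 1., 1.],
                       [1., 1., 0.]])
logp_taken = np.log(np.array([[0.5, 0.3, 0.9],
                              [0.2, 0.6, 0.5]]))
baseline   = np.zeros_like(rewards)

returns = np.zeros_like(rewards)                # discounted return-to-go
acc = np.zeros(rewards.shape[0])
for t in reversed(range(rewards.shape[1])):
    acc = rewards[:, t] + gamma * acc
    returns[:, t] = acc

advantage = returns - baseline                  # held constant in the graph
loss_elwise = -logp_taken * advantage * is_alive
print(loss_elwise)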
Example #50
    def build(self):
        # Source sentences: n_timesteps, n_samples
        x = tensor.matrix('x', dtype=INT)
        x_mask = tensor.matrix('x_mask', dtype=FLOAT)

        # Image: 196 (n_annotations) x n_samples x 1024 (conv_dim)
        x_img = tensor.tensor3('x_img', dtype=FLOAT)

        # Target sentences: n_timesteps, n_samples
        y = tensor.matrix('y', dtype=INT)
        y_mask = tensor.matrix('y_mask', dtype=FLOAT)

        # Reverse stuff
        xr = x[::-1]
        xr_mask = x_mask[::-1]

        # Some shorthands for dimensions
        n_samples = x.shape[1]
        n_timesteps = x.shape[0]
        n_timesteps_trg = y.shape[0]

        # Store tensors
        self.inputs = OrderedDict()
        self.inputs['x'] = x  # Source words
        self.inputs['x_mask'] = x_mask  # Source mask
        self.inputs['x_img'] = x_img  # Image features
        self.inputs['y'] = y  # Target labels
        self.inputs['y_mask'] = y_mask  # Target mask

        ###################
        # Source embeddings
        ###################
        # word embedding for forward rnn (source)
        emb = dropout(self.tparams['Wemb_enc'][x.flatten()], self.trng,
                      self.emb_dropout, self.use_dropout)
        emb = emb.reshape([n_timesteps, n_samples, self.embedding_dim])
        forw = get_new_layer('gru')[1](self.tparams,
                                       emb,
                                       prefix='text_encoder',
                                       mask=x_mask,
                                       layernorm=self.lnorm)

        # word embedding for backward rnn (source)
        embr = dropout(self.tparams['Wemb_enc'][xr.flatten()], self.trng,
                       self.emb_dropout, self.use_dropout)
        embr = embr.reshape([n_timesteps, n_samples, self.embedding_dim])
        back = get_new_layer('gru')[1](self.tparams,
                                       embr,
                                       prefix='text_encoder_r',
                                       mask=xr_mask,
                                       layernorm=self.lnorm)

        # Source context will be the concatenation of forward and backward rnns
        # leading to a vector of 2*rnn_dim for each timestep
        text_ctx = tensor.concatenate([forw[0], back[0][::-1]],
                                      axis=forw[0].ndim - 1)
        # -> n_timesteps x n_samples x 2*rnn_dim

        # Apply dropout
        text_ctx = dropout(text_ctx, self.trng, self.ctx_dropout,
                           self.use_dropout)

        if self.init_cgru == 'text':
            # mean of the context (across time) will be used to initialize decoder rnn
            text_ctx_mean = (
                text_ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None]
            # -> n_samples x ctx_dim (2*rnn_dim)

            # initial decoder state computed from source context mean
            init_state = get_new_layer('ff')[1](self.tparams,
                                                text_ctx_mean,
                                                prefix='ff_text_state_init',
                                                activ='tanh')
            # -> n_samples x rnn_dim (last dim shrinked down by this FF to rnn_dim)
        elif self.init_cgru == 'img':
            # Reduce to nb_samples x conv_dim and transform
            init_state = get_new_layer('ff')[1](self.tparams,
                                                x_img.mean(axis=0),
                                                prefix='ff_img_state_init',
                                                activ='tanh')
        elif self.init_cgru == 'textimg':
            # n_samples x conv_dim
            img_ctx_mean = x_img.mean(axis=0)
            # n_samples x ctx_dim
            text_ctx_mean = (
                text_ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None]
            # n_samples x (conv_dim + ctx_dim)
            mmodal_ctx = tensor.concatenate([img_ctx_mean, text_ctx_mean],
                                            axis=-1)
            init_state = get_new_layer('ff')[1](self.tparams,
                                                mmodal_ctx,
                                                prefix='ff_textimg_state_init',
                                                activ='tanh')
        else:
            init_state = tensor.alloc(0., n_samples, self.rnn_dim)

        #######################
        # Source image features
        #######################

        # Project image features to ctx_dim
        img_ctx = get_new_layer('ff')[1](self.tparams,
                                         x_img,
                                         prefix='ff_img_adaptor',
                                         activ='linear')
        # -> 196 x n_samples x ctx_dim

        ####################
        # Target embeddings
        ####################

        # Fetch target embeddings. Result is: (n_trg_timesteps x n_samples x embedding_dim)
        emb_trg = self.tparams['Wemb_dec'][y.flatten()]
        emb_trg = emb_trg.reshape(
            [n_timesteps_trg, n_samples, self.embedding_dim])

        # Shift it to right to leave place for the <bos> placeholder
        # We ignore the last word <eos> as we don't condition on it at the end
        # to produce another word
        emb_trg_shifted = tensor.zeros_like(emb_trg)
        emb_trg_shifted = tensor.set_subtensor(emb_trg_shifted[1:],
                                               emb_trg[:-1])
        emb_trg = emb_trg_shifted

        ##########
        # GRU Cond
        ##########
        # decoder - pass through the decoder conditional gru with attention
        dec_mult = self.gru_decoder(self.tparams,
                                    emb_trg,
                                    prefix='decoder_multi',
                                    input_mask=y_mask,
                                    ctx1=text_ctx,
                                    ctx1_mask=x_mask,
                                    ctx2=img_ctx,
                                    one_step=False,
                                    init_state=init_state)

        # gru_cond returns hidden state, weighted sum of context vectors and attentional weights.
        h = dec_mult[0]  # (n_timesteps_trg, batch_size, rnn_dim)
        sumctx = dec_mult[
            1]  # (n_timesteps_trg, batch_size, ctx*.shape[-1] (2000, 2*rnn_dim))
        # weights (alignment matrix)
        self.alphas = list(dec_mult[2:])

        # 3-way merge
        logit_gru = get_new_layer('ff')[1](self.tparams,
                                           h,
                                           prefix='ff_logit_gru',
                                           activ='linear')
        logit_ctx = get_new_layer('ff')[1](self.tparams,
                                           sumctx,
                                           prefix='ff_logit_ctx',
                                           activ='linear')

        # Dropout
        logit = dropout(tanh(logit_gru + emb_trg + logit_ctx), self.trng,
                        self.out_dropout, self.use_dropout)

        if self.tied_trg_emb is False:
            logit = get_new_layer('ff')[1](self.tparams,
                                           logit,
                                           prefix='ff_logit',
                                           activ='linear')
        else:
            logit = tensor.dot(logit, self.tparams['Wemb_dec'].T)

        logit_shp = logit.shape

        # Apply logsoftmax (stable version)
        log_probs = -tensor.nnet.logsoftmax(
            logit.reshape([logit_shp[0] * logit_shp[1], logit_shp[2]]))

        # cost
        y_flat = y.flatten()
        y_flat_idx = tensor.arange(y_flat.shape[0]) * self.n_words_trg + y_flat

        cost = log_probs.flatten()[y_flat_idx]
        cost = cost.reshape([n_timesteps_trg, n_samples])
        cost = (cost * y_mask).sum(0)

        self.f_log_probs = theano.function(list(self.inputs.values()), cost)

        return cost
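When tied_trg_emb is set above, the output projection reuses the transpose of the target embedding matrix Wemb_dec instead of a separate ff_logit layer, which ties input and output representations and removes one large weight matrix. A tiny NumPy illustration of the shape bookkeeping, with invented dimensions:

import numpy as np

n_words_trg, embedding_dim = 10, 4
Wemb_dec = np.random.randn(n_words_trg, embedding_dim)

logit = np.random.randn(7, 2, embedding_dim)    # (tsteps, batch, emb_dim)
scores = np.dot(logit, Wemb_dec.T)              # (tsteps, batch, n_words_trg)
print(scores.shape)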
Example #51
    def test_grad_h(self):

        "tests that the gradients with respect to h_i are 0 after doing a mean field update of h_i "

        model = self.model
        e_step = self.e_step
        X = self.X

        assert X.shape[0] == self.m

        init_H = e_step.init_H_hat(V = X)
        init_Mu1 = e_step.init_S_hat(V = X)

        prev_setting = config.compute_test_value
        config.compute_test_value= 'off'
        H, Mu1 = function([], outputs=[init_H, init_Mu1])()
        config.compute_test_value = prev_setting

        H = broadcast(H, self.m)
        Mu1 = broadcast(Mu1, self.m)

        H = np.cast[config.floatX](self.model.rng.uniform(0.,1.,H.shape))
        Mu1 = np.cast[config.floatX](self.model.rng.uniform(-5.,5.,Mu1.shape))


        H_var = T.matrix(name='H_var')
        H_var.tag.test_value = H
        Mu1_var = T.matrix(name='Mu1_var')
        Mu1_var.tag.test_value = Mu1
        idx = T.iscalar()
        idx.tag.test_value = 0


        new_H = e_step.infer_H_hat(V = X, H_hat = H_var, S_hat = Mu1_var)
        h_idx = new_H[:,idx]

        updates_func = function([H_var,Mu1_var,idx], h_idx)

        sigma0 = 1. / model.alpha
        Sigma1 = e_step.infer_var_s1_hat()
        mu0 = T.zeros_like(model.mu)

        #by truncated KL, I mean that I am dropping terms that don't depend on H and Mu1
        # (they don't affect the outcome of this test and some of them are intractable )
        trunc_kl = - model.entropy_hs(H_hat = H_var, var_s0_hat = sigma0, var_s1_hat = Sigma1) + \
                     model.expected_energy_vhs(V = X, H_hat = H_var, S_hat = Mu1_var,  var_s0_hat = sigma0,
                             var_s1_hat = Sigma1)

        grad_H = T.grad(trunc_kl.sum(), H_var)

        assert len(grad_H.type.broadcastable) == 2

        #from theano.printing import min_informative_str
        #print min_informative_str(grad_H)

        #grad_H = Print('grad_H')(grad_H)

        #grad_H_idx = grad_H[:,idx]

        grad_func = function([H_var, Mu1_var], grad_H)

        failed = False

        for i in xrange(self.N):
            rval = updates_func(H, Mu1, i)
            H[:,i] = rval

            g = grad_func(H,Mu1)[:,i]

            assert not np.any(np.isnan(g))

            g_abs_max = np.abs(g).max()

            if g_abs_max > self.tol:
                #print "new values of H"
                #print H[:,i]
                #print "gradient on new values of H"
                #print g

                failed = True

                print 'iteration ',i
                #print 'max value of new H: ',H[:,i].max()
                #print 'H for failing g: '
                failing_h = H[np.abs(g) > self.tol, i]
                #print failing_h

                #from matplotlib import pyplot as plt
                #plt.scatter(H[:,i],g)
                #plt.show()

                #ignore failures extremely close to h=1

                high_mask = failing_h > .001
                low_mask = failing_h < .999

                mask = high_mask * low_mask

                print 'masked failures: ',mask.shape[0],' err ',g_abs_max

                if mask.sum() > 0:
                    print 'failing h passing the range mask'
                    print failing_h[ mask.astype(bool) ]
                    raise Exception('after mean field step, gradient of kl divergence'
                            ' wrt freshly updated variational parameter should be 0, '
                            'but here the max magnitude of a gradient element is '
                            +str(g_abs_max)+' after updating h_'+str(i))
Example #52
def edge_potn(pdf, copula, theta, edges, Y=None, shared_copula=False):
    '''
    Edge potentials: couple the per-node class CDFs (built from `pdf`) with a
    copula parameterised by `theta` for every pair of nodes in `edges`, and
    return the negative log of the resulting joint probabilities.
    '''
    cdf = TT.extra_ops.cumsum(pdf,axis=2)
    cdf = TT.concatenate((TT.zeros_like(cdf[:,:,[0]]),cdf),axis=2)

    def comp_jpdf(cdf, d, y=None):
        '''
        cdf : list of cdfs              [cdf_1, cdf_2]
        y : list of vecotr of labels    [y_1, y_1]
        '''
        idx = TT.arange(cdf.shape[1])
        if y:
            u_0 = cdf[0,idx,y[0]]
            u_1 = cdf[0,idx,y[0]+1]
            v_0 = cdf[1,idx,y[1]]
            v_1 = cdf[1,idx,y[1]+1]

            if shared_copula:
                pass
            else:
                d = d[y[0],y[1]]

            P =  copula(u_0,v_0,d)
            P -= copula(u_0,v_1,d)
            P -= copula(u_1,v_0,d)
            P += copula(u_1,v_1,d)
        else:

            cdf_0 = TT.extra_ops.repeat(cdf[0].dimshuffle(0,1,'x'),cdf[1].shape[1],2)
            cdf_1 = TT.extra_ops.repeat(cdf[1].dimshuffle(0,'x',1),cdf[0].shape[1],1)

            if shared_copula:
                j_cdf = copula(cdf_0,cdf_1,d)
                P = j_cdf[:,1:,1:] + j_cdf[:,:-1,:-1] - j_cdf[:,:-1,1:] - j_cdf[:,1:,:-1]
            else:
                u11 = cdf_0[:,1:,1:]
                u01 = cdf_0[:,:-1,1:]
                u10 = cdf_0[:,1:,:-1]
                u00 = cdf_0[:,:-1,:-1]

                v11 = cdf_1[:,1:,1:]
                v01 = cdf_1[:,:-1,1:]
                v10 = cdf_1[:,1:,:-1]
                v00 = cdf_1[:,:-1,:-1]

                d = d.dimshuffle('x',0,1)
                d = TT.extra_ops.repeat(d,cdf.shape[1],0)

                uv_11 = copula(u11,v11,d)
                uv_00 = copula(u00,v00,d)
                uv_01 = copula(u01,v01,d)
                uv_10 = copula(u10,v10,d)

                P = uv_00+uv_11-uv_10-uv_01
        return P


    cdf = cdf.dimshuffle(1,0,2)
    if Y is not None:
        Y = Y.T.astype('int8')
    edges = edges.T.astype('int8')

    def inner_function(e, t, cdf):
        if Y is None:
            jpdf = comp_jpdf(cdf[e], t)
        else:
            jpdf = comp_jpdf(cdf[e], t, Y[e])
        return jpdf

    # inner_function(edges[0],theta[0],cdf)

    jpdf , _ = T.scan(
        fn=inner_function,
        sequences=[edges, theta],
        non_sequences=[cdf]
    )

    return -log_prob(jpdf)
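comp_jpdf above turns two per-node class CDFs into joint cell probabilities with the two-dimensional rectangle rule, P(y0, y1) = C(u1, v1) - C(u0, v1) - C(u1, v0) + C(u0, v0). A small NumPy check of that identity using the independence copula C(u, v) = u*v; everything here is illustrative and not part of the original module:

import numpy as np

def indep_copula(u, v, d=None):
    return u * v                       # independence copula, ignores d

pdf0 = np.array([0.2, 0.5, 0.3])       # marginal pmf of the first node
pdf1 = np.array([0.6, 0.4])            # marginal pmf of the second node
cdf0 = np.concatenate(([0.], np.cumsum(pdf0)))
cdf1 = np.concatenate(([0.], np.cumsum(pdf1)))

y0, y1 = 1, 0                          # pick one joint cell
P = (indep_copula(cdf0[y0 + 1], cdf1[y1 + 1])
     - indep_copula(cdf0[y0],     cdf1[y1 + 1])
     - indep_copula(cdf0[y0 + 1], cdf1[y1])
     + indep_copula(cdf0[y0],     cdf1[y1]))
print(P, pdf0[y0] * pdf1[y1])          # both equal 0.3 under independence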
Example #53
    def test_grad_s(self):

        "tests that the gradients with respect to s_i are 0 after doing a mean field update of s_i "

        model = self.model
        e_step = self.e_step
        X = self.X

        assert X.shape[0] == self.m

        model.test_batch_size = X.shape[0]

        init_H = e_step.init_H_hat(V = X)
        init_Mu1 = e_step.init_S_hat(V = X)

        prev_setting = config.compute_test_value
        config.compute_test_value= 'off'
        H, Mu1 = function([], outputs=[init_H, init_Mu1])()
        config.compute_test_value = prev_setting

        H = broadcast(H, self.m)
        Mu1 = broadcast(Mu1, self.m)

        H = np.cast[config.floatX](self.model.rng.uniform(0.,1.,H.shape))
        Mu1 = np.cast[config.floatX](self.model.rng.uniform(-5.,5.,Mu1.shape))



        H_var = T.matrix(name='H_var')
        H_var.tag.test_value = H
        Mu1_var = T.matrix(name='Mu1_var')
        Mu1_var.tag.test_value = Mu1
        idx = T.iscalar()
        idx.tag.test_value = 0


        S = e_step.infer_S_hat(V = X, H_hat = H_var, S_hat = Mu1_var)

        s_idx = S[:,idx]

        s_i_func = function([H_var,Mu1_var,idx],s_idx)

        sigma0 = 1. / model.alpha
        Sigma1 = e_step.infer_var_s1_hat()
        mu0 = T.zeros_like(model.mu)

        #by truncated KL, I mean that I am dropping terms that don't depend on H and Mu1
        # (they don't affect the outcome of this test and some of them are intractable )
        trunc_kl = - model.entropy_hs(H_hat = H_var, var_s0_hat = sigma0, var_s1_hat = Sigma1) + \
                     model.expected_energy_vhs(V = X, H_hat = H_var, S_hat = Mu1_var, var_s0_hat = sigma0, var_s1_hat = Sigma1)

        grad_Mu1 = T.grad(trunc_kl.sum(), Mu1_var)

        grad_Mu1_idx = grad_Mu1[:,idx]

        grad_func = function([H_var, Mu1_var, idx], grad_Mu1_idx)

        for i in xrange(self.N):
            Mu1[:,i] = s_i_func(H, Mu1, i)

            g = grad_func(H,Mu1,i)

            assert not np.any(np.isnan(g))

            g_abs_max = np.abs(g).max()


            if g_abs_max > self.tol:
                raise Exception('after mean field step, gradient of kl divergence wrt mean field parameter should be 0, but here the max magnitude of a gradient element is '+str(g_abs_max)+' after updating s_'+str(i))
Example #54
    def test_value_h(self):

        "tests that the value of the kl divergence decreases with each update to h_i "

        model = self.model
        e_step = self.e_step
        X = self.X

        assert X.shape[0] == self.m

        init_H = e_step.init_H_hat(V = X)
        init_Mu1 = e_step.init_S_hat(V = X)

        prev_setting = config.compute_test_value
        config.compute_test_value= 'off'
        H, Mu1 = function([], outputs=[init_H, init_Mu1])()
        config.compute_test_value = prev_setting

        H = broadcast(H, self.m)
        Mu1 = broadcast(Mu1, self.m)

        H = np.cast[config.floatX](self.model.rng.uniform(0.,1.,H.shape))
        Mu1 = np.cast[config.floatX](self.model.rng.uniform(-5.,5.,Mu1.shape))


        H_var = T.matrix(name='H_var')
        H_var.tag.test_value = H
        Mu1_var = T.matrix(name='Mu1_var')
        Mu1_var.tag.test_value = Mu1
        idx = T.iscalar()
        idx.tag.test_value = 0

        newH = e_step.infer_H_hat(V = X, H_hat = H_var, S_hat = Mu1_var)


        h_idx = newH[:,idx]


        h_i_func = function([H_var,Mu1_var,idx],h_idx)

        sigma0 = 1. / model.alpha
        Sigma1 = e_step.infer_var_s1_hat()
        mu0 = T.zeros_like(model.mu)

        #by truncated KL, I mean that I am dropping terms that don't depend on H and Mu1
        # (they don't affect the outcome of this test and some of them are intractable )
        trunc_kl = - model.entropy_hs(H_hat = H_var, var_s0_hat = sigma0, var_s1_hat = Sigma1) + \
                     model.expected_energy_vhs(V = X, H_hat = H_var, S_hat = Mu1_var, var_s0_hat = sigma0, var_s1_hat = Sigma1)

        trunc_kl_func = function([H_var, Mu1_var], trunc_kl)

        for i in xrange(self.N):
            prev_kl = trunc_kl_func(H,Mu1)

            H[:,i] = h_i_func(H, Mu1, i)
            #we don't update mu, the whole point of the split e step is we don't have to

            new_kl = trunc_kl_func(H,Mu1)


            increase = new_kl - prev_kl


            print 'failures after iteration ',i,': ',(increase > self.tol).sum()

            mx = increase.max()

            if mx > 1e-4:
                print 'increase amounts of failing examples:'
                print increase[increase > self.tol]
                print 'failing H:'
                print H[increase > self.tol,:]
                print 'failing Mu1:'
                print Mu1[increase > self.tol,:]
                print 'failing V:'
                print X[increase > self.tol,:]


                raise Exception('after mean field step in h, kl divergence should decrease, but some elements increased by as much as '+str(mx)+' after updating h_'+str(i))
Example #55
def experiment(state, channel):
    if state.test_model and 'config' in os.listdir('.'):
        print 'Loading local config file'
        config_file = open('config', 'r')
        config = config_file.readlines()
        try:
            config_vals = config[0].split('(')[1:][0].split(')')[:-1][0].split(
                ', ')
        except:
            config_vals = config[0][3:-1].replace(': ',
                                                  '=').replace("'",
                                                               "").split(', ')
            config_vals = filter(
                lambda x: not 'jobman' in x and not '/' in x and not ':' in x
                and not 'experiment' in x, config_vals)

        for CV in config_vals:
            print CV
            if CV.startswith('test'):
                print 'Do not override testing switch'
                continue
            try:
                exec('state.' + CV) in globals(), locals()
            except:
                exec('state.' + CV.split('=')[0] + "='" + CV.split('=')[1] +
                     "'") in globals(), locals()

    else:
        # Save the current configuration
        # Useful for logs/experiments
        print 'Saving config'
        f = open('config', 'w')
        f.write(str(state))
        f.close()

    print state
    # Load the data, train = train+valid, and shuffle train
    # Targets are not used (they will be misaligned after shuffling train)
    if state.dataset == 'MNIST':
        (train_X, train_Y), (valid_X,
                             valid_Y), (test_X,
                                        test_Y) = load_mnist(state.data_path)
        train_X = numpy.concatenate((train_X, valid_X))

    elif state.dataset == 'MNIST_binary':
        (train_X,
         train_Y), (valid_X,
                    valid_Y), (test_X,
                               test_Y) = load_mnist_binary(state.data_path)
        train_X = numpy.concatenate((train_X, valid_X))

    elif state.dataset == 'TFD':
        (train_X, train_Y), (valid_X,
                             valid_Y), (test_X,
                                        test_Y) = load_tfd(state.data_path)

    N_input = train_X.shape[1]
    root_N_input = numpy.sqrt(N_input)
    numpy.random.seed(1)
    numpy.random.shuffle(train_X)
    train_X = theano.shared(train_X)
    valid_X = theano.shared(valid_X)
    test_X = theano.shared(test_X)

    # Theano variables and RNG
    X = T.fmatrix()
    index = T.lscalar()
    MRG = RNG_MRG.MRG_RandomStreams(1)

    # Network and training specifications
    K = state.K  # N hidden layers
    N = state.N  # number of walkbacks
    # layer sizes, from h0 to hK (h0 is the visible layer)
    layer_sizes = [N_input] + [state.hidden_size] * K
    learning_rate = theano.shared(cast32(state.learning_rate))  # learning rate
    annealing = cast32(state.annealing)  # exponential annealing coefficient
    momentum = theano.shared(cast32(state.momentum))  # momentum term

    # THEANO VARIABLES
    X = T.fmatrix()  # Input of the graph
    index = T.lscalar()  # index to minibatch
    MRG = RNG_MRG.MRG_RandomStreams(1)

    # PARAMETERS : weights list and bias list.
    # initialize a list of weights and biases based on layer_sizes
    weights_list = [
        get_shared_weights(
            layer_sizes[i], layer_sizes[i + 1],
            numpy.sqrt(6. / (layer_sizes[i] + layer_sizes[i + 1])), 'W')
        for i in range(K)
    ]
    bias_list = [get_shared_bias(layer_sizes[i], 'b') for i in range(K + 1)]

    if state.test_model:
        # Load the parameters of the last epoch
        # maybe if the path is given, load these specific attributes
        param_files = filter(lambda x: 'params' in x, os.listdir('.'))
        max_epoch_idx = numpy.argmax(
            [int(x.split('_')[-1].split('.')[0]) for x in param_files])
        params_to_load = param_files[max_epoch_idx]
        PARAMS = cPickle.load(open(params_to_load, 'r'))
        [
            p.set_value(lp.get_value(borrow=False))
            for lp, p in zip(PARAMS[:len(weights_list)], weights_list)
        ]
        [
            p.set_value(lp.get_value(borrow=False))
            for lp, p in zip(PARAMS[len(weights_list):], bias_list)
        ]

    # Util functions
    def dropout(IN, p=0.5):
        noise = MRG.binomial(p=p, n=1, size=IN.shape, dtype='float32')
        OUT = (IN * noise) / cast32(p)
        return OUT

    def add_gaussian_noise(IN, std=1):
        print 'GAUSSIAN NOISE : ', std
        noise = MRG.normal(avg=0, std=std, size=IN.shape, dtype='float32')
        OUT = IN + noise
        return OUT

    def corrupt_input(IN, p=0.5):
        # salt and pepper? masking?
        noise = MRG.binomial(p=p, n=1, size=IN.shape, dtype='float32')
        IN = IN * noise
        return IN

    def salt_and_pepper(IN, p=0.2):
        # salt and pepper noise
        print 'DAE uses salt and pepper noise'
        a = MRG.binomial(size=IN.shape, n=1, p=1 - p, dtype='float32')
        b = MRG.binomial(size=IN.shape, n=1, p=0.5, dtype='float32')
        c = T.eq(a, 0) * b
        return IN * a + c

    # Odd layer update function
    # just a loop over the odd layers
    def update_odd_layers(hiddens, noisy):
        for i in range(1, K + 1, 2):
            print i
            if noisy:
                simple_update_layer(hiddens, None, i)
            else:
                simple_update_layer(hiddens, None, i, add_noise=False)

    # Even layer update
    # p_X_chain is given to append the p(X|...) at each update (one update = odd update + even update)
    def update_even_layers(hiddens, p_X_chain, noisy):
        for i in range(0, K + 1, 2):
            print i
            if noisy:
                simple_update_layer(hiddens, p_X_chain, i)
            else:
                simple_update_layer(hiddens, p_X_chain, i, add_noise=False)

    # The layer update function
    # hiddens   :   list containing the symbolic theano variables [visible, hidden1, hidden2, ...]
    #               layer_update will modify this list inplace
    # p_X_chain :   list containing the successive p(X|...) at each update
    #               update_layer will append to this list
    # add_noise     : pre and post activation gaussian noise

    def simple_update_layer(hiddens, p_X_chain, i, add_noise=True):
        # Compute the dot product, whatever layer
        post_act_noise = 0

        if i == 0:
            hiddens[i] = T.dot(hiddens[i + 1],
                               weights_list[i].T) + bias_list[i]

        elif i == K:
            hiddens[i] = T.dot(hiddens[i - 1],
                               weights_list[i - 1]) + bias_list[i]

        else:
            # next layer        :   layers[i+1], assigned weights : W_i
            # previous layer    :   layers[i-1], assigned weights : W_(i-1)
            hiddens[i] = T.dot(hiddens[i + 1], weights_list[i].T) + T.dot(
                hiddens[i - 1], weights_list[i - 1]) + bias_list[i]

        # Add pre-activation noise if NOT input layer
        if i == 1 and state.noiseless_h1:
            print '>>NO noise in first layer'
            add_noise = False

        # pre activation noise
        if i != 0 and add_noise:
            print 'Adding pre-activation gaussian noise'
            hiddens[i] = add_gaussian_noise(hiddens[i],
                                            state.hidden_add_noise_sigma)

        # ACTIVATION!
        if i == 0:
            print 'Sigmoid units'
            hiddens[i] = T.nnet.sigmoid(hiddens[i])
        else:
            print 'Hidden units'
            hiddens[i] = hidden_activation(hiddens[i])

        # post activation noise
        if i != 0 and add_noise:
            print 'Adding post-activation gaussian noise'
            hiddens[i] = add_gaussian_noise(hiddens[i],
                                            state.hidden_add_noise_sigma)

        # build the reconstruction chain
        if i == 0:
            # if input layer -> append p(X|...)
            p_X_chain.append(hiddens[i])

            # sample from p(X|...)
            if state.input_sampling:
                print 'Sampling from input'
                sampled = MRG.binomial(p=hiddens[i],
                                       size=hiddens[i].shape,
                                       dtype='float32')
            else:
                print '>>NO input sampling'
                sampled = hiddens[i]
            # add noise
            sampled = salt_and_pepper(sampled, state.input_salt_and_pepper)

            # set input layer
            hiddens[i] = sampled

    def update_layers(hiddens, p_X_chain, noisy=True):
        print 'odd layer update'
        update_odd_layers(hiddens, noisy)
        print
        print 'even layer update'
        update_even_layers(hiddens, p_X_chain, noisy)

    ''' F PROP '''
    #X = T.fmatrix()
    if state.act == 'sigmoid':
        print 'Using sigmoid activation'
        hidden_activation = T.nnet.sigmoid
    elif state.act == 'rectifier':
        print 'Using rectifier activation'
        hidden_activation = lambda x: T.maximum(cast32(0), x)
    elif state.act == 'tanh':
        hidden_activation = lambda x: T.tanh(x)
    ''' Corrupt X '''
    X_corrupt = salt_and_pepper(X, state.input_salt_and_pepper)
    ''' hidden layer init '''

    hiddens = [X_corrupt]
    p_X_chain = []
    print "Hidden units initialization"
    for w, b in zip(weights_list, bias_list[1:]):
        # init with zeros
        print "Init hidden units at zero before creating the graph"
        hiddens.append(T.zeros_like(T.dot(hiddens[-1], w)))

    # The layer update scheme
    print "Building the graph :", N, "updates"
    for i in range(N):
        update_layers(hiddens, p_X_chain)

    # COST AND GRADIENTS

    print 'Cost w.r.t p(X|...) at every step in the graph'
    #COST        =   T.mean(T.nnet.binary_crossentropy(reconstruction, X))
    COST = [T.mean(T.nnet.binary_crossentropy(rX, X)) for rX in p_X_chain]
    show_COST = COST[-1]
    COST = numpy.sum(COST)

    params = weights_list + bias_list

    gradient = T.grad(COST, params)

    gradient_buffer = [
        theano.shared(numpy.zeros(x.get_value().shape, dtype='float32'))
        for x in params
    ]

    m_gradient = [
        momentum * gb + (cast32(1) - momentum) * g
        for (gb, g) in zip(gradient_buffer, gradient)
    ]
    g_updates = [(p, p - learning_rate * mg)
                 for (p, mg) in zip(params, m_gradient)]
    b_updates = zip(gradient_buffer, m_gradient)

    updates = OrderedDict(g_updates + b_updates)

    f_cost = theano.function(inputs=[X], outputs=show_COST)

    indexed_batch = train_X[index * state.batch_size:(index + 1) *
                            state.batch_size]
    sampled_batch = MRG.binomial(p=indexed_batch,
                                 size=indexed_batch.shape,
                                 dtype='float32')

    f_learn = theano.function(inputs=[index],
                              updates=updates,
                              givens={X: indexed_batch},
                              outputs=show_COST)

    f_test = theano.function(inputs=[X],
                             outputs=[X_corrupt] + hiddens + p_X_chain,
                             on_unused_input='warn')

    #############
    # Denoise some numbers  :   show number, noisy number, reconstructed number
    #############
    import random as R
    R.seed(1)
    random_idx = numpy.array(R.sample(range(len(test_X.get_value())), 100))
    numbers = test_X.get_value()[random_idx]

    f_noise = theano.function(inputs=[X],
                              outputs=salt_and_pepper(
                                  X, state.input_salt_and_pepper))
    noisy_numbers = f_noise(test_X.get_value()[random_idx])

    # Recompile the graph without noise for reconstruction function
    hiddens_R = [X]
    p_X_chain_R = []

    for w, b in zip(weights_list, bias_list[1:]):
        # init with zeros
        hiddens_R.append(T.zeros_like(T.dot(hiddens_R[-1], w)))

    # The layer update scheme
    for i in range(N):
        update_layers(hiddens_R, p_X_chain_R, noisy=False)

    f_recon = theano.function(inputs=[X], outputs=p_X_chain_R[-1])

    ############
    # Sampling #
    ############

    # the input to the sampling function
    network_state_input = [X] + [T.fmatrix() for i in range(K)]

    # "Output" state of the network (noisy)
    # initialized with input, then we apply updates
    #network_state_output    =   network_state_input

    network_state_output = [X] + network_state_input[1:]

    visible_pX_chain = []

    # ONE update
    update_layers(network_state_output, visible_pX_chain, noisy=True)

    if K == 1:
        f_sample_simple = theano.function(inputs=[X],
                                          outputs=visible_pX_chain[-1])

    # WHY IS THERE A WARNING????
    # because the first odd layers are not used -> directly computed FROM THE EVEN layers
    # unused input = warn
    f_sample2 = theano.function(inputs=network_state_input,
                                outputs=network_state_output +
                                visible_pX_chain,
                                on_unused_input='warn')

    def sample_some_numbers_single_layer():
        x0 = test_X.get_value()[:1]
        samples = [x0]
        x = f_noise(x0)
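        # GSN sampling chain: reconstruct, binarize the reconstruction,
        # corrupt it again, and feed it back into the network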
        for i in range(399):
            x = f_sample_simple(x)
            samples.append(x)
            x = numpy.random.binomial(n=1, p=x, size=x.shape).astype('float32')
            x = f_noise(x)
        return numpy.vstack(samples)

    def sampling_wrapper(NSI):
        out = f_sample2(*NSI)
        NSO = out[:len(network_state_output)]
        vis_pX_chain = out[len(network_state_output):]
        return NSO, vis_pX_chain

    def sample_some_numbers(N=400):
        # The network's initial state
        init_vis = test_X.get_value()[:1]

        noisy_init_vis = f_noise(init_vis)

        network_state = [[noisy_init_vis] + [
            numpy.zeros((1, len(b.get_value())), dtype='float32')
            for b in bias_list[1:]
        ]]

        visible_chain = [init_vis]

        noisy_h0_chain = [noisy_init_vis]

        for i in range(N - 1):

            # feed the last state into the network, compute new state, and obtain visible units expectation chain
            net_state_out, vis_pX_chain = sampling_wrapper(network_state[-1])

            # append to the visible chain
            visible_chain += vis_pX_chain

            # append state output to the network state chain
            network_state.append(net_state_out)

            noisy_h0_chain.append(net_state_out[0])

        return numpy.vstack(visible_chain), numpy.vstack(noisy_h0_chain)

    def plot_samples(epoch_number):
        to_sample = time.time()
        if K == 1:
            # one layer model
            V = sample_some_numbers_single_layer()
        else:
            V, H0 = sample_some_numbers()
        img_samples = PIL.Image.fromarray(
            tile_raster_images(V, (root_N_input, root_N_input), (20, 20)))

        fname = 'samples_epoch_' + str(epoch_number) + '.png'
        img_samples.save(fname)
        print 'Took ' + str(time.time() - to_sample) + ' seconds to sample 400 numbers'

    ##############
    # Inpainting #
    ##############
    def inpainting(digit):
        # The network's initial state

        # NOISE INIT
        init_vis = cast32(numpy.random.uniform(size=digit.shape))

        #noisy_init_vis  =   f_noise(init_vis)
        #noisy_init_vis  =   cast32(numpy.random.uniform(size=init_vis.shape))

        # INDEXES FOR VISIBLE AND NOISY PART
        noise_idx = (numpy.arange(N_input) % root_N_input < (root_N_input / 2))
        fixed_idx = (numpy.arange(N_input) % root_N_input >= (root_N_input / 2))
        # '>=' so the middle column is clamped as well; with '>' it would be
        # neither noised nor fixed

        # function to re-init the visible to the same noise

        # FUNCTION TO RESET HALF VISIBLE TO DIGIT
        def reset_vis(V):
            V[0][fixed_idx] = digit[0][fixed_idx]
            return V

        # INIT DIGIT : NOISE and RESET HALF TO DIGIT
        init_vis = reset_vis(init_vis)

        network_state = [[init_vis] + [
            numpy.zeros((1, len(b.get_value())), dtype='float32')
            for b in bias_list[1:]
        ]]

        visible_chain = [init_vis]

        noisy_h0_chain = [init_vis]

        for i in range(49):

            # feed the last state into the network, compute new state, and obtain visible units expectation chain
            net_state_out, vis_pX_chain = sampling_wrapper(network_state[-1])

            # reset half the digit
            net_state_out[0] = reset_vis(net_state_out[0])
            vis_pX_chain[0] = reset_vis(vis_pX_chain[0])

            # append to the visible chain
            visible_chain += vis_pX_chain

            # append state output to the network state chain
            network_state.append(net_state_out)

            noisy_h0_chain.append(net_state_out[0])

        return numpy.vstack(visible_chain), numpy.vstack(noisy_h0_chain)

    def save_params(n, params):
        print 'saving parameters...'
        save_path = 'params_epoch_' + str(n) + '.pkl'
        f = open(save_path, 'wb')
        try:
            cPickle.dump(params, f, protocol=cPickle.HIGHEST_PROTOCOL)
        finally:
            f.close()

    # TRAINING
    n_epoch = state.n_epoch
    batch_size = state.batch_size
    STOP = False
    counter = 0

    train_costs = []
    valid_costs = []
    test_costs = []

    if state.vis_init:
        # numpy.clip takes (a, a_min, a_max): clip the mean pixel activations
        # into (0.001, 0.9) before passing them through the logit
        bias_list[0].set_value(
            logit(numpy.clip(train_X.get_value().mean(axis=0), 0.001, 0.9)))

    if state.test_model:
        # If testing, do not train and go directly to generating samples, parzen window estimation, and inpainting
        print 'Testing : skip training'
        STOP = True

    while not STOP:
        counter += 1
        t = time.time()
        print counter, '\t',

        #train
        train_cost = []
        for i in range(len(train_X.get_value(borrow=True)) / batch_size):
            #train_cost.append(f_learn(train_X[i * batch_size : (i+1) * batch_size]))
            #training_idx = numpy.array(range(i*batch_size, (i+1)*batch_size), dtype='int32')
            train_cost.append(f_learn(i))
        train_cost = numpy.mean(train_cost)
        train_costs.append(train_cost)
        print 'Train : ', trunc(train_cost), '\t',

        #valid
        valid_cost = []
        for i in range(len(valid_X.get_value(borrow=True)) / 100):
            valid_cost.append(
                f_cost(valid_X.get_value()[i * 100:(i + 1) * 100]))
        valid_cost = numpy.mean(valid_cost)
        #valid_cost  =   123
        valid_costs.append(valid_cost)
        print 'Valid : ', trunc(valid_cost), '\t',

        #test
        test_cost = []
        for i in range(len(test_X.get_value(borrow=True)) / 100):
            test_cost.append(
                f_cost(test_X.get_value()[i * 100:(i + 1) * 100]))
        test_cost = numpy.mean(test_cost)
        test_costs.append(test_cost)
        print 'Test  : ', trunc(test_cost), '\t',

        if counter >= n_epoch:
            STOP = True

        print 'time : ', trunc(time.time() - t),

        print 'MeanVisB : ', trunc(bias_list[0].get_value().mean()),

        print 'W : ', [
            trunc(abs(w.get_value(borrow=True)).mean()) for w in weights_list
        ]

        if (counter % 5) == 0:
            # Checking reconstruction
            reconstructed = f_recon(noisy_numbers)
            # Concatenate stuff
            stacked = numpy.vstack([
                numpy.vstack([
                    numbers[i * 10:(i + 1) * 10],
                    noisy_numbers[i * 10:(i + 1) * 10],
                    reconstructed[i * 10:(i + 1) * 10]
                ]) for i in range(10)
            ])

            number_reconstruction = PIL.Image.fromarray(
                tile_raster_images(stacked, (root_N_input, root_N_input),
                                   (10, 30)))
            #epoch_number    =   reduce(lambda x,y : x + y, ['_'] * (4-len(str(counter)))) + str(counter)
            number_reconstruction.save('number_reconstruction' + str(counter) +
                                       '.png')

            #sample_numbers(counter, 'seven')
            plot_samples(counter)

            #save params
            save_params(counter, params)

        # ANNEAL!
        new_lr = learning_rate.get_value() * annealing
        learning_rate.set_value(new_lr)

    # Save
    state.train_costs = train_costs
    state.valid_costs = valid_costs
    state.test_costs = test_costs

    # if test

    # 10k samples
    print 'Generating 10,000 samples'
    samples, _ = sample_some_numbers(N=10000)
    f_samples = 'samples.npy'
    numpy.save(f_samples, samples)
    print 'saved digits'

    # parzen
    print 'Evaluating parzen window'
    import likelihood_estimation_parzen
    likelihood_estimation_parzen.main(0.20, 'mnist')

    # Inpainting
    print 'Inpainting'
    test_X = test_X.get_value()

    numpy.random.seed(2)
    test_idx = numpy.arange(len(test_Y))

    for Iter in range(10):

        numpy.random.shuffle(test_idx)
        test_X = test_X[test_idx]
        test_Y = test_Y[test_idx]

        digit_idx = [(test_Y == i).argmax() for i in range(10)]
        inpaint_list = []

        for idx in digit_idx:
            DIGIT = test_X[idx:idx + 1]
            V_inpaint, H_inpaint = inpainting(DIGIT)
            inpaint_list.append(V_inpaint)

        INPAINTING = numpy.vstack(inpaint_list)

        plot_inpainting = PIL.Image.fromarray(
            tile_raster_images(INPAINTING, (root_N_input, root_N_input),
                               (10, 50)))

        fname = 'inpainting_' + str(Iter) + '.png'
        #fname   =   os.path.join(state.model_path, fname)

        plot_inpainting.save(fname)

        if False and __name__ == "__main__":
            os.system('eog inpainting.png')

    if __name__ == '__main__':
        import ipdb
        ipdb.set_trace()

    return
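
For reference, f_noise above wraps a salt_and_pepper corruption defined elsewhere in the repository. A minimal sketch of such an op, assuming the usual salt-and-pepper definition (replace a fraction of the entries with random 0/1 values) rather than the repository's exact implementation:

import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

rng = MRG_RandomStreams(seed=1)

def salt_and_pepper(X, rate=0.2):
    # keep each entry with probability 1 - rate ...
    keep = rng.binomial(size=X.shape, n=1, p=1 - rate, dtype='float32')
    # ... and replace the dropped entries with fair coin flips
    salt = rng.binomial(size=X.shape, n=1, p=0.5, dtype='float32')
    return X * keep + T.eq(keep, 0) * salt
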
Example #56
0
    def test_value_s(self):

        "tests that the value of the kl divergence decreases with each update to s_i "

        model = self.model
        e_step = self.e_step
        X = self.X

        assert X.shape[0] == self.m

        init_H = e_step.init_H_hat(V = X)
        init_Mu1 = e_step.init_S_hat(V = X)

        prev_setting = config.compute_test_value
        config.compute_test_value = 'off'
        H, Mu1 = function([], outputs=[init_H, init_Mu1])()
        config.compute_test_value = prev_setting

        H = broadcast(H, self.m)
        Mu1 = broadcast(Mu1, self.m)

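        # overwrite the inits with random values; the loop below then checks that
        # the KL decreases from an arbitrary starting point, not just from the
        # model's own initialization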
        H = np.cast[config.floatX](self.model.rng.uniform(0.,1.,H.shape))
        Mu1 = np.cast[config.floatX](self.model.rng.uniform(-5.,5.,Mu1.shape))


        H_var = T.matrix(name='H_var')
        H_var.tag.test_value = H
        Mu1_var = T.matrix(name='Mu1_var')
        Mu1_var.tag.test_value = Mu1
        idx = T.iscalar()
        idx.tag.test_value = 0

        S = e_step.infer_S_hat( V = X, H_hat = H_var, S_hat = Mu1_var)

        s_idx = S[:,idx]

        s_i_func = function([H_var,Mu1_var,idx],s_idx)

        sigma0 = 1. / model.alpha
        Sigma1 = e_step.infer_var_s1_hat()
        mu0 = T.zeros_like(model.mu)

        # "truncated KL" means that terms not depending on H and Mu1 are dropped
        # (they don't affect the outcome of this test, and some are intractable)
        trunc_kl = - model.entropy_hs(H_hat = H_var, var_s0_hat = sigma0, var_s1_hat = Sigma1) + \
                     model.expected_energy_vhs(V = X, H_hat = H_var, S_hat = Mu1_var, var_s0_hat = sigma0, var_s1_hat = Sigma1)

        trunc_kl_func = function([H_var, Mu1_var], trunc_kl)

        for i in xrange(self.N):
            prev_kl = trunc_kl_func(H,Mu1)

            Mu1[:,i] = s_i_func(H, Mu1, i)

            new_kl = trunc_kl_func(H,Mu1)


            increase = new_kl - prev_kl


            mx = increase.max()

            if mx > 1e-3:
                raise Exception('after mean field step in s, kl divergence should decrease, but some elements increased by as much as '+str(mx)+' after updating s_'+str(i))
Example #57
0
    def __init__(self, babi_train_raw, babi_test_raw, word2vec,
                 word_vector_size, dim, mode, input_mask_mode, memory_hops, l2,
                 normalize_attention, **kwargs):

        print "==> not used params in DMN class:", kwargs.keys()
        self.vocab = {}
        self.ivocab = {}

        self.word2vec = word2vec
        self.word_vector_size = word_vector_size
        self.dim = dim
        self.mode = mode
        self.input_mask_mode = input_mask_mode
        self.memory_hops = memory_hops
        #self.batch_size = 1
        self.l2 = l2
        self.normalize_attention = normalize_attention

        self.train_input, self.train_q, self.train_answer, self.train_choices, self.train_input_mask = self._process_input(
            babi_train_raw)
        self.test_input, self.test_q, self.test_answer, self.test_choices, self.test_input_mask = self._process_input(
            babi_test_raw)
        self.vocab_size = 4  # number of answer choices

        self.inp_var = T.matrix('input_var')
        self.q_var = T.matrix('question_var')
        self.ca_var = T.matrix('ca_var')
        self.cb_var = T.matrix('cb_var')
        self.cc_var = T.matrix('cc_var')
        self.cd_var = T.matrix('cd_var')
        self.ans_var = T.iscalar('answer_var')
        self.input_mask_var = T.ivector('input_mask_var')

        print "==> building input module"
        self.W_inp_res_in = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.word_vector_size)),
                                          borrow=True)
        self.W_inp_res_hid = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.dim)),
                                           borrow=True)
        self.b_inp_res = theano.shared(lasagne.init.Constant(0.0).sample(
            (self.dim, )),
                                       borrow=True)

        self.W_inp_upd_in = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.word_vector_size)),
                                          borrow=True)
        self.W_inp_upd_hid = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.dim)),
                                           borrow=True)
        self.b_inp_upd = theano.shared(lasagne.init.Constant(0.0).sample(
            (self.dim, )),
                                       borrow=True)

        self.W_inp_hid_in = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.word_vector_size)),
                                          borrow=True)
        self.W_inp_hid_hid = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.dim)),
                                           borrow=True)
        self.b_inp_hid = theano.shared(lasagne.init.Constant(0.0).sample(
            (self.dim, )),
                                       borrow=True)

        inp_c_history, _ = theano.scan(fn=self.input_gru_step,
                                       sequences=self.inp_var,
                                       outputs_info=T.zeros_like(
                                           self.b_inp_hid))
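        # outputs_info=T.zeros_like(self.b_inp_hid) gives the input GRU a zero
        # initial hidden state with the same shape and dtype as its hidden bias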

        self.inp_c = inp_c_history.take(self.input_mask_var, axis=0)

        self.q_q, _ = theano.scan(fn=self.input_gru_step,
                                  sequences=self.q_var,
                                  outputs_info=T.zeros_like(self.b_inp_hid))

        self.q_q = self.q_q[-1]

        self.c_vecs = []
        for choice in [self.ca_var, self.cb_var, self.cc_var, self.cd_var]:
            history, _ = theano.scan(fn=self.input_gru_step,
                                     sequences=choice,
                                     outputs_info=T.zeros_like(self.b_inp_hid))
            self.c_vecs.append(history[-1])

        self.c_vecs = T.stack(self.c_vecs).transpose((1, 0))  # (dim, 4)
        self.inp_c = T.stack([self.inp_c] * 4).transpose(
            (1, 2, 0))  # (fact_cnt, dim, 4)
        self.q_q = T.stack([self.q_q] * 4).transpose((1, 0))  # (dim, 4)

        print "==> creating parameters for memory module"
        self.W_mem_res_in = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.dim)),
                                          borrow=True)
        self.W_mem_res_hid = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.dim)),
                                           borrow=True)
        self.b_mem_res = theano.shared(lasagne.init.Constant(0.0).sample(
            (self.dim, )),
                                       borrow=True)

        self.W_mem_upd_in = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.dim)),
                                          borrow=True)
        self.W_mem_upd_hid = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.dim)),
                                           borrow=True)
        self.b_mem_upd = theano.shared(lasagne.init.Constant(0.0).sample(
            (self.dim, )),
                                       borrow=True)

        self.W_mem_hid_in = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.dim)),
                                          borrow=True)
        self.W_mem_hid_hid = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.dim)),
                                           borrow=True)
        self.b_mem_hid = theano.shared(lasagne.init.Constant(0.0).sample(
            (self.dim, )),
                                       borrow=True)

        self.W_b = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.dim)),
                                 borrow=True)
        self.W_1 = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, 10 * self.dim + 3)),
                                 borrow=True)
        self.W_2 = theano.shared(lasagne.init.Normal(0.1).sample(
            (1, self.dim)),
                                 borrow=True)
        self.b_1 = theano.shared(lasagne.init.Constant(0.0).sample(
            (self.dim, )),
                                 borrow=True)
        self.b_2 = theano.shared(lasagne.init.Constant(0.0).sample((1, )),
                                 borrow=True)

        print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops
        memory = [self.q_q.copy()]  # (dim, 4)
        for iter in range(1, self.memory_hops + 1):
            current_episode = self.new_episode(memory[iter - 1])
            memory.append(
                self.GRU_update_batch(memory[iter - 1], current_episode,
                                      self.W_mem_res_in, self.W_mem_res_hid,
                                      self.b_mem_res, self.W_mem_upd_in,
                                      self.W_mem_upd_hid, self.b_mem_upd,
                                      self.W_mem_hid_in, self.W_mem_hid_hid,
                                      self.b_mem_hid))

        last_mem = memory[-1].flatten()

        print "==> building answer module"
        self.W_a = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.vocab_size, 4 * self.dim)),
                                 borrow=True)
        self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem))

        print "==> collecting all parameters"
        self.params = [
            self.W_inp_res_in, self.W_inp_res_hid, self.b_inp_res,
            self.W_inp_upd_in, self.W_inp_upd_hid, self.b_inp_upd,
            self.W_inp_hid_in, self.W_inp_hid_hid, self.b_inp_hid,
            self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res,
            self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd,
            self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, self.W_b,
            self.W_1, self.W_2, self.b_1, self.b_2, self.W_a
        ]

        print "==> building loss layer and computing updates"
        self.loss_ce = T.nnet.categorical_crossentropy(
            self.prediction.dimshuffle('x', 0), T.stack([self.ans_var]))[0]
        if self.l2 > 0:
            self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params)
        else:
            self.loss_l2 = 0

        self.loss = self.loss_ce + self.loss_l2

        updates = lasagne.updates.adadelta(self.loss, self.params)

        if self.mode == 'train':
            print "==> compiling train_fn"
            self.train_fn = theano.function(
                inputs=[
                    self.inp_var, self.q_var, self.ans_var, self.ca_var,
                    self.cb_var, self.cc_var, self.cd_var, self.input_mask_var
                ],
                outputs=[self.prediction, self.loss],
                updates=updates)

        print "==> compiling test_fn"
        self.test_fn = theano.function(inputs=[
            self.inp_var, self.q_var, self.ans_var, self.ca_var, self.cb_var,
            self.cc_var, self.cd_var, self.input_mask_var
        ],
                                       outputs=[
                                           self.prediction, self.loss,
                                           self.inp_c, self.q_q, last_mem
                                       ])

        if self.mode == 'train':
            print "==> computing gradients (for debugging)"
            gradient = T.grad(self.loss, self.params)
            self.get_gradient_fn = theano.function(inputs=[
                self.inp_var, self.q_var, self.ans_var, self.ca_var,
                self.cb_var, self.cc_var, self.cd_var, self.input_mask_var
            ],
                                                   outputs=gradient)
Example #58
0
    def __init__(self,
                 inpt,
                 wts,
                 centers,
                 rand_gen=None,
                 n_in=None,
                 n_features=None,
                 n_classes=None,
                 kind='LOGIT',
                 learn_centers=False,
                 junk_dist=np.inf,
                 reg=()):
        # wts (n_in x n_features)
        # centers (n_classesx n_features)

        assert kind in activs
        # use `is not None` for arrays: numpy arrays are ambiguous in a boolean context
        assert n_in or wts is not None
        assert n_features or wts is not None or centers is not None
        assert n_classes or centers is not None
        assert kind == 'RBF' or not learn_centers

        HiddenLayer.__init__(self,
                             inpt,
                             wts,
                             rand_gen,
                             n_in,
                             n_out=n_features,
                             actvn=activs[kind],
                             pdrop=0,
                             reg=reg)

        # Initialize centers
        if centers is None:
            if kind == 'LOGIT':
                centers_vals = rand_gen.binomial(n=1,
                                                 p=.5,
                                                 size=(n_classes, n_features))
            elif kind == 'RBF':
                centers_vals = rand_gen.uniform(low=0,
                                                high=1,
                                                size=(n_classes, n_features))
            centers = np.asarray(centers_vals, dtype=float_x)

        if is_shared_var(centers):
            self.centers = centers
        else:
            self.centers = th.shared(centers, name='centers', borrow=True)

        if learn_centers:
            self.params.append(self.centers)

        # Populate various n's based on weights
        if not n_in or not n_features:
            n_in, n_features = borrow(self.w).shape
        if not n_features or not n_classes:
            n_classes, n_features = borrow(self.centers).shape

        # c = centers; v = output of hidden layer = calculated features
        self.features = self.output  # Refers to the output of HiddenLayer
        c = self.centers.dimshuffle('x', 0, 1)
        v = self.features.dimshuffle(0, 'x', 1)
        self.kind = kind
        self.junk_dist = junk_dist

        if kind == 'LOGIT':
            # BATCH_SZ x nClasses x nFeatures >> BATCH_SZ x nClasses >> BATCH_SZ
            epsilon = .001
            v = v * (1 - 2 * epsilon) + epsilon
            self.bitprob = c * v + (1 - c) * (1 - v)
            self.logprob = tt.sum(tt.log(self.bitprob), axis=2)
            # if imp == None \
            # else T.tensordot(T.log(self.bitprob), imp, axes=([2, 0]))
            self.y_preds = tt.argmax(self.logprob, axis=1)
        elif kind == 'RBF':
            dists = tt.sum((v - c)**2, axis=2)  # BATCH_SZ x nClasses
            junk_col = junk_dist + tt.zeros_like(dists[:, 1]).dimshuffle(
                0, 'x')
            self.dists = tt.concatenate([dists, junk_col], axis=1)
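            # the extra "junk" column acts as a none-of-the-above class at a fixed
            # distance; with junk_dist = inf its softmax probability is zero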
            self.probs = tt.nnet.softmax(-self.dists)  # BATCH_SZ x nClasses+1
            self.logprob = tt.log(self.probs)
            self.y_preds = tt.argmax(self.probs, axis=1)

        self.representation = (
            'CenteredOut Kind:{} In:{:3d} Hidden:{:3d} '
            'Out:{:3d} learn_centers:{} junk_dist:{}'.format(
                kind, n_in, n_features, n_classes, learn_centers, junk_dist))
Example #59
0
    def __call__(self,
                 x,
                 y,
                 qk=None,
                 n_posterior_samples=10,
                 pass_gradients=False,
                 reweight=False,
                 reweight_gen_only=False,
                 sleep_phase=False):
        '''Call function.

        Calculates the lower bound, log marginal, and other useful quantities.
        If this is more than you need, just omit what you don't need from the
        final graph.

        Args:
            x: T.tensor, input to recognition network.
            y: T.tensor, output from conditional.
            qk: T.tensor (optional), approximate posterior parameters.
                If None, calculate from recognition network.
            n_posterior_samples: int, number of samples to use for lower bound
                and log marginal estimates.
            pass_gradients: bool, for priors with continuous distributions,
                this can facilitate learning. Otherwise, qk should be provided.
            reweight: bool. If True, reweight samples for estimates.
            reweight_gen_only: bool. If True, reweight only the generative terms
                (conditional and prior), not the posterior term.
            sleep_phase: bool. If True, train the recognition network on samples
                drawn from the generative model (wake-sleep style sleep phase).
        Returns:
            results: OrderedDict, float results.
            samples: OrderedDict, array results
                (such as samples from conditional).
            updates: OrderedUpdates.
            constants: list, for omitting quantities from passing gradients.
        '''
        constants = []
        results = OrderedDict()
        q0 = self.posterior.feed(x)
        if qk is None:
            qk = q0
        elif not pass_gradients:
            constants.append(qk)

        r = self.init_inference_samples(
            (n_posterior_samples, y.shape[0], self.dim_h))
        h = self.posterior.distribution.step_sample(r, qk[None, :, :])
        py_h = self.conditional.feed(h)

        log_py_h = -self.conditional.neg_log_prob(y[None, :, :], py_h)
        log_ph = -self.prior.neg_log_prob(h)
        log_qh0 = -self.posterior.neg_log_prob(h, q0[None, :, :])
        log_qhk = -self.posterior.neg_log_prob(h, qk[None, :, :])
        prior_entropy = self.prior.entropy()
        q_entropy = self.posterior.entropy(qk)

        # Log marginal
        log_p = log_sum_exp(log_py_h + log_ph - log_qhk,
                            axis=0) - T.log(n_posterior_samples)
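        # importance-weighted estimate of the log marginal:
        #   log p(y|x) ~ log (1/S) * sum_s p(y|h_s) p(h_s) / q_k(h_s),  h_s ~ q_k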

        recon_term = -log_py_h

        # Some prior distributions have a tractable KL divergence.
        if self.prior.has_kl and not reweight and not reweight_gen_only:
            KL_qk_p = self.prior.kl_divergence(qk)
            results['KL(q_k||p)'] = KL_qk_p
            KL_term = KL_qk_p
        else:
            prior_energy = -log_ph
            results['-log p(h)'] = prior_energy.mean()
            KL_term = prior_energy - q_entropy

        # If we pass the gradients we don't want to include the KL(q_k||q_0)
        if not pass_gradients:
            if self.posterior.distribution.has_kl and not reweight and not reweight_gen_only:
                KL_qk_q0 = self.posterior.distribution.step_kl_divergence(
                    qk, *self.posterior.distribution.split_prob(q0))
                results['KL(q_k||q_0)'] = KL_qk_q0
                posterior_term = KL_qk_q0
            else:
                results['-log q(h)'] = -log_qh0.mean()
                posterior_term = -log_qh0
        else:
            posterior_term = T.zeros_like(log_qh0)

        lower_bound = -(recon_term + KL_term).mean()

        w_tilde = get_w_tilde(log_py_h + log_ph - log_qhk)
        results['log ESS'] = T.log(1. / (w_tilde**2).sum(0)).mean()
        if sleep_phase:
            r = self.init_inference_samples(
                (n_posterior_samples, y.shape[0], self.dim_h))
            h_s = self.prior.step_sample(
                r, self.prior.get_prob(*self.prior.get_params()))
            py_h_s = self.conditional.feed(h_s)
            y_s, _ = self.conditional.sample(py_h_s)
            constants.append(y_s)
            q0_s = self.posterior.feed(y_s[0])
            log_qh0 = -self.posterior.neg_log_prob(h_s, q0_s)
            cost = -((w_tilde * (log_py_h + log_ph)).sum(
                (0, 1)) + log_qh0.sum(1).mean(0))
            constants.append(w_tilde)
        elif reweight:
            cost = -(w_tilde * (log_py_h + log_ph + log_qh0)).sum((0, 1))
            constants.append(w_tilde)
        elif reweight_gen_only:
            cost = -((w_tilde * (log_py_h + log_ph)).sum(
                (0, 1)) + log_qh0.sum(1).mean(0))
            constants.append(w_tilde)
        else:
            cost = (recon_term + KL_term + posterior_term).sum(1).mean(0)

        results.update(
            **{
                '-log p(x|h)': recon_term.mean(),
                '-log p(x)': -log_p.mean(0),
                'H(p)': prior_entropy,
                'H(q)': q_entropy.mean(0),
                'lower_bound': lower_bound,
                'cost': cost
            })

        samples = OrderedDict(py=py_h,
                              batch_energies=recon_term,
                              w_tilde=w_tilde)

        return results, samples, constants, theano.OrderedUpdates()
Example #60
0
    def __init__(self, size, a=0.1, b=0.2, c=-65.0, d=2.0):
        self.scheduler = Scheduler(size)
        self.size = size

        v_peak = 30.0
        tau = 0.5

        self.v = v = theano.shared(np.full(size, c, dtype=floatX),
                                   name="v",
                                   borrow=True)
        self.u = u = theano.shared(np.full(size, b * c, dtype=floatX),
                                   name="u",
                                   borrow=True)
        self.I = I = theano.shared(np.zeros(size, dtype=floatX),
                                   name="I",
                                   borrow=True)
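        # Izhikevich neuron dynamics, integrated with Euler steps of size tau:
        #   v' = 0.04 v^2 + 5 v + 140 - u + I
        #   u' = a * (b v - u)
        # with the reset v <- c, u <- u + d whenever v crosses v_peak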

        dv = tau * (0.04 * (v * v) + (v * 5.0) + 140.0 - u + I)
        du = tau * (a * ((b * v) - u))

        now = T.iscalar("now")
        DC = T.vector("DC")
        spikes = T.vector("spikes")
        schedule = T.vector("schedule")

        self.recv = theano.function([DC, schedule],
                                    I,
                                    updates=[(I, I + DC + schedule)])
        self.tick_v = theano.function([], v, updates=[(v, v + dv)])
        self.tick_u = theano.function([], u, updates=[(u, u + du)])
        self.threshold = theano.function([], v >= v_peak)
        self.reset = theano.function([spikes], [v, u, I],
                                     updates=[
                                         (v, T.switch(spikes, c, v)),
                                         (u, T.switch(spikes, u + d, u)),
                                         (I, T.zeros_like(I)),
                                     ])

        window_size = 40
        rate_mul = 1000.0 / window_size

        self.spike_counter = spike_counter = theano.shared(
            np.zeros((window_size, size), dtype=floatX),
            name="spike_counter",
            borrow=True)
        self.rate = rate = theano.shared(np.zeros(size, dtype=floatX),
                                         name="rate",
                                         borrow=True)

        self.count_spikes = theano.function(
            [now, spikes],
            spike_counter,
            updates=[(spike_counter,
                      T.set_subtensor(spike_counter[now % window_size],
                                      spikes))],
            name="count_spikes")

        self.sum_rate = theano.function(
            [],
            rate,
            updates=[(rate, T.sum(spike_counter, axis=0) * rate_mul)])