Example #1
    def backward_V_step(rewards,
                        is_end,
                        next_Vpred,
                        is_tmax,
                        next_Vref,
                        *args #you won't dare delete me
                       ):
        """scan inner computation step, going backwards in time
        params:
        rewards, is_end, next_Vpred, is_tmax - sequences
        next_Vref - recurrent state value for next turn
        
        returns:
            current_Vref - recurrent state value at this turn
            
            current_Vref is computed thus:
                Once every n_steps or at session end:
                    current_Vref = r + gamma*next_Vpred   #computation through next predicted state value
                Otherwise:
                    current_Vref = r + gamma*next_Vref    #recurrent computation through next Qvalue
            
        """

        propagated_Vref = rewards + gamma_or_gammas * next_Vref  # propagates value from actual next action
        optimal_Vref = rewards + gamma_or_gammas * next_Vpred  # uses agent's prediction for next state

        # pick optimal_Vref if is_tmax, else propagate the existing one
        chosen_Vref = T.switch(is_tmax, optimal_Vref, propagated_Vref)

        # at session end, only the immediate reward remains
        this_Vref = T.switch(is_end, rewards, chosen_Vref)

        return this_Vref
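
A minimal usage sketch (assumed, not from the original project): a step function like this is normally driven by theano.scan with go_backwards=True, so the sequences are consumed in reverse time order and next_Vref is threaded through as the recurrent output. The variable names, the scalar gamma_or_gammas, and placing this in the same module as backward_V_step are illustrative assumptions.

import theano
import theano.tensor as T

gamma_or_gammas = 0.99                    # assumed scalar discount; backward_V_step sees it as a module global

rewards_seq    = T.vector('rewards')
is_end_seq     = T.vector('is_end')
next_Vpred_seq = T.vector('next_Vpred')
is_tmax_seq    = T.vector('is_tmax')
V_after_last   = T.scalar('V_after_last')  # bootstrap value for the step after the last one

# scan passes one element of each sequence plus the previous output to the step function
Vref_rev, _ = theano.scan(backward_V_step,
                          sequences=[rewards_seq, is_end_seq, next_Vpred_seq, is_tmax_seq],
                          outputs_info=[V_after_last],
                          go_backwards=True)

# outputs are produced in reverse iteration order; flip back to forward time
get_Vref = theano.function([rewards_seq, is_end_seq, next_Vpred_seq, is_tmax_seq, V_after_last],
                           Vref_rev[::-1])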
Example #2
    def get_gradients(self, model, data, **kwargs):

        cost = self.expr(model=model, data=data, **kwargs)

        params = list(model.get_params())

        grads = T.grad(cost, params, disconnected_inputs='ignore')

        gradients = OrderedDict(izip(params, grads))

        if self.gradient_clipping:
            norm_gs = 0.
            for grad in gradients.values():
                norm_gs += (grad ** 2).sum()
            not_finite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))
            norm_gs = T.sqrt(norm_gs)
            norm_gs = T.switch(T.ge(norm_gs, self.max_magnitude),
                               self.max_magnitude / norm_gs,
                               1.)

            for param, grad in gradients.items():
                gradients[param] = T.switch(not_finite,
                                            .1 * param,
                                            grad * norm_gs)

        updates = OrderedDict()

        return gradients, updates
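
The clipping scheme in isolation, as a self-contained sketch (the toy cost, parameters, and max_magnitude value are assumptions, not part of the original class): gradients are rescaled whenever their global L2 norm exceeds max_magnitude, and replaced by a shrunken copy of the parameters when the norm is not finite.

import numpy as np
import theano
import theano.tensor as T
from collections import OrderedDict

max_magnitude = 5.0                                   # assumed clipping threshold
x = theano.shared(np.float32(3.0), name='x')
y = theano.shared(np.float32(-4.0), name='y')
cost = x ** 2 + y ** 2                                # toy cost

params = [x, y]
grads = T.grad(cost, params)
gradients = OrderedDict(zip(params, grads))

# global L2 norm over all gradients
norm_gs = T.sqrt(sum((g ** 2).sum() for g in gradients.values()))
not_finite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))
scale = T.switch(T.ge(norm_gs, max_magnitude), max_magnitude / norm_gs, 1.)

for p, g in gradients.items():
    # shrink the parameters when the norm blew up, otherwise rescale the gradient
    gradients[p] = T.switch(not_finite, 0.1 * p, g * scale)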
Example #3
        def pass_edges(input_idx_t, edge_t, edge_mask_t, counter_t, h_tm1, c_tm1, x):
            h_t = h_tm1
            c_t = c_tm1
            # select the input vector to use for this edge (source)
            x_t_i = x[input_idx_t, :]
            # zero out the input unless this is a leaf node
            x_t_0 = T.switch(T.eq(T.sum(edge_mask_t), 0), x_t_i, x_t_i*0)
            # concatenate with the input edge vector
            x_t_edge = T.concatenate([x_t_0, edge_t])

            # compute attention weights, using a manual softmax
            attention_scores = T.dot(self.v_a, T.tanh(T.dot(self.W_h_a, h_tm1))) # (1, n_edges)
            # find the max of the unmasked values
            max_score = T.max(attention_scores + edge_mask_t * 10000.0) - 10000.0
            # exponentiate the differences, masking first to avoid inf, and then to keep only relevant scores
            exp_scores = T.exp((attention_scores - max_score) * edge_mask_t) * edge_mask_t
            # take the sum, and add one if the mask is all zeros to avoid an inf
            exp_scores_sum = T.sum(exp_scores) + T.switch(T.eq(T.sum(edge_mask_t), 0), 1.0, 0.0)
            # normalize to compute the weights
            weighted_mask = exp_scores / exp_scores_sum

            i_t = T.nnet.sigmoid(T.dot(x_t_edge, self.W_x_i) + T.sum(T.dot(self.W_h_i.T, (weighted_mask * h_tm1)).T, axis=0) + self.b_h_i)
            f_t = T.nnet.sigmoid(T.dot(x_t_edge, self.W_x_f) + T.sum(T.dot(self.W_h_f.T, (weighted_mask * h_tm1)).T, axis=0) + self.b_h_f)
            o_t = T.nnet.sigmoid(T.dot(x_t_edge, self.W_x_o) + T.sum(T.dot(self.W_h_o.T, (weighted_mask * h_tm1)).T, axis=0) + self.b_h_o)
            u_t = T.tanh(T.dot(x_t_edge, self.W_x_u) + T.sum(T.dot(self.W_h_u.T, (weighted_mask * h_tm1)).T, axis=0) + self.b_h_u)

            c_temp = i_t * u_t + f_t * T.sum((weighted_mask * c_tm1).T, axis=0)
            h_temp = o_t * T.tanh(c_temp)

            h_t = T.set_subtensor(h_t[:, counter_t], h_temp)
            c_t = T.set_subtensor(c_t[:, counter_t], c_temp)
            return h_t, c_t
Example #4
File: render.py Project: zenna/ig
def mindist(translate, min_so_far, ro, rd):
    # ro: 3
    # translate: nbatch * 3
    # min_so_far: nbatch * width * height
    # rd: width * height * 3
    ro = ro + translate
    # d_o = T.dot(rd, ro)   # 640, 480
    # d_o = dotty(rd, ro, axis=1)
    d_o = T.tensordot(rd, ro, axes=[2,1])
    o_o =  T.sum(ro**2,axis=1)
    b = 2*d_o
    c = o_o - 0.001 #FIXME, remove this squaring
    inner = b **2 - 4 * c   # 640 480
    does_not_intersect = inner < 0.0
    minus_b = -b
    # sqrt_inner = T.sqrt(T.maximum(0.0001, inner))
    eps = 1e-9
    background_dist = 10.0
    sqrt_inner = T.sqrt(T.maximum(eps, inner))
    root1 = (minus_b - sqrt_inner)/2.0
    root2 = (minus_b + sqrt_inner)/2.0
    depth = T.switch(does_not_intersect, background_dist,
                        T.switch(root1 > 0, root1,
                        T.switch(root2 > 0, root2, background_dist)))
    return T.min([min_so_far, depth], axis=0)
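
For context (this derivation is not in the original file, but it is what the code implements): with a unit-length ray direction rd and a ray origin ro expressed relative to a sphere of radius r centered at the origin, the intersection distances t solve

    t^2 + 2 (rd . ro) t + (ro . ro - r^2) = 0,  i.e.  t = (-b ± sqrt(b^2 - 4c)) / 2
    with b = 2 (rd . ro),  c = ro . ro - r^2,

which matches b = 2*d_o and c = o_o - 0.001 above (0.001 playing the role of r^2). A negative discriminant (inner < 0) means the ray misses the sphere, and the smaller positive root is taken as the visible hit.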
Example #5
 def backward_V_step(rewards,is_alive,next_Vpred,time_i, 
                     next_Vref,
                     *args):
     
     
     
     propagated_Vref = T.switch(is_alive,
                        rewards + gamma_or_gammas * next_Vref, #assumes optimal next action
                        0.
                       )
     
     if n_steps is None:
         this_Vref = propagated_Vref
     else:
         
         Vref_at_tmax = T.switch(is_alive,
                        rewards + gamma_or_gammas *next_Vpred,
                        0.
                       )
         
         this_Vref = T.switch(T.eq(time_i % n_steps,0), #if Tmax
                                     Vref_at_tmax,  #use special case values
                                     propagated_Vref #else use generic ones
                                )
                                      
                              
     
     
     
     return this_Vref
Example #6
    def __init__(self, random_state=None, low=0.0, high=1.0):
        super(Uniform, self).__init__(low=low, high=high,
                                      random_state=random_state,
                                      optimizer=None)

        # pdf
        self.pdf_ = T.switch(
            T.or_(T.lt(self.X, self.low), T.ge(self.X, self.high)),
            0.,
            1. / (self.high - self.low)).ravel()
        self.make_(self.pdf_, "pdf")

        # -log pdf
        self.nnlf_ = T.switch(
            T.or_(T.lt(self.X, self.low), T.ge(self.X, self.high)),
            np.inf,
            T.log(self.high - self.low)).ravel()
        self.make_(self.nnlf_, "nnlf")

        # cdf
        self.cdf_ = T.switch(
            T.lt(self.X, self.low),
            0.,
            T.switch(
                T.lt(self.X, self.high),
                (self.X - self.low) / (self.high - self.low),
                1.)).ravel()
        self.make_(self.cdf_, "cdf")

        # ppf
        self.ppf_ = self.p * (self.high - self.low) + self.low
        self.make_(self.ppf_, "ppf", args=[self.p])
Example #7
 def s_logprior(self, s_params, strength=10.0):
     # -- I don't know what distribution this would be
     #    but I think it makes a nice shape
     s_alpha, s_cond_x, s_cond_y = self.unpack(s_params)
     n_alpha_min = self._alpha_from_l(self._lenscales_min)
     n_alpha_max = self._alpha_from_l(self._lenscales_max)
     #return strength * (alpha - alpha_min) ** 2
     log0 = -10000
     width = n_alpha_max - n_alpha_min
     #alpha_mean = 0.5 * (alpha_max + alpha_min)
     energy = strength * 0.5 * (s_alpha - n_alpha_max) ** 2 / width ** 2
     lenscale_logprior = TT.switch(s_alpha < n_alpha_min,
                                   log0,
                                   TT.switch(s_alpha < n_alpha_max,
                                             -energy,
                                             log0)).sum()
     if self._conditional:
         diff_x = s_cond_x
         diff_y = s_cond_y - 1
         rval = (lenscale_logprior
                 + TT.dot(diff_x, diff_x)
                 + TT.dot(diff_y, diff_y))
     else:
         rval = lenscale_logprior
     assert rval.ndim == 0
     return rval
Example #8
def T_subspacel1_slow_shrinkage(a,L,lam_sparse,lam_slow,small_value=.001):
    amp = T.sqrt(a[::2,:]**2 + a[1::2,:]**2 + small_value)
    #damp = amp[:,1:] - amp[:,:-1]

    # compose slow shrinkage with subspace l1 shrinkage

    # slow shrinkage
    div = T.zeros_like(amp)
    d1 = amp[:,1:] - amp[:,:-1]
    d2 = d1[:,1:] - d1[:,:-1]
    div = T.set_subtensor(div[:,1:-1],-d2)
    div = T.set_subtensor(div[:,0], -d1[:,0])
    div = T.set_subtensor(div[:,-1], d1[:,-1])
    slow_amp_shrinkage = 1 - (lam_slow/L)*(div/amp)
    slow_amp_value = T.switch(T.gt(slow_amp_shrinkage,0),slow_amp_shrinkage,0)
    slow_shrinkage_prox_a = slow_amp_value*a[::2,:]
    slow_shrinkage_prox_b = slow_amp_value*a[1::2,:]

    # subspace l1 shrinkage
    amp_slow_shrinkage_prox = T.sqrt(slow_shrinkage_prox_a**2 + slow_shrinkage_prox_b**2)
    #amp_shrinkage = 1. - (lam_slow*lam_sparse/L)*amp_slow_shrinkage_prox
    amp_shrinkage = 1. - (lam_sparse/L)/amp_slow_shrinkage_prox
    amp_value = T.switch(T.gt(amp_shrinkage,0.),amp_shrinkage,0.)
    subspacel1_prox = T.zeros_like(a)
    subspacel1_prox = T.set_subtensor(subspacel1_prox[ ::2,:],amp_value*slow_shrinkage_prox_a)
    subspacel1_prox = T.set_subtensor(subspacel1_prox[1::2,:],amp_value*slow_shrinkage_prox_b)
    return subspacel1_prox
Example #9
	def create_cost_fun (self):

		# create a cost function that
		# takes each prediction at every timestep
		# and guesses next timestep's value:
		what_to_predict = self.input_mat[:, 1:]
		# because some sentences are shorter, we
		# place masks where the sentences end:
		# (for_how_long is zero-indexed, e.g. an example spanning `[2,3)`
		# has this value set to 0, hence the subtraction of 1):
		for_how_long = self.for_how_long - 1
		# all sentences start at T=0:
		starting_when = T.zeros_like(self.for_how_long)
								 
		self.lstm_cost = masked_loss(self.lstm_predictions,
								what_to_predict,
								for_how_long,
								starting_when).sum()

		zero_entropy = T.zeros_like(self.entropy)
		real_entropy = T.switch(self.mask_matrix,self.entropy,zero_entropy)
		zero_key_entropy = T.zeros_like(self.key_entropy)
		real_key_entropy = T.switch(self.mask_matrix,self.key_entropy,zero_key_entropy)

		self.final_cost = masked_loss(self.final_predictions,
								what_to_predict,
								for_how_long,
								starting_when).sum()+self.entropy_reg*real_entropy.sum()+self.key_entropy_reg*real_key_entropy.sum()
Example #10
    def train_fprop(self, X, wts=None, bs=None):
        ''' Performs forward propagation for training, which can differ from
        the vanilla fprop used for testing, due to extra bells and whistles
        such as dropout, corruption, etc.'''

        if wts is None and bs is None:
            wts = self.wts_
            bs = self.bs_

        if 'dropout' in self.loss_terms:
            input_p = self.loss_params['input_p']
            hidden_p = self.loss_params['hidden_p']

            # compute the first activation separately in case we have no hidden
            # layer;
            act = self.activs[0](
                T.dot(self.dropout(X, input_p), wts[0]) + bs[0])
            if len(wts) > 1:  # len(wts) = 1 corresponds to softmax regression
                for i, (w, b, activ) in enumerate(zip(wts[1:], bs[1:], self.activs[1:])):
                    act = activ(T.dot(self.dropout(act, hidden_p), w) + b)

            eps = 1e-6
            act = T.switch(act < eps, eps, act)
            act = T.switch(act > (1. - eps), (1. - eps), act)

            return act
        else:
            return self.fprop(X, wts, bs)
Example #11
def T_subspacel1_slow_shrinkage_conv(a, L, lam_sparse, lam_slow, imshp,kshp,featshp,stride=(1,1),small_value=.001):
    featshp = (imshp[0],kshp[0],featshp[2],featshp[3]) # num images, features, szy, szx
    features = T.reshape(T.transpose(a),featshp,ndim=4)

    amp = T.sqrt(features[:,::2,:,:]**2 + features[:,1::2,:,:]**2 + small_value)
    #damp = amp[:,1:] - amp[:,:-1]

    # compose slow shrinkage with subspace l1 shrinkage

    # slow shrinkage
    div = T.zeros_like(amp)
    d1 = amp[1:,:,:,:] - amp[:-1,:,:,:]
    d2 = d1[1:,:,:,:] - d1[:-1,:,:,:]
    div = T.set_subtensor(div[1:-1,:,:,:], -d2)
    div = T.set_subtensor(div[0,:,:,:], -d1[0,:,:,:])
    div = T.set_subtensor(div[-1,:,:,:], d1[-1,:,:,:])
    slow_amp_shrinkage = 1 - (lam_slow / L) * (div / amp)
    slow_amp_value = T.switch(T.gt(slow_amp_shrinkage, 0), slow_amp_shrinkage, 0)
    slow_shrinkage_prox_a = slow_amp_value * features[:, ::2, :,:]
    slow_shrinkage_prox_b = slow_amp_value * features[:,1::2, :,:]

    # subspace l1 shrinkage
    amp_slow_shrinkage_prox = T.sqrt(slow_shrinkage_prox_a ** 2 + slow_shrinkage_prox_b ** 2)
    #amp_shrinkage = 1. - (lam_slow*lam_sparse/L)*amp_slow_shrinkage_prox
    amp_shrinkage = 1. - (lam_sparse / L) / amp_slow_shrinkage_prox
    amp_value = T.switch(T.gt(amp_shrinkage, 0.), amp_shrinkage, 0.)
    subspacel1_prox = T.zeros_like(features)
    subspacel1_prox = T.set_subtensor(subspacel1_prox[:, ::2, :,:], amp_value * slow_shrinkage_prox_a)
    subspacel1_prox = T.set_subtensor(subspacel1_prox[:,1::2, :,:], amp_value * slow_shrinkage_prox_b)

    reshape_subspacel1_prox = T.transpose(T.reshape(subspacel1_prox,(featshp[0],featshp[1]*featshp[2]*featshp[3]),ndim=2))
    return reshape_subspacel1_prox
Example #12
def castray(ro, rd, shape_params, nprims, width, height):
    tmin = 1.0
    tmax = 20.0
    precis = 0.002
    m = -1.0
    # There is a sequence of distances d1, d2, ..., dn,
    # and the accumulated distances d1, d1+d2, d1+d2+d3, ...
    # What we actually want in the output is, for each ray, the distance to the surface,
    # i.e. something like 0, 20, 25, 27, 28, 28, 28, 28, 28

    max_num_steps = 25

    # distcolors = map(ro + rd * 0, width, height) #FIXME, reshape instead of mul by 0
    distcolors = mapedit(ro + rd * 0, shape_params, nprims, width, height)
    dists = distcolors
    steps = T.switch(dists < precis, T.zeros_like(dists), T.ones_like(dists))
    accum_dists = T.reshape(dists, (width, height, 1))

    for i in range(max_num_steps - 1):
        # distcolors = map(ro + rd * accum_dists, width, height) #FIXME, reshape instead of mul by 0
        distcolors = mapedit(ro + rd * accum_dists, shape_params, nprims, width, height) #FIXME, reshape instead of mul by 0
        dists = distcolors
        steps = steps + T.switch(dists < precis, T.zeros_like(dists), T.ones_like(dists))
        accum_dists = accum_dists + T.reshape(dists, (width, height, 1))

    last_depth = T.reshape(accum_dists, (width, height))
    depthmap = T.switch(last_depth < tmax, last_depth / tmax, T.zeros_like(last_depth))
    color = 1.0 - steps / float(max_num_steps)
    # Distance marched along ray and delta between last two steps
    return depthmap
Example #13
    def mcmc(ll, *frvs):
        full_observations = dict(observations)
        full_observations.update(dict([(rv, s) for rv, s in zip(free_RVs, frvs)]))
        
        loglik = -full_log_likelihood(full_observations)

        proposals = free_RVs_prop
        H = tensor.add(*[tensor.sum(tensor.sqr(p)) for p in proposals])/2. + loglik

# -- this should be an inner loop
        g = []
        g.append(tensor.grad(loglik, frvs))
        
        proposals = [(p - epsilon*gg[0]/2.) for p, gg in zip(proposals, g)]

        rvsp = [(rvs + epsilon*rvp) for rvs,rvp in zip(frvs, proposals)]
        
        full_observations = dict(observations)
        full_observations.update(dict([(rv, s) for rv, s in zip(free_RVs, rvsp)]))
        new_loglik = -full_log_likelihood(full_observations)
        
        gnew = []
        gnew.append(tensor.grad(new_loglik, rvsp))
        proposals = [(p - epsilon*gn[0]/2.) for p, gn in zip(proposals, gnew)]
# --
        
        Hnew = tensor.add(*[tensor.sum(tensor.sqr(p)) for p in proposals])/2. + new_loglik

        dH = Hnew - H
        accept = tensor.or_(dH < 0., U < tensor.exp(-dH))

        return [tensor.switch(accept, -new_loglik, ll)] + \
            [tensor.switch(accept, p, f) for p, f in zip(rvsp, frvs)], \
            {}, theano.scan_module.until(accept)
Example #14
    def convert_method(self, method_string):

        if method_string == 'sigmoid':
            return Tensor.nnet.sigmoid

        elif method_string == 'tanh':
            return Tensor.tanh

        elif method_string == 'scaled_tanh':
            return lambda x: 1.7159 * Tensor.tanh(0.66 * x)

        elif method_string == 'soft_sigmoid':
            return soft_sigmoid

        elif method_string == 'relu':
            return lambda x: x * (x > 0)

        elif method_string == 'relu2':
            return lambda x: Tensor.switch(Tensor.lt(x, -1), -1, x) * Tensor.switch(Tensor.gt(x, 1), 1, x) / x

        elif method_string == 'leakyrelu':
            return lambda x: x * (x > 0) + 0.01 * x * (x < 0)

        elif method_string == 'shiftedrelu':
            return lambda x: x * (x > -1)

        elif method_string == 'hard_sigmoid':
            return Tensor.nnet.hard_sigmoid

        elif method_string == 'none':
            return lambda x: x

        else:
            raise Exception('method unknown')
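
A small usage sketch (assumed, not part of the original class) compiling the 'relu2' expression above: the two Tensor.switch factors clamp values to [-1, 1], though the division by x leaves the expression undefined at exactly x = 0.

import numpy as np
import theano
import theano.tensor as Tensor

x = Tensor.vector('x')
relu2 = Tensor.switch(Tensor.lt(x, -1), -1, x) * Tensor.switch(Tensor.gt(x, 1), 1, x) / x
clip_fn = theano.function([x], relu2)

print(clip_fn(np.array([-2.0, -0.5, 0.5, 2.0], dtype=theano.config.floatX)))
# -> approximately [-1.  -0.5  0.5  1. ]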
Example #15
    def out_shape(imgshape, ds, ignore_border=False):
        """Return the shape of the output from this op, for input of given shape and flags.

        :param imgshape: the shape of a tensor of images. The last two elements are interpreted
        as the number of rows, and the number of cols.
        :type imgshape: tuple, list, or similar of integer or
        scalar Theano variable.

        :param ds: downsample factor over rows and columns
        :type ds: list or tuple of two ints

        :param ignore_border: if ds doesn't divide imgshape, do we include an extra row/col of
        partial downsampling (False) or ignore it (True).
        :type ignore_border: bool

        :rtype: list
        :returns: the shape of the output from this op, for input of given shape.  This will
        have the same length as imgshape, but with last two elements reduced as per the
        downsampling & ignore_border flags.
        """
        if len(imgshape) < 2:
            raise TypeError("imgshape must have at least two elements (rows, cols)")
        r, c = imgshape[-2:]
        rval = list(imgshape[:-2]) + [r // ds[0], c // ds[1]]

        if not ignore_border:
            if isinstance(r, theano.Variable):
                rval[-2] = tensor.switch(r % ds[0], rval[-2] + 1, rval[-2])
            elif r % ds[0]:
                rval[-2] += 1
            if isinstance(c, theano.Variable):
                rval[-1] = tensor.switch(c % ds[1], rval[-1] + 1, rval[-1])
            elif c % ds[1]:
                rval[-1] += 1
        return rval
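
A concrete example (assumed values, calling the helper as a plain function for illustration): for a (3, 5, 5) input downsampled with ds=(2, 2), the 5 % 2 remainder adds one extra output row and column unless ignore_border is set.

out_shape((3, 5, 5), (2, 2), ignore_border=False)  # -> [3, 3, 3]
out_shape((3, 5, 5), (2, 2), ignore_border=True)   # -> [3, 2, 2]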
Example #16
def multiple_switch(*args):
    """
    .. todo::

        WRITEME properly

    Applies a cascade of ifelse. The output will be a Theano expression
    which evaluates:

    .. code-block:: none

        if args0:
            then arg1
        elif arg2:
            then arg3
        elif arg4:
            then arg5
        ....
    """
    if len(args) == 3:
        return T.switch(*args)
    else:
        return T.switch(args[0],
                        args[1],
                        multiple_switch(*args[2:]))
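
A short usage example (assumed): building a three-way sign function from the cascade.

import theano
import theano.tensor as T

x = T.scalar('x')
sign_expr = multiple_switch(x < 0, -1.0,   # if x < 0:   -1
                            x > 0,  1.0,   # elif x > 0:  1
                            0.0)           # else:        0
sign = theano.function([x], sign_expr)
print(sign(-3.0), sign(0.0), sign(5.0))    # -1.0 0.0 1.0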
Example #17
    def theano_sentence_prediction(self, Sentence, Chars, WordLengths):

        input_lstm_res_f = self.input_lstm_forward_layer.function(Sentence, Chars, WordLengths)
        input_lstm_res_b = self.input_lstm_backward_layer.function(Sentence, Chars, WordLengths)
        input_combined = T.concatenate((input_lstm_res_f, input_lstm_res_b), axis=1)

        #Make pairwise features. This is really just "tensor product with concatenation instead of multiplication". Is there a command for that?
        full_matrix, _ = theano.scan(fn=self.__pairwise_features,
                                  outputs_info=None,
                                  sequences=input_combined,
                                  non_sequences=[input_combined, Sentence.shape[0]])

        if len(self.lstm_layers) > 0 and self.lstm_layers[0].training:
            srng = RandomStreams(seed=12345)
            full_matrix = T.switch(srng.binomial(size=(Sentence.shape[0], Sentence.shape[0]+1, self.hidden_dimension*4), p=0.5), full_matrix, 0)
        else:
            full_matrix = 0.5 * full_matrix

        full_matrix = self.transition_layer.function(full_matrix)
            
        for layer in self.lstm_layers:
            if layer.training:
                print("hah-train")
                full_matrix = T.switch(srng.binomial(size=(Sentence.shape[0], Sentence.shape[0]+1, self.hidden_dimension*4), p=0.5), full_matrix, 0)
            else:
                print("heh-notrain")
                full_matrix = 0.5 * full_matrix
            
            
            full_matrix = layer.function(full_matrix)
        
        final_matrix = self.output_convolution.function(full_matrix)

        return T.nnet.softmax(final_matrix)
Example #18
def hmc_updates(positions, stepsize, avg_acceptance_rate, final_pos, accept,
                target_acceptance_rate, stepsize_inc, stepsize_dec,
                stepsize_min, stepsize_max, avg_acceptance_slowness):
    
  
    # broadcast `accept` scalar to tensor with the same dimensions as final_pos.
    accept_matrix = accept.dimshuffle(0, *(('x',) * (final_pos.ndim - 1)))

    # if accept is True, update to `final_pos` else stay put
    new_positions = TT.switch(accept_matrix, final_pos, positions)

   
    ## STEPSIZE UPDATES ##
    # if acceptance rate is too low, our sampler is too "noisy" and we reduce
    # the stepsize. If it is too high, our sampler is too conservative, we can
    # get away with a larger stepsize (resulting in better mixing).
    _new_stepsize = TT.switch(avg_acceptance_rate > target_acceptance_rate,
                              stepsize * stepsize_inc, stepsize * stepsize_dec)
                              
    # maintain stepsize in [stepsize_min, stepsize_max]
    new_stepsize = TT.clip(_new_stepsize, stepsize_min, stepsize_max)

   
    # perform exponential moving average
    mean_dtype = theano.scalar.upcast(accept.dtype, avg_acceptance_rate.dtype)
    new_acceptance_rate = TT.add(
        avg_acceptance_slowness * avg_acceptance_rate,
        (1.0 - avg_acceptance_slowness) * accept.mean(dtype=mean_dtype))

    return [(positions, new_positions), (stepsize, new_stepsize), (avg_acceptance_rate, new_acceptance_rate)]
Example #19
 def step(self, inputs, inputs_mask, h_tm1, c_tm1, h_mask, *non_sequences):
     if self.gate.use_attention:
         # attended is the
         #   src_sequence_length x batch_size x attention_dims
         # matrix which we have attention on.
         #
         # attended_dot_u is the h_t-independent part of the final
         # attention vectors, which is precomputed for efficiency.
         #
         # attention_mask is a binary mask over the valid elements of
         # attended, which in practice is the same as the mask passed to
         # the encoder that created attended. Size
         #   src_sequence_length x batch_size
         h_t, c_t, attention = self.gate(
                 inputs, h_tm1 * h_mask.astype(theano.config.floatX), c_tm1,
                 attended=non_sequences[0],
                 attended_dot_u=non_sequences[1],
                 attention_mask=non_sequences[2])
         return (T.switch(inputs_mask.dimshuffle(0, 'x'), h_t, h_tm1),
                 T.switch(inputs_mask.dimshuffle(0, 'x'), c_t, c_tm1),
                 attention)
     else:
         h_t, c_t = self.gate(
                 inputs, h_tm1 * h_mask.astype(theano.config.floatX), c_tm1)
         return (T.switch(inputs_mask.dimshuffle(0, 'x'), h_t, h_tm1),
                 T.switch(inputs_mask.dimshuffle(0, 'x'), c_t, c_tm1))
Example #20
    def cd_updates(self):
        """
        Return a dictionary of shared variable updates that implements contrastive divergence
        learning by stochastic gradient descent with an annealed learning rate.
        """

        ups = {}

        if self.persistent_chains:
            grads = self.contrastive_grads()
            ups.update(dict(self.sampler.updates()))
        else:
            cd1_sampler, final_p, cd1_updates = self.rbm.CD1_sampler(self.visible_batch,
                    self.batchsize)
            self._last_cd1_sampler = cd1_sampler # hacked in here for the unit test
            #ignore the cd1_sampler
            grads = self.contrastive_grads(neg_v = final_p)
            ups.update(dict(cd1_updates))


        # contrastive divergence updates
        # TODO: sgd_updates is a particular optimization algo (others are possible)
        #       parametrize so that algo is plugin
        #       the normalization normVF might be sgd-specific though...

        # TODO: when sgd has an annealing schedule, this should
        #       go through that mechanism.

        lr = TT.clip(
                self.learn_rate * TT.cast(self.lr_anneal_start / (self.iter+1), floatX),
                0.0, #min
                self.learn_rate) #max

        ups.update(dict(sgd_updates(
                    self.rbm.params(),
                    grads,
                    stepsizes=[a*lr for a in self.learn_rate_multipliers])))

        ups[self.iter] = self.iter + 1


        # add trainer updates (replace CD update of U)
        ups[self.rbm.U], ups[self.normVF] = self.normalize_U(ups[self.rbm.U])

        #l1_updates:
        if (self.l1_penalty_start > 0) and (self.l1_penalty != 0.0):
            ups[self.effective_l1_penalty] = TT.switch(
                    self.iter >= self.l1_penalty_start,
                    self.l1_penalty,
                    0.0)

        if getattr(self,'p_lr', None):
            ups[self.p_lr] = TT.switch(self.iter > self.p_training_start,
                    self.p_training_lr,
                    0)
            new_P = ups[self.rbm.P] * self.p_mask
            no_pos_P = TT.switch(new_P<0, new_P, 0)
            ups[self.rbm.P] = - no_pos_P / no_pos_P.sum(axis=0)  # normalize so that columns sum to 1

        return ups
Example #21
    def __call__(self, input_):
        m = input_.mean()
        v = input_.std()

        new_m = T.switch(T.eq(self.m, 0.),
                         m,
                         (np.float32(1.) - self.rate) * self.m + self.rate * m)
        new_var = T.switch(T.eq(self.var, 0.),
                           v,
                           (np.float32(1.) - self.rate) * self.var + self.rate * v)

        updates = [(self.m, new_m), (self.var, new_var)]

        input_centered = (
            (input_ - new_m) / T.maximum(1., T.sqrt(new_var)))

        input_ = T.zeros_like(input_) + input_

        outs = OrderedDict(
            x=input_,
            x_centered=input_centered,
            m=new_m,
            var=new_var
        )
        return outs, updates
Example #22
    def compute_updates(self, training_cost, params):
        updates = []
         
        grads = T.grad(training_cost, params)
        grads = OrderedDict(zip(params, grads))
        
        # Clip stuff
        c = numpy.float32(self.cutoff)
        clip_grads = []
        
        norm_gs = T.sqrt(sum(T.sum(g ** 2) for p, g in grads.items()))
        normalization = T.switch(T.ge(norm_gs, c), c / norm_gs, np.float32(1.))
        notfinite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))
         
        for p, g in grads.items():
            clip_grads.append((p, T.switch(notfinite, numpy.float32(.1) * p, g * normalization)))
        
        grads = OrderedDict(clip_grads)

        if self.updater == 'adagrad':
            updates = Adagrad(grads, self.lr)  
        elif self.updater == 'sgd':
            raise Exception("Sgd not implemented!")
        elif self.updater == 'adadelta':
            updates = Adadelta(grads)
        elif self.updater == 'rmsprop':
            updates = RMSProp(grads, self.lr)
        elif self.updater == 'adam':
            updates = Adam(grads)
        else:
            raise Exception("Updater not understood!") 
        return updates
Example #23
    def _activation(self, Y, L, M, W):
        """Returns the activation for a given input.

        Derived from the generative model formulation of hierarchical
        Poisson mixtures, the formular for the activation in the network
        reads as follows:
        I_c =
         \sum_d \log(W_{cd})y_d + \log(M_{lc})        for labeled data
         \sum_d \log(W_{cd})y_d + \log(\sum_k M_{kc}) for unlabeled data
        s_c = softmax(I_c)
        """
        # first: complete inference to find label
        # Input integration:
        I = T.tensordot(Y,T.log(W),axes=[1,1])
        # recurrent term:
        vM = M[L]
        L_index = T.eq(L,-1).nonzero()
        vM = T.set_subtensor(vM[L_index], T.sum(M, axis=0))
        # numeric trick to prevent overflow in the exp-function
        max_exponent = 86. - T.ceil(T.log(I.shape[1].astype('float32')))
        scale = T.switch(
            T.gt(T.max(I, axis=1, keepdims=True), max_exponent),
            T.max(I, axis=1, keepdims=True) - max_exponent,
            0.)
        # numeric approximation to prevent underflow in the exp-function:
        # map too low values of I to a fixed minimum value
        min_exponent = -87. + T.ceil(T.log(I.shape[1].astype('float32')))
        I = T.switch(
            T.lt(I-scale, min_exponent),
            scale+min_exponent,
            I)
        # activation: recurrent softmax with overflow protection
        s = vM*T.exp(I-scale)/T.sum(vM*T.exp(I-scale), axis=1, keepdims=True)
        return s
Example #24
def adamgc(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8, max_magnitude=5.0, infDecay=0.1):
    updates = []
    grads = T.grad(cost, params)
    
    norm = norm_gs(params, grads)
    sqrtnorm = T.sqrt(norm)
    not_finite = T.or_(T.isnan(sqrtnorm), T.isinf(sqrtnorm))
    adj_norm_gs = T.switch(T.ge(sqrtnorm, max_magnitude), max_magnitude / sqrtnorm, 1.)

    i = shared(floatX(0.))
    i_t = i + 1.
    fix1 = 1. - (1. - b1)**i_t
    fix2 = 1. - (1. - b2)**i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)
    for p, g in zip(params, grads):
        g = T.switch(not_finite, infDecay * p, g * adj_norm_gs)
        m = shared(p.get_value() * 0.)
        v = shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m) 
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))
    return updates, norm
Example #25
def _get_targets(y, log_y_hat, y_mask, y_hat_mask):
    '''
    Returns the target values according to the CTC cost with respect to y_hat.
    Note that this is part of the gradient with respect to the softmax output
    and not with respect to the input of the original softmax function.
    All computations are done in log scale
    '''
    num_classes = log_y_hat.shape[2] - 1
    blanked_y, blanked_y_mask = _add_blanks(
        y=y,
        blank_symbol=num_classes,
        y_mask=y_mask)

    log_alpha, log_beta = _log_forward_backward(blanked_y,
                                                log_y_hat, blanked_y_mask,
                                                y_hat_mask, num_classes)
    # explicitly not using a mask to prevent inf - inf
    y_prob = _class_batch_to_labeling_batch(blanked_y, log_y_hat,
                                            y_hat_mask=None)
    marginals = log_alpha + log_beta - y_prob
    max_marg = marginals.max(2)
    max_marg = T.switch(T.le(max_marg, -np.inf), 0, max_marg)
    log_Z = T.log(T.exp(marginals - max_marg[:,:, None]).sum(2))
    log_Z = log_Z + max_marg
    log_Z = T.switch(T.le(log_Z, -np.inf), 0, log_Z)
    targets = _labeling_batch_to_class_batch(blanked_y,
                                             T.exp(marginals -
                                                   log_Z[:,:, None]),
                                             num_classes + 1)
    return targets
Example #26
    def dlogp(inputs, gradients):
        g_logp, = gradients
        cov, delta = inputs

        g_logp.tag.test_value = floatX(1.)
        n, k = delta.shape

        chol_cov = cholesky(cov)
        diag = tt.nlinalg.diag(chol_cov)
        ok = tt.all(diag > 0)

        chol_cov = tt.switch(ok, chol_cov, tt.fill(chol_cov, 1))
        delta_trans = solve_lower(chol_cov, delta.T).T

        inner = n * tt.eye(k) - tt.dot(delta_trans.T, delta_trans)
        g_cov = solve_upper(chol_cov.T, inner)
        g_cov = solve_upper(chol_cov.T, g_cov.T)

        tau_delta = solve_upper(chol_cov.T, delta_trans.T)
        g_delta = tau_delta.T

        g_cov = tt.switch(ok, g_cov, -np.nan)
        g_delta = tt.switch(ok, g_delta, -np.nan)

        return [-0.5 * g_cov * g_logp, -g_delta * g_logp]
Example #27
 def __init__(self, rng, f='ReLU', g=lambda x: x, params=None):
     if   f == 'ReLU':
         if hasattr(T.nnet, 'relu'):
             self.f = T.nnet.relu
         else:
             self.f = lambda x: T.switch(x<0,0,x)
         self.g = lambda x: x
     elif f == 'PReLU':
         # Avoids dying ReLU units
         if hasattr(T.nnet, 'relu'):
             self.f = lambda x: T.nnet.relu(x, alpha=0.01)
         else:
             self.f = lambda x: T.switch(x<=0, 0.01*x, x)
         self.g = lambda x: x
     elif f == 'tanh':
         self.f = T.tanh
         self.g = T.arctanh
     elif f == 'sigmoid':
         self.f = T.nnet.sigmoid
         self.g = lambda x: x
     elif f == 'softmax':
         self.f = T.nnet.softmax
         self.g = lambda x: x
     elif f == 'softplus':
         self.f = T.nnet.softplus
         self.g = lambda x: x
     elif f == 'identity':
         self.f = lambda x: x
         self.g = lambda x: x
     else:
         self.f = f
         self.g = g
     
     self.params = [] if params is None else params
Example #28
def mixture_model(random_seed=1234):
    """Sample mixture model to use in benchmarks"""
    np.random.seed(random_seed)
    size = 1000
    w_true = np.array([0.35, 0.4, 0.25])
    mu_true = np.array([0., 2., 5.])
    sigma = np.array([0.5, 0.5, 1.])
    component = np.random.choice(mu_true.size, size=size, p=w_true)
    x = np.random.normal(mu_true[component], sigma[component], size=size)

    with pm.Model() as model:
        w = pm.Dirichlet('w', a=np.ones_like(w_true))
        mu = pm.Normal('mu', mu=0., sd=10., shape=w_true.shape)
        enforce_order = pm.Potential('enforce_order', tt.switch(mu[0] - mu[1] <= 0, 0., -np.inf) +
                                                      tt.switch(mu[1] - mu[2] <= 0, 0., -np.inf))
        tau = pm.Gamma('tau', alpha=1., beta=1., shape=w_true.shape)
        pm.NormalMixture('x_obs', w=w, mu=mu, tau=tau, observed=x)

    # Initialization can be poorly specified, this is a hack to make it work
    start = {
        'mu': mu_true.copy(),
        'tau_log__': np.log(1. / sigma**2),
        'w_stickbreaking__': np.array([-0.03,  0.44])
    }
    return model, start
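
Typical use of the returned pair (an assumed example; the sampler settings are illustrative): sampling is started from the hand-specified start point so the ordered-means potential does not reject the initial state.

import pymc3 as pm

model, start = mixture_model()
with model:
    trace = pm.sample(1000, start=start, random_seed=1234)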
Example #29
def theano_digitize(x, bins):
    """
    Equivalent to numpy digitize.

    Parameters
    ----------
    x : Theano tensor or array_like
        The array or matrix to be digitized
    bins : array_like
        The bins with which x should be digitized

    Returns
    -------
    A Theano tensor
        The indices of the bins to which each value in input array belongs.
    """
    binned = T.zeros_like(x) + len(bins)
    for i in range(len(bins)):
        bin=bins[i]
        if i == 0:
            binned=T.switch(T.lt(x,bin),i,binned)
        else:
            ineq = T.and_(T.ge(x,bins[i-1]),T.lt(x,bin))
            binned=T.switch(ineq,i,binned)
    binned=T.switch(T.isnan(x), len(bins), binned)
    return binned
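
A small check against numpy (assumed usage); note that the result comes back in x's float dtype rather than as integers.

import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
bins = [0.0, 1.0, 2.0]
digitize = theano.function([x], theano_digitize(x, bins))

data = np.array([-0.5, 0.5, 1.5, 2.5], dtype=theano.config.floatX)
print(digitize(data))              # [ 0.  1.  2.  3.]
print(np.digitize(data, bins))     # [0 1 2 3]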
Example #30
def tnormal_icdf(size, avg, std, lbound, ubound, theano_rng, dtype):
    """
    Alternative Method:
    sample = -Phi_inv(Phi(-lbound)*(1-u) + Phi(-ubound)*u)
    """

    def Phi(x):
        erfarg = (x - avg) / (std * SQRT2)
        rval = 0.5 * (1. + T.erf(erfarg))
        return rval.astype(dtype)
    
    def Phi_inv(y, eps=3e-8):
        """ eps was calibrated for cublas.erfinv using float32 """
        temp = 2. * y - 1.
        erfinv_input = T.clip(temp, -1+eps, 1-eps)
        rval = avg + std * SQRT2 * T.erfinv(erfinv_input)
        return rval.astype(dtype)

    # center lower and upper bounds based on mean
    u = theano_rng.uniform(size=size, dtype=dtype)

    # Inverse CDF method. When method becomes numerically unstable, we simply
    # return the bounds based on whether avg < lbound, or ubound < avg.
    cdf_range = Phi(ubound) - Phi(lbound)
    sample = T.switch(
                T.or_(
                    T.lt(cdf_range, 3e-8),
                    T.gt(cdf_range, 1-3e-8)),
                T.switch(
                    T.lt(avg, lbound),
                    lbound,
                    ubound),
                Phi_inv(Phi(lbound) + u * cdf_range))

    return sample
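
An assumed usage sketch: SQRT2 is a module-level constant of the original file (just sqrt(2)), any Theano random stream with a uniform method can serve as theano_rng, and the sketch presumes it lives in the same module as tnormal_icdf.

import numpy as np
import theano
from theano.sandbox.rng_mrg import MRG_RandomStreams

SQRT2 = np.sqrt(2.0)                      # assumed to be the constant used above
theano_rng = MRG_RandomStreams(seed=123)

sample = tnormal_icdf(size=(5,), avg=0.0, std=1.0, lbound=-1.0, ubound=1.0,
                      theano_rng=theano_rng, dtype=theano.config.floatX)
draw = theano.function([], sample)
print(draw())                             # five draws, all inside [-1, 1]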
Example #31
 def __call__(self, x):
     dropped_units = self.rng.binomial(
         n=1,
         p=self.dropout_rate,
         size=x.shape if self.shape is None else self.shape)
     return tt.switch(dropped_units, 0, x)
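
A standalone sketch (assumed; self.rng, self.dropout_rate and self.shape are attributes of the original class). Note that units are zeroed wherever the binomial draw is 1, so dropout_rate here is the probability of dropping a unit, and no rescaling is applied.

import numpy as np
import theano
import theano.tensor as tt
from theano.tensor.shared_randomstreams import RandomStreams

rng = RandomStreams(seed=42)
x = tt.vector('x')
dropout_rate = 0.5

dropped_units = rng.binomial(n=1, p=dropout_rate, size=x.shape)
y = tt.switch(dropped_units, 0, x)

apply_dropout = theano.function([x], y)
print(apply_dropout(np.ones(8, dtype=theano.config.floatX)))  # roughly half the entries zeroed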
Example #32
    def build(self,
              dropout,
              char_dim,
              char_lstm_dim,
              char_bidirect,
              word_dim,
              word_lstm_dim,
              word_bidirect,
              lr_method,
              pre_emb,
              crf,
              cap_dim,
              training=True,
              **kwargs):
        """
        Build the network.
        """
        # Training parameters
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_tag)

        # Number of capitalization features
        if cap_dim:
            n_cap = 4

        # Network variables
        is_train = T.iscalar('is_train')
        word_ids = T.ivector(name='word_ids')
        char_for_ids = T.imatrix(name='char_for_ids')
        char_rev_ids = T.imatrix(name='char_rev_ids')
        char_pos_ids = T.ivector(name='char_pos_ids')
        tag_ids = T.ivector(name='tag_ids')
        if cap_dim:
            cap_ids = T.ivector(name='cap_ids')

        # Sentence length
        s_len = (word_ids if word_dim else char_pos_ids).shape[0]

        # Final input (all word features)
        input_dim = 0
        inputs = []

        #
        # Word inputs
        #
        if word_dim:
            input_dim += word_dim
            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
            word_input = word_layer.link(word_ids)
            inputs.append(word_input)
            # Initialize with pretrained embeddings
            if pre_emb and training:
                new_weights = word_layer.embeddings.get_value()
                print 'Loading pretrained embeddings from %s...' % pre_emb
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')):
                    line = line.rstrip().split()
                    if len(line) == word_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print 'WARNING: %i invalid lines' % emb_invalid
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization
                for i in xrange(n_words):
                    word = self.id_to_word[i]
                    if word in pretrained:
                        new_weights[i] = pretrained[word]
                        c_found += 1
                    elif word.lower() in pretrained:
                        new_weights[i] = pretrained[word.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', word.lower()) in pretrained:
                        new_weights[i] = pretrained[re.sub(
                            '\d', '0', word.lower())]
                        c_zeros += 1
                word_layer.embeddings.set_value(new_weights)
                print 'Loaded %i pretrained embeddings.' % len(pretrained)
                print(
                    '%i / %i (%.4f%%) words have been initialized with '
                    'pretrained embeddings.') % (
                        c_found + c_lower + c_zeros, n_words, 100. *
                        (c_found + c_lower + c_zeros) / n_words)
                print(
                    '%i found directly, %i after lowercasing, '
                    '%i after lowercasing + zero.') % (c_found, c_lower,
                                                       c_zeros)

        #
        # Chars inputs
        #
        if char_dim:
            input_dim += char_lstm_dim
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')

            char_lstm_for = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_for')
            char_lstm_rev = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_rev')

            char_lstm_for.link(char_layer.link(char_for_ids))
            char_lstm_rev.link(char_layer.link(char_rev_ids))

            char_for_output = char_lstm_for.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]
            char_rev_output = char_lstm_rev.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]

            inputs.append(char_for_output)
            if char_bidirect:
                inputs.append(char_rev_output)
                input_dim += char_lstm_dim

        #
        # Capitalization feature
        #
        if cap_dim:
            input_dim += cap_dim
            cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer')
            inputs.append(cap_layer.link(cap_ids))

        # Prepare final input
        inputs = T.concatenate(inputs,
                               axis=1) if len(inputs) != 1 else inputs[0]

        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            input_test = (1 - dropout) * inputs
            inputs = T.switch(T.neq(is_train, 0), input_train, input_test)

        # LSTM for words
        word_lstm_for = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_rev')
        word_lstm_for.link(inputs)
        word_lstm_rev.link(inputs[::-1, :])
        word_for_output = word_lstm_for.h
        word_rev_output = word_lstm_rev.h[::-1, :]
        if word_bidirect:
            final_output = T.concatenate([word_for_output, word_rev_output],
                                         axis=1)
            tanh_layer = HiddenLayer(2 * word_lstm_dim,
                                     word_lstm_dim,
                                     name='tanh_layer',
                                     activation='tanh')
            final_output = tanh_layer.link(final_output)
        else:
            final_output = word_for_output

        # Sentence to Named Entity tags - Score
        final_layer = HiddenLayer(word_lstm_dim,
                                  n_tags,
                                  name='final_layer',
                                  activation=(None if crf else 'softmax'))
        tags_scores = final_layer.link(final_output)

        # No CRF
        if not crf:
            cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        # CRF
        else:
            transitions = shared((n_tags + 2, n_tags + 2), 'transitions')

            small = -1000
            b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
            e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
            observations = T.concatenate(
                [tags_scores, small * T.ones((s_len, 2))], axis=1)
            observations = T.concatenate([b_s, observations, e_s], axis=0)

            # Score from tags
            real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()

            # Score from transitions
            b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
            e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
            padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)
            real_path_score += transitions[padded_tags_ids[T.arange(s_len +
                                                                    1)],
                                           padded_tags_ids[T.arange(s_len + 1)
                                                           + 1]].sum()

            all_paths_scores = forward(observations, transitions)
            cost = -(real_path_score - all_paths_scores)

        # Network parameters
        params = []
        if word_dim:
            self.add_component(word_layer)
            params.extend(word_layer.params)
        if char_dim:
            self.add_component(char_layer)
            self.add_component(char_lstm_for)
            params.extend(char_layer.params)
            params.extend(char_lstm_for.params)
            if char_bidirect:
                self.add_component(char_lstm_rev)
                params.extend(char_lstm_rev.params)
        self.add_component(word_lstm_for)
        params.extend(word_lstm_for.params)
        if word_bidirect:
            self.add_component(word_lstm_rev)
            params.extend(word_lstm_rev.params)
        if cap_dim:
            self.add_component(cap_layer)
            params.extend(cap_layer.params)
        self.add_component(final_layer)
        params.extend(final_layer.params)
        if crf:
            self.add_component(transitions)
            params.append(transitions)
        if word_bidirect:
            self.add_component(tanh_layer)
            params.extend(tanh_layer.params)

        # Prepare train and eval inputs
        eval_inputs = []
        if word_dim:
            eval_inputs.append(word_ids)
        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_bidirect:
                eval_inputs.append(char_rev_ids)
            eval_inputs.append(char_pos_ids)
        if cap_dim:
            eval_inputs.append(cap_ids)
        train_inputs = eval_inputs + [tag_ids]

        # Parse optimization method parameters
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}

        # Compile training function
        print 'Compiling...'
        if training:
            updates = Optimization(clip=5.0).get_updates(
                lr_method_name, cost, params, **lr_method_parameters)
            f_train = theano.function(inputs=train_inputs,
                                      outputs=cost,
                                      updates=updates,
                                      givens=({
                                          is_train: np.cast['int32'](1)
                                      } if dropout else {}))
        else:
            f_train = None

        # Compile evaluation function
        if not crf:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=tags_scores,
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))
        else:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=forward(
                                         observations,
                                         transitions,
                                         viterbi=True,
                                         return_alpha=False,
                                         return_best_sequence=True),
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))

        return f_train, f_eval
Example #33
def main(args):

    trial = int(args['trial'])
    pkl_name = 'vrnn_gmm_%d' % trial
    channel_name = 'valid_nll_upper_bound'

    data_path = args['data_path']
    save_path = args['save_path']

    monitoring_freq = int(args['monitoring_freq'])
    force_saving_freq = int(args['force_saving_freq'])
    reset_freq = int(args['reset_freq'])
    epoch = int(args['epoch'])
    batch_size = int(args['batch_size'])
    m_batch_size = int(args['m_batch_size'])
    x_dim = int(args['x_dim'])
    z_dim = int(args['z_dim'])
    rnn_dim = int(args['rnn_dim'])
    k = int(args['num_k'])
    lr = float(args['lr'])
    debug = int(args['debug'])

    print "trial no. %d" % trial
    print "batch size %d" % batch_size
    print "learning rate %f" % lr
    print "saving pkl file '%s'" % pkl_name
    print "to the save path '%s'" % save_path

    q_z_dim = 500
    p_z_dim = 500
    p_x_dim = 500
    x2s_dim = 500
    z2s_dim = 500
    target_dim = x_dim * k

    file_name = 'blizzard_unseg_tbptt'
    normal_params = np.load(data_path + file_name + '_normal.npz')
    X_mean = normal_params['X_mean']
    X_std = normal_params['X_std']

    model = Model()
    train_data = Blizzard_tbptt(name='train',
                                path=data_path,
                                frame_size=x_dim,
                                file_name=file_name,
                                X_mean=X_mean,
                                X_std=X_std)

    valid_data = Blizzard_tbptt(name='valid',
                                path=data_path,
                                frame_size=x_dim,
                                file_name=file_name,
                                X_mean=X_mean,
                                X_std=X_std)

    x = train_data.theano_vars()
    m_x = valid_data.theano_vars()

    if debug:
        x.tag.test_value = np.zeros((15, batch_size, x_dim),
                                    dtype=theano.config.floatX)
        m_x.tag.test_value = np.zeros((15, m_batch_size, x_dim),
                                      dtype=theano.config.floatX)

    init_W = InitCell('rand')
    init_U = InitCell('ortho')
    init_b = InitCell('zeros')
    init_b_sig = InitCell('const', mean=0.6)

    x_1 = FullyConnectedLayer(name='x_1',
                              parent=['x_t'],
                              parent_dim=[x_dim],
                              nout=x2s_dim,
                              unit='relu',
                              init_W=init_W,
                              init_b=init_b)

    x_2 = FullyConnectedLayer(name='x_2',
                              parent=['x_1'],
                              parent_dim=[x2s_dim],
                              nout=x2s_dim,
                              unit='relu',
                              init_W=init_W,
                              init_b=init_b)

    x_3 = FullyConnectedLayer(name='x_3',
                              parent=['x_2'],
                              parent_dim=[x2s_dim],
                              nout=x2s_dim,
                              unit='relu',
                              init_W=init_W,
                              init_b=init_b)

    x_4 = FullyConnectedLayer(name='x_4',
                              parent=['x_3'],
                              parent_dim=[x2s_dim],
                              nout=x2s_dim,
                              unit='relu',
                              init_W=init_W,
                              init_b=init_b)

    z_1 = FullyConnectedLayer(name='z_1',
                              parent=['z_t'],
                              parent_dim=[z_dim],
                              nout=z2s_dim,
                              unit='relu',
                              init_W=init_W,
                              init_b=init_b)

    z_2 = FullyConnectedLayer(name='z_2',
                              parent=['z_1'],
                              parent_dim=[z2s_dim],
                              nout=z2s_dim,
                              unit='relu',
                              init_W=init_W,
                              init_b=init_b)

    z_3 = FullyConnectedLayer(name='z_3',
                              parent=['z_2'],
                              parent_dim=[z2s_dim],
                              nout=z2s_dim,
                              unit='relu',
                              init_W=init_W,
                              init_b=init_b)

    z_4 = FullyConnectedLayer(name='z_4',
                              parent=['z_3'],
                              parent_dim=[z2s_dim],
                              nout=z2s_dim,
                              unit='relu',
                              init_W=init_W,
                              init_b=init_b)

    rnn = LSTM(name='rnn',
               parent=['x_4', 'z_4'],
               parent_dim=[x2s_dim, z2s_dim],
               nout=rnn_dim,
               unit='tanh',
               init_W=init_W,
               init_U=init_U,
               init_b=init_b)

    phi_1 = FullyConnectedLayer(name='phi_1',
                                parent=['x_4', 's_tm1'],
                                parent_dim=[x2s_dim, rnn_dim],
                                nout=q_z_dim,
                                unit='relu',
                                init_W=init_W,
                                init_b=init_b)

    phi_2 = FullyConnectedLayer(name='phi_2',
                                parent=['phi_1'],
                                parent_dim=[q_z_dim],
                                nout=q_z_dim,
                                unit='relu',
                                init_W=init_W,
                                init_b=init_b)

    phi_3 = FullyConnectedLayer(name='phi_3',
                                parent=['phi_2'],
                                parent_dim=[q_z_dim],
                                nout=q_z_dim,
                                unit='relu',
                                init_W=init_W,
                                init_b=init_b)

    phi_4 = FullyConnectedLayer(name='phi_4',
                                parent=['phi_3'],
                                parent_dim=[q_z_dim],
                                nout=q_z_dim,
                                unit='relu',
                                init_W=init_W,
                                init_b=init_b)

    phi_mu = FullyConnectedLayer(name='phi_mu',
                                 parent=['phi_4'],
                                 parent_dim=[q_z_dim],
                                 nout=z_dim,
                                 unit='linear',
                                 init_W=init_W,
                                 init_b=init_b)

    phi_sig = FullyConnectedLayer(name='phi_sig',
                                  parent=['phi_4'],
                                  parent_dim=[q_z_dim],
                                  nout=z_dim,
                                  unit='softplus',
                                  cons=1e-4,
                                  init_W=init_W,
                                  init_b=init_b_sig)

    prior_1 = FullyConnectedLayer(name='prior_1',
                                  parent=['s_tm1'],
                                  parent_dim=[rnn_dim],
                                  nout=p_z_dim,
                                  unit='relu',
                                  init_W=init_W,
                                  init_b=init_b)

    prior_2 = FullyConnectedLayer(name='prior_2',
                                  parent=['prior_1'],
                                  parent_dim=[p_z_dim],
                                  nout=p_z_dim,
                                  unit='relu',
                                  init_W=init_W,
                                  init_b=init_b)

    prior_3 = FullyConnectedLayer(name='prior_3',
                                  parent=['prior_2'],
                                  parent_dim=[p_z_dim],
                                  nout=p_z_dim,
                                  unit='relu',
                                  init_W=init_W,
                                  init_b=init_b)

    prior_4 = FullyConnectedLayer(name='prior_4',
                                  parent=['prior_3'],
                                  parent_dim=[p_z_dim],
                                  nout=p_z_dim,
                                  unit='relu',
                                  init_W=init_W,
                                  init_b=init_b)

    prior_mu = FullyConnectedLayer(name='prior_mu',
                                   parent=['prior_4'],
                                   parent_dim=[p_z_dim],
                                   nout=z_dim,
                                   unit='linear',
                                   init_W=init_W,
                                   init_b=init_b)

    prior_sig = FullyConnectedLayer(name='prior_sig',
                                    parent=['prior_4'],
                                    parent_dim=[p_z_dim],
                                    nout=z_dim,
                                    unit='softplus',
                                    cons=1e-4,
                                    init_W=init_W,
                                    init_b=init_b_sig)

    theta_1 = FullyConnectedLayer(name='theta_1',
                                  parent=['z_4', 's_tm1'],
                                  parent_dim=[z2s_dim, rnn_dim],
                                  nout=p_x_dim,
                                  unit='relu',
                                  init_W=init_W,
                                  init_b=init_b)

    theta_2 = FullyConnectedLayer(name='theta_2',
                                  parent=['theta_1'],
                                  parent_dim=[p_x_dim],
                                  nout=p_x_dim,
                                  unit='relu',
                                  init_W=init_W,
                                  init_b=init_b)

    theta_3 = FullyConnectedLayer(name='theta_3',
                                  parent=['theta_2'],
                                  parent_dim=[p_x_dim],
                                  nout=p_x_dim,
                                  unit='relu',
                                  init_W=init_W,
                                  init_b=init_b)

    theta_4 = FullyConnectedLayer(name='theta_4',
                                  parent=['theta_3'],
                                  parent_dim=[p_x_dim],
                                  nout=p_x_dim,
                                  unit='relu',
                                  init_W=init_W,
                                  init_b=init_b)

    theta_mu = FullyConnectedLayer(name='theta_mu',
                                   parent=['theta_4'],
                                   parent_dim=[p_x_dim],
                                   nout=target_dim,
                                   unit='linear',
                                   init_W=init_W,
                                   init_b=init_b)

    theta_sig = FullyConnectedLayer(name='theta_sig',
                                    parent=['theta_4'],
                                    parent_dim=[p_x_dim],
                                    nout=target_dim,
                                    unit='softplus',
                                    cons=1e-4,
                                    init_W=init_W,
                                    init_b=init_b_sig)

    coeff = FullyConnectedLayer(name='coeff',
                                parent=['theta_4'],
                                parent_dim=[p_x_dim],
                                nout=k,
                                unit='softmax',
                                init_W=init_W,
                                init_b=init_b)

    nodes = [
        rnn, x_1, x_2, x_3, x_4, z_1, z_2, z_3, z_4, phi_1, phi_2, phi_3,
        phi_4, phi_mu, phi_sig, prior_1, prior_2, prior_3, prior_4, prior_mu,
        prior_sig, theta_1, theta_2, theta_3, theta_4, theta_mu, theta_sig,
        coeff
    ]

    params = OrderedDict()
    for node in nodes:
        if node.initialize() is not None:
            params.update(node.initialize())
    params = init_tparams(params)

    step_count = sharedX(0, name='step_count')
    last_rnn = np.zeros((batch_size, rnn_dim * 2), dtype=theano.config.floatX)
    rnn_tm1 = sharedX(last_rnn, name='rnn_tm1')
    shared_updates = OrderedDict()
    shared_updates[step_count] = step_count + 1

    s_0 = T.switch(T.eq(T.mod(step_count, reset_freq), 0),
                   rnn.get_init_state(batch_size), rnn_tm1)

    x_shape = x.shape
    x_in = x.reshape((x_shape[0] * x_shape[1], -1))
    x_1_in = x_1.fprop([x_in], params)
    x_2_in = x_2.fprop([x_1_in], params)
    x_3_in = x_3.fprop([x_2_in], params)
    x_4_in = x_4.fprop([x_3_in], params)
    x_4_in = x_4_in.reshape((x_shape[0], x_shape[1], -1))

    def inner_fn(x_t, s_tm1):

        phi_1_t = phi_1.fprop([x_t, s_tm1], params)
        phi_2_t = phi_2.fprop([phi_1_t], params)
        phi_3_t = phi_3.fprop([phi_2_t], params)
        phi_4_t = phi_4.fprop([phi_3_t], params)
        phi_mu_t = phi_mu.fprop([phi_4_t], params)
        phi_sig_t = phi_sig.fprop([phi_4_t], params)

        prior_1_t = prior_1.fprop([s_tm1], params)
        prior_2_t = prior_2.fprop([prior_1_t], params)
        prior_3_t = prior_3.fprop([prior_2_t], params)
        prior_4_t = prior_4.fprop([prior_3_t], params)
        prior_mu_t = prior_mu.fprop([prior_4_t], params)
        prior_sig_t = prior_sig.fprop([prior_4_t], params)

        z_t = Gaussian_sample(phi_mu_t, phi_sig_t)

        z_1_t = z_1.fprop([z_t], params)
        z_2_t = z_2.fprop([z_1_t], params)
        z_3_t = z_3.fprop([z_2_t], params)
        z_4_t = z_4.fprop([z_3_t], params)

        s_t = rnn.fprop([[x_t, z_4_t], [s_tm1]], params)

        return s_t, phi_mu_t, phi_sig_t, prior_mu_t, prior_sig_t, z_4_t

    ((s_temp, phi_mu_temp, phi_sig_temp, prior_mu_temp, prior_sig_temp, z_4_temp), updates) =\
        theano.scan(fn=inner_fn,
                    sequences=[x_4_in],
                    outputs_info=[s_0, None, None, None, None, None])

    for k, v in updates.iteritems():
        k.default_update = v

    shared_updates[rnn_tm1] = s_temp[-1]
    s_temp = concatenate([s_0[None, :, :], s_temp[:-1]], axis=0)
    theta_1_temp = theta_1.fprop([z_4_temp, s_temp], params)
    theta_2_temp = theta_2.fprop([theta_1_temp], params)
    theta_3_temp = theta_3.fprop([theta_2_temp], params)
    theta_4_temp = theta_4.fprop([theta_3_temp], params)
    theta_mu_temp = theta_mu.fprop([theta_4_temp], params)
    theta_sig_temp = theta_sig.fprop([theta_4_temp], params)
    coeff_temp = coeff.fprop([theta_4_temp], params)

    kl_temp = KLGaussianGaussian(phi_mu_temp, phi_sig_temp, prior_mu_temp,
                                 prior_sig_temp)

    x_shape = x.shape
    x_in = x.reshape((x_shape[0] * x_shape[1], -1))
    theta_mu_in = theta_mu_temp.reshape((x_shape[0] * x_shape[1], -1))
    theta_sig_in = theta_sig_temp.reshape((x_shape[0] * x_shape[1], -1))
    coeff_in = coeff_temp.reshape((x_shape[0] * x_shape[1], -1))

    recon = GMM(x_in, theta_mu_in, theta_sig_in, coeff_in)
    recon_term = recon.mean()
    kl_term = kl_temp.mean()
    nll_upper_bound = recon_term + kl_term
    nll_upper_bound.name = 'nll_upper_bound'

    m_x_1_temp = x_1.fprop([m_x], params)
    m_x_2_temp = x_2.fprop([m_x_1_temp], params)
    m_x_3_temp = x_3.fprop([m_x_2_temp], params)
    m_x_4_temp = x_4.fprop([m_x_3_temp], params)

    m_s_0 = rnn.get_init_state(m_batch_size)

    ((m_s_temp, m_phi_mu_temp, m_phi_sig_temp, m_prior_mu_temp, m_prior_sig_temp, m_z_4_temp), m_updates) =\
        theano.scan(fn=inner_fn,
                    sequences=[m_x_4_temp],
                    outputs_info=[m_s_0, None, None, None, None, None])

    for k, v in m_updates.iteritems():
        k.default_update = v

    m_s_temp = concatenate([m_s_0[None, :, :], m_s_temp[:-1]], axis=0)
    m_theta_1_temp = theta_1.fprop([m_z_4_temp, m_s_temp], params)
    m_theta_2_temp = theta_2.fprop([m_theta_1_temp], params)
    m_theta_3_temp = theta_3.fprop([m_theta_2_temp], params)
    m_theta_4_temp = theta_4.fprop([m_theta_3_temp], params)
    m_theta_mu_temp = theta_mu.fprop([m_theta_4_temp], params)
    m_theta_sig_temp = theta_sig.fprop([m_theta_4_temp], params)
    m_coeff_temp = coeff.fprop([m_theta_4_temp], params)

    m_kl_temp = KLGaussianGaussian(m_phi_mu_temp, m_phi_sig_temp,
                                   m_prior_mu_temp, m_prior_sig_temp)

    m_x_shape = m_x.shape
    m_x_in = m_x.reshape((m_x_shape[0] * m_x_shape[1], -1))
    m_theta_mu_in = m_theta_mu_temp.reshape((m_x_shape[0] * m_x_shape[1], -1))
    m_theta_sig_in = m_theta_sig_temp.reshape(
        (m_x_shape[0] * m_x_shape[1], -1))
    m_coeff_in = m_coeff_temp.reshape((m_x_shape[0] * m_x_shape[1], -1))

    m_recon = GMM(m_x_in, m_theta_mu_in, m_theta_sig_in, m_coeff_in)
    m_recon_term = m_recon.mean()
    m_kl_term = m_kl_temp.mean()
    m_nll_upper_bound = m_recon_term + m_kl_term
    m_nll_upper_bound.name = 'nll_upper_bound'
    m_recon_term.name = 'recon_term'
    m_kl_term.name = 'kl_term'

    max_x = m_x.max()
    mean_x = m_x.mean()
    min_x = m_x.min()
    max_x.name = 'max_x'
    mean_x.name = 'mean_x'
    min_x.name = 'min_x'

    max_theta_mu = m_theta_mu_in.max()
    mean_theta_mu = m_theta_mu_in.mean()
    min_theta_mu = m_theta_mu_in.min()
    max_theta_mu.name = 'max_theta_mu'
    mean_theta_mu.name = 'mean_theta_mu'
    min_theta_mu.name = 'min_theta_mu'

    max_theta_sig = m_theta_sig_in.max()
    mean_theta_sig = m_theta_sig_in.mean()
    min_theta_sig = m_theta_sig_in.min()
    max_theta_sig.name = 'max_theta_sig'
    mean_theta_sig.name = 'mean_theta_sig'
    min_theta_sig.name = 'min_theta_sig'

    max_phi_sig = m_phi_sig_temp.max()
    mean_phi_sig = m_phi_sig_temp.mean()
    min_phi_sig = m_phi_sig_temp.min()
    max_phi_sig.name = 'max_phi_sig'
    mean_phi_sig.name = 'mean_phi_sig'
    min_phi_sig.name = 'min_phi_sig'

    max_prior_sig = m_prior_sig_temp.max()
    mean_prior_sig = m_prior_sig_temp.mean()
    min_prior_sig = m_prior_sig_temp.min()
    max_prior_sig.name = 'max_prior_sig'
    mean_prior_sig.name = 'mean_prior_sig'
    min_prior_sig.name = 'min_prior_sig'

    model.inputs = [x]
    model.params = params
    model.nodes = nodes
    model.set_updates(shared_updates)

    optimizer = Adam(lr=lr)

    monitor_fn = theano.function(
        inputs=[m_x],
        outputs=[
            m_nll_upper_bound, m_recon_term, m_kl_term, max_phi_sig,
            mean_phi_sig, min_phi_sig, max_prior_sig, mean_prior_sig,
            min_prior_sig, max_theta_sig, mean_theta_sig, min_theta_sig, max_x,
            mean_x, min_x, max_theta_mu, mean_theta_mu, min_theta_mu
        ],
        on_unused_input='ignore')

    extension = [
        GradientClipping(batch_size=batch_size, check_nan=1),
        EpochCount(epoch),
        Monitoring(freq=monitoring_freq,
                   monitor_fn=monitor_fn,
                   ddout=[
                       m_nll_upper_bound, m_recon_term, m_kl_term, max_phi_sig,
                       mean_phi_sig, min_phi_sig, max_prior_sig,
                       mean_prior_sig, min_prior_sig, max_theta_sig,
                       mean_theta_sig, min_theta_sig, max_x, mean_x, min_x,
                       max_theta_mu, mean_theta_mu, min_theta_mu
                   ],
                   data=[
                       Iterator(train_data, m_batch_size, start=0, end=112640),
                       Iterator(valid_data,
                                m_batch_size,
                                start=2040064,
                                end=2152704)
                   ]),
        Picklize(freq=monitoring_freq,
                 force_save_freq=force_saving_freq,
                 path=save_path),
        EarlyStopping(freq=monitoring_freq,
                      force_save_freq=force_saving_freq,
                      path=save_path,
                      channel=channel_name),
        WeightNorm()
    ]

    mainloop = Training(name=pkl_name,
                        data=Iterator(train_data,
                                      batch_size,
                                      start=0,
                                      end=2040064),
                        model=model,
                        optimizer=optimizer,
                        cost=nll_upper_bound,
                        outputs=[nll_upper_bound],
                        extension=extension)
    mainloop.run()
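# The `s_0 = T.switch(T.eq(T.mod(step_count, reset_freq), 0), ...)` line in the example
# above resets the recurrent state every `reset_freq` training steps. A minimal,
# self-contained sketch of that mechanism (T.zeros stands in for rnn.get_init_state,
# and the shapes are illustrative, not taken from the snippet):
import numpy as np
import theano
import theano.tensor as T

batch_size, rnn_dim, reset_freq = 4, 3, 10

step_count = theano.shared(np.int64(0), name='step_count')
init_state = T.zeros((batch_size, rnn_dim))                   # fresh initial state
rnn_tm1 = theano.shared(np.ones((batch_size, rnn_dim),
                                dtype=theano.config.floatX))  # state carried over from the last batch

# every reset_freq calls start from the fresh state, otherwise keep the carried-over one
s_0 = T.switch(T.eq(T.mod(step_count, reset_freq), 0), init_state, rnn_tm1)

f = theano.function([], s_0, updates=[(step_count, step_count + 1)])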
Пример #34
0
def clip_norm(g, c, n):
    if c > 0:
        g = T.switch(T.ge(n, c), g * c / n, g)
    return g
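# A hedged usage sketch for clip_norm: the surrounding optimizer is assumed to compute
# the global gradient norm once and then rescale every gradient with it.
import theano.tensor as T

def clip_norms(grads, c):
    norm = T.sqrt(sum((g ** 2).sum() for g in grads))  # global L2 norm over all gradients
    return [clip_norm(g, c, norm) for g in grads]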
Пример #35
0
def elu(z):  # https://arxiv.org/pdf/1511.07289v1.pdf
    return T.switch(T.ge(z, 0), z, T.exp(z) - 1)
Пример #36
0
 def step_rnn(x_t, mask, h_tm1, W, h0):
     h_tm1 = T.switch(mask, h0, h_tm1)
     return [ACTIVATION[Ah](x_t + h_tm1.dot(W))]
Пример #37
0
def logpow(x, m):
    """
    Calculates log(x**m) since m*log(x) will fail when m, x = 0.
    """
    return switch(eq(x, 0) & eq(m, 0), 0, m * log(x))
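# A quick, hedged check of the guard above (written with the T. prefix; the snippet
# presumably imports switch/eq/log directly from theano.tensor):
import theano
import theano.tensor as T

x, m = T.dscalars('x', 'm')
naive = m * T.log(x)                                   # 0 * log(0) -> 0 * -inf -> nan
safe = T.switch(T.eq(x, 0) & T.eq(m, 0), 0, m * T.log(x))

f = theano.function([x, m], [naive, safe])
print f(0.0, 0.0)                                      # -> [nan, 0.0]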
Пример #38
0
 def apply(self, input_):
     res = T.switch(input_ > 0, input_, self.leaky_init * input_)
     return T.switch(T.isnan(res), 0, res)
Пример #39
0
 def leaky_relu(x):
     return T.switch(T.gt(x, 0.), x, x * np.float32(0.2))
Пример #40
0
    def __init__(self, rng, input, is_train, n_in, n_hidden, n_out, p=0.5, dropout=False, input_p=0.1): #, batch_size=20):

        #Need input dropout layer
        if input_p is not None:
            self.input_layer = drop(input, rng=rng, p=input_p)
            self.input_layer = T.switch(T.neq(is_train, 0), self.input_layer, input)
        else:
            self.input_layer=input

        param_to_scale = [] #To scale weights to square length of 15

        self.layer_0 = HiddenLayer(
            rng=rng,
            input=self.input_layer,
            n_in=n_in,
            n_out=n_hidden[0],
            activation=prelu,
            is_train=is_train,
            p=p,
            dropout=dropout
        )

        self.params = self.layer_0.params
        param_to_scale = param_to_scale + [self.layer_0.params[0]]

        #Add more layers accordingly
        layer_number = 1
        if len(n_hidden)>1:

            for layer in n_hidden[1:]:

                current_hidden_layer = HiddenLayer(
                                                    rng=rng,
                                                    input=getattr(self, "layer_" + str(layer_number-1)).output,
                                                    n_in=n_hidden[layer_number-1],
                                                    n_out=n_hidden[layer_number],
                                                    activation=prelu,
                                                    is_train=is_train,
                                                    p=p,
                                                    dropout=dropout
                                                )

                setattr(self, "layer_" + str(layer_number), current_hidden_layer)

                self.params = self.params + getattr(self, "layer_" + str(layer_number)).params

                param_to_scale = param_to_scale + [getattr(self, "layer_" + str(layer_number)).params[0]]

                layer_number = layer_number + 1


        # The logistic regression layer gets as input the hidden units
        # of the hidden layer

        self.linearRegressionLayer = LinearRegression(
            input=getattr(self, "layer_" + str(layer_number-1)).output,
            n_in=n_hidden[layer_number-1],
            n_out=n_out,
            rng=rng #,batch_size=batch_size
        )
        self.params = self.params + self.linearRegressionLayer.params

        #L1 and L2 regularization
        self.L1 = (
            abs(self.layer_0.W).sum() + abs(self.linearRegressionLayer.W).sum()
        )

        self.L2_sqr = (
            (self.layer_0.W ** 2).sum() + (self.linearRegressionLayer.W ** 2).sum()
        )
        #

        # self.negative_log_likelihood = (
        #     self.logRegressionLayer.negative_log_likelihood
        # )
        #
        # self.errors = self.logRegressionLayer.errors
        # self.pred = self.logRegressionLayer.pred
        # self.diff = self.logRegressionLayer.diff
        self.param_to_scale = param_to_scale
        self.errors = self.linearRegressionLayer.errors
        self.loss = self.linearRegressionLayer.loss
        self.NRMSE = self.linearRegressionLayer.NRMSE
        self.pred = self.linearRegressionLayer.pred

        self.input = input  # NOTE: the raw input, distinct from self.input_layer (which may have dropout applied)
Пример #41
0
def build_sampler(tparams, options, trng):
    x = tensor.matrix('x', dtype='int64')
    xr = x[::-1]
    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    # word embedding (source), forward and backward
    emb = tparams['Wemb'][x.flatten()]
    emb = emb.reshape([n_timesteps, n_samples, options['dim_word']])
    embr = tparams['Wemb'][xr.flatten()]
    embr = embr.reshape([n_timesteps, n_samples, options['dim_word']])

    # encoder
    proj = get_layer(options['encoder'])[1](tparams,
                                            emb,
                                            options,
                                            prefix='encoder')
    projr = get_layer(options['encoder'])[1](tparams,
                                             embr,
                                             options,
                                             prefix='encoder_r')

    # concatenate forward and backward rnn hidden states
    ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim - 1)

    # get the input for decoder rnn initializer mlp
    ctx_mean = ctx.mean(0)
    # ctx_mean = concatenate([proj[0][-1],projr[0][-1]], axis=proj[0].ndim-2)
    init_state = get_layer('ff')[1](tparams,
                                    ctx_mean,
                                    options,
                                    prefix='ff_state',
                                    activ='tanh')

    print 'Building f_init...',
    outs = [init_state, ctx]
    f_init = theano.function([x], outs, name='f_init', profile=profile)
    print 'Done'

    # x: 1 x 1
    y = tensor.vector('y_sampler', dtype='int64')
    init_state = tensor.matrix('init_state', dtype='float32')

    # if it's the first word, emb should be all zero and it is indicated by -1
    emb = tensor.switch(y[:, None] < 0,
                        tensor.alloc(0., 1, tparams['Wemb_dec'].shape[1]),
                        tparams['Wemb_dec'][y])

    # apply one step of conditional gru with attention
    proj = get_layer(options['decoder'])[1](tparams,
                                            emb,
                                            options,
                                            prefix='decoder',
                                            mask=None,
                                            context=ctx,
                                            one_step=True,
                                            init_state=init_state)
    # get the next hidden state
    next_state = proj[0]

    # get the weighted averages of context for this target word y
    ctxs = proj[1]

    logit_lstm = get_layer('ff')[1](tparams,
                                    next_state,
                                    options,
                                    prefix='ff_logit_lstm',
                                    activ='linear')
    logit_prev = get_layer('ff')[1](tparams,
                                    emb,
                                    options,
                                    prefix='ff_logit_prev',
                                    activ='linear')
    logit_ctx = get_layer('ff')[1](tparams,
                                   ctxs,
                                   options,
                                   prefix='ff_logit_ctx',
                                   activ='linear')
    logit = tensor.tanh(logit_lstm + logit_prev + logit_ctx)
    logit = get_layer('ff')[1](tparams,
                               logit,
                               options,
                               prefix='ff_logit',
                               activ='linear')

    # compute the softmax probability
    next_probs = tensor.nnet.softmax(logit)

    # sample from softmax distribution to get the sample
    next_sample = trng.multinomial(pvals=next_probs).argmax(1)

    # compile a function to do the whole thing above, next word probability,
    # sampled word for the next target, next hidden state to be used
    print 'Building f_next..',
    inps = [y, ctx, init_state]
    outs = [next_probs, next_sample, next_state]
    f_next = theano.function(inps, outs, name='f_next', profile=profile)
    print 'Done'

    return f_init, f_next
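# A hedged sketch of how the returned f_init/f_next pair is typically driven: a
# stochastic sampling loop over one source sentence. The -1 "first word" convention
# matches the switch on y above; treating index 0 as end-of-sentence is an assumption.
import numpy

def sample_one(f_init, f_next, x, max_steps=30):
    # x: source word indices, shape (n_timesteps, 1), dtype int64
    init_state, ctx = f_init(x)
    next_w = -1 * numpy.ones((1,)).astype('int64')     # -1 -> zero embedding for the first step
    sample = []
    for _ in xrange(max_steps):
        probs, next_w, init_state = f_next(next_w, ctx, init_state)
        if next_w[0] == 0:                             # assumed end-of-sentence token
            break
        sample.append(next_w[0])
    return sample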
Пример #42
0
 def apply(self, input_):
     input_ = T.switch(T.isnan(input_), 0, input_)
     return T.switch(input_ > 0, input_, 0.01 * input_)
Пример #43
0
    def __init__(self, is_train, rng, input=1, n_in=1, n_out=500, W=None, b=None,
                 activation=T.tanh, p=0.5):
        # type: (object, object, object, object, object, object, object, object, object) -> object
        """
        Hidden unit activation is given by: activation(dot(input,W) + b)

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type is_train: theano.iscalar
        :param is_train: indicator pseudo-boolean (int) for switching between training and prediction

        :type input: theano.tensor.dmatrix
        :param input: a symbolic tensor of shape (n_examples, n_in)

        :type n_in: int
        :param n_in: dimensionality of input

        :type n_out: int
        :param n_out: number of hidden units

        :type activation: theano.Op or function
        :param activation: Non linearity to be applied in the hidden
                           layer

        :type p: float or double
        :param p: probability of NOT dropping out a unit
        """
        self.input = input

        if W is None:
            W_values = numpy.asarray(
                rng.uniform(
                    low=-numpy.sqrt(6. / (n_in + n_out)),
                    high=numpy.sqrt(6. / (n_in + n_out)),
                    size=(n_in, n_out)
                ),
                dtype=theano.config.floatX
            )
            if activation == theano.tensor.nnet.sigmoid:
                W_values *= 4

            W = theano.shared(value=W_values, name='W', borrow=True)

        if b is None:
            b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
            b = theano.shared(value=b_values, name='b', borrow=True)


        self.W = W
        self.b = b

        lin_output = T.dot(input, self.W) + self.b

        output = activation(lin_output)

        # apply dropout to the activations at training time; at prediction time the
        # full output is scaled by p instead (see the T.switch below), so expected
        # activation magnitudes approximately match between the two modes
        train_output = drop(output, p)

        #is_train is a pseudo boolean theano variable for switching between training and prediction
        self.output = T.switch(T.neq(is_train, 0), train_output, p*output)

        # parameters of the model
        self.params = [self.W, self.b]
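# A minimal, self-contained sketch of the train/predict switch used above, with a
# stand-in for the external `drop` helper; p is the probability of KEEPING a unit.
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

srng = MRG_RandomStreams(seed=0)
p = 0.5

def drop(x, p):
    # zero each unit with probability (1 - p); stand-in for the helper used above
    mask = srng.binomial(n=1, p=p, size=x.shape, dtype=theano.config.floatX)
    return x * mask

is_train = T.iscalar('is_train')    # 1 -> training, 0 -> prediction
output = T.matrix('output')         # pre-dropout activations

# training: dropped activations as-is; prediction: full output scaled by p
final = T.switch(T.neq(is_train, 0), drop(output, p), p * output)
f = theano.function([output, is_train], final)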
Пример #44
0
 def relu(x):
     return T.switch(T.gt(x, 0.), x, 0.)
Пример #45
0
def hinge_c(x, y):
    return T.switch(T.lt(1 - x * y, 0), 0 * x, 1 - x * y)
Пример #46
0
def train(
        dim_word=100,  # word vector dimensionality
        dim=1000,  # the number of LSTM units
        encoder='gru',
        decoder='gru_cond',
        patience=10,  # early stopping patience
        max_epochs=5000,
        finish_after=10000000,  # finish after this many updates
        dispFreq=100,
        decay_c=0.,  # L2 regularization penalty
        alpha_c=0.,  # alignment regularization
        clip_c=-1.,  # gradient clipping threshold
        lrate=0.01,  # learning rate
        n_words_src=100000,  # source vocabulary size
        n_words=100000,  # target vocabulary size
        maxlen=100,  # maximum length of the description
        optimizer='rmsprop',
        batch_size=16,
        valid_batch_size=16,
        saveto='model.npz',
        validFreq=1000,
        saveFreq=1000,  # save the parameters after every saveFreq updates
        sampleFreq=100,  # generate some samples after every sampleFreq
        datasets=[
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok',
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'
        ],
        valid_datasets=[
            '../data/dev/newstest2011.en.tok',
            '../data/dev/newstest2011.fr.tok'
        ],
        dictionaries=[
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl',
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'
        ],
        use_dropout=False,
        reload_=False):

    # Model options
    model_options = locals().copy()

    # load dictionaries and invert them
    worddicts = [None] * len(dictionaries)
    worddicts_r = [None] * len(dictionaries)
    for ii, dd in enumerate(dictionaries):
        with open(dd, 'rb') as f:
            worddicts[ii] = pkl.load(f)
        worddicts_r[ii] = dict()
        for kk, vv in worddicts[ii].iteritems():
            worddicts_r[ii][vv] = kk

    # reload options
    if reload_ and os.path.exists(saveto):
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print 'Loading data'
    train = TextIterator(datasets[0],
                         datasets[1],
                         dictionaries[0],
                         dictionaries[1],
                         n_words_source=n_words_src,
                         n_words_target=n_words,
                         batch_size=batch_size,
                         maxlen=maxlen)
    valid = TextIterator(valid_datasets[0],
                         valid_datasets[1],
                         dictionaries[0],
                         dictionaries[1],
                         n_words_source=n_words_src,
                         n_words_target=n_words,
                         batch_size=valid_batch_size,
                         maxlen=maxlen)

    print 'Building model'
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(saveto):
        params = load_params(saveto, params)

    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    print 'Building sampler'
    f_init, f_next = build_sampler(tparams, model_options, trng)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile)
    print 'Done'

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # regularize the alpha weights
    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * (
            (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
             opt_ret['dec_alphas'].sum(0))**2).sum(1).mean()
        cost += alpha_reg

    # after all regularizers - compile the computational graph for cost
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=profile)
    print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'

    # apply gradient clipping here
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(
                tensor.switch(g2 > (clip_c**2), g / tensor.sqrt(g2) * clip_c,
                              g))
        grads = new_grads

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)
    print 'Done'

    print 'Optimization'

    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        history_errs = list(numpy.load(saveto)['history_errs'])
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0]) / batch_size

    uidx = 0
    estop = False
    for eidx in xrange(max_epochs):
        n_samples = 0

        for x, y in train:
            n_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)

            x, x_mask, y, y_mask = prepare_data(x,
                                                y,
                                                maxlen=maxlen,
                                                n_words_src=n_words_src,
                                                n_words=n_words)

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                continue

            ud_start = time.time()

            # compute cost, grads and copy grads to shared variables
            cost = f_grad_shared(x, x_mask, y, y_mask)

            # do the update on parameters
            f_update(lrate)

            ud = time.time() - ud_start

            # check for bad numbers, usually we remove non-finite elements
            # and continue training - but not done here
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            # verbose
            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud

            # save the best model so far
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',

                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

            # generate some samples with the model and display them
            if numpy.mod(uidx, sampleFreq) == 0:
                # FIXME: random selection?
                for jj in xrange(numpy.minimum(5, x.shape[1])):
                    stochastic = True
                    sample, score = gen_sample(tparams,
                                               f_init,
                                               f_next,
                                               x[:, jj][:, None],
                                               model_options,
                                               trng=trng,
                                               k=1,
                                               maxlen=30,
                                               stochastic=stochastic,
                                               argmax=False)
                    print 'Source ', jj, ': ',
                    for vv in x[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[0]:
                            print worddicts_r[0][vv],
                        else:
                            print 'UNK',
                    print
                    print 'Truth ', jj, ' : ',
                    for vv in y[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[1]:
                            print worddicts_r[1][vv],
                        else:
                            print 'UNK',
                    print
                    print 'Sample ', jj, ': ',
                    if stochastic:
                        ss = sample
                    else:
                        score = score / numpy.array([len(s) for s in sample])
                        ss = sample[score.argmin()]
                    for vv in ss:
                        if vv == 0:
                            break
                        if vv in worddicts_r[1]:
                            print worddicts_r[1][vv],
                        else:
                            print 'UNK',
                    print

            # validate model on validation set and early stop if necessary
            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_errs = pred_probs(f_log_probs, prepare_data,
                                        model_options, valid)
                valid_err = valid_errs.mean()
                history_errs.append(valid_err)

                if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                    best_p = unzip(tparams)
                    bad_counter = 0
                if len(history_errs) > patience and valid_err >= \
                        numpy.array(history_errs)[:-patience].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                if numpy.isnan(valid_err):
                    ipdb.set_trace()

                print 'Valid ', valid_err

            # finish after this many updates
            if uidx >= finish_after:
                print 'Finishing after %d iterations!' % uidx
                estop = True
                break

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid_err = pred_probs(f_log_probs, prepare_data, model_options,
                           valid).mean()

    print 'Valid ', valid_err

    params = copy.copy(best_p)
    numpy.savez(saveto,
                zipped_params=best_p,
                history_errs=history_errs,
                **params)

    return valid_err
Пример #47
0
def quantization(W,Wacc,method, Wb):
	
	if method == "FPN":
		Wb = W
	
	elif method == "LAB":
		L = (T.sqrt(Wacc) + 1e-8) 
		Wb = hard_sigmoid(W)
		Wb = round3(Wb)
		Wb = T.cast(T.switch(Wb,1.,-1.), theano.config.floatX) 

		alpha  = (T.abs_(L*W).sum()/L.sum()).astype('float32') 
		Wb = alpha*Wb	

	elif method=="LATa":
		D = (T.sqrt(Wacc) + 1e-8) 
		b = T.sgn(Wb)
		# compute the threshold, converge within 10 iterations 
		alpha  = (T.abs_(b*D*W).sum()/T.abs_(b*D).sum()).astype('float32') 
		b = T.switch(T.gt(W/alpha, 0.5), 1., T.switch(T.lt(W/alpha, -0.5), -1., 0.) )
		def OneStep(alpha, b):
			# minimize alpha
			alpha_new  = (T.abs_(b*D*W).sum()/T.abs_(b*D).sum()).astype('float32') 
			# minimize b
			b_new = T.switch(T.gt(W/alpha_new, 0.5), 1., T.switch(T.lt(W/alpha_new, -0.5), -1., 0.))
			delta = T.abs_(alpha_new-alpha)
			condition = T.lt(delta, 1e-6)
			return [alpha_new, b_new], theano.scan_module.until(condition)

		[out1, out2], updates = theano.scan(fn=OneStep ,outputs_info=[alpha, b],n_steps=10) 
		Wb  = out1[-1]*out2[-1]

	elif method=="LATe":
		D = (T.sqrt(Wacc) + 1e-8) 
		thres = findalpha(D, W)
		alpha = thres*2
		Wt = T.switch(T.gt(W, thres), 1., T.switch(T.lt(W, -thres), -1., 0.) )
		Wb = alpha*Wt

	elif method=="LAT2e":
		D = (T.sqrt(Wacc) + 1e-8) 
		thres1, thres2 = findalpha2(D, W)
		alpha1 = thres1*2
		Wt1 = T.switch(T.gt(W, thres1), 1., 0.) 
		alpha2 = thres2*2
		Wt2 = T.switch(T.lt(W, -thres2), -1., 0.) 

		Wb = alpha1*Wt1 + alpha2*Wt2	

	elif method=="LAT2a":
		D = (T.sqrt(Wacc) + 1e-8) 
		b1 = T.ge(Wb,0)
		alpha1 = (T.abs_(b1*D*W).sum()/T.abs_(b1*D).sum()).astype('float32') 
		b1 = T.switch(T.gt(W/alpha1, 0.5), 1., 0.)
		# Wb1 = alpha1*mask1*Wb
		b2 =  T.lt(Wb,0)
		alpha2 = (T.abs_(b2*D*W).sum()/T.abs_(b2*D).sum()).astype('float32') 
		b2 = T.switch(T.lt(W/alpha2, -0.5), -1., 0.)
		def OneStep(alpha1, b1, alpha2, b2):
			alpha1_new  = (T.abs_(b1*D*W).sum()/T.abs_(b1*D).sum()).astype('float32') 
			b1_new = T.switch(T.gt(W/alpha1_new, 0.5), 1., 0.)
			alpha2_new = (T.abs_(b2*D*W).sum()/T.abs_(b2*D).sum()).astype('float32') 
			b2_new = T.switch(T.lt(W/alpha2_new, -0.5), -1., 0.)

			delta1 = T.abs_(alpha1_new-alpha1)
			delta2 = T.abs_(alpha2_new-alpha2)
			# Python's `and` cannot combine symbolic tensors; use a symbolic AND instead
			condition = T.and_(T.lt(delta1, 1e-6), T.lt(delta2, 1e-6))
			return [alpha1_new, b1_new, alpha2_new, b2_new], theano.scan_module.until(condition)

		[out1, out2, out3, out4], updates = theano.scan(fn=OneStep ,outputs_info=[alpha1, b1, alpha2, b2],n_steps=10)
		Wb  = out1[-1]*out2[-1] + out3[-1]*out4[-1]	
								  

	elif method=="LAQ_linear":
		D = (T.sqrt(Wacc) + 1e-8) 
		b = T.sgn(Wb)
		alpha  = (T.abs_(b*D*W).sum()/T.abs_(b*D).sum()).astype('float32') 
		# b = T.switch(T.gt(W/alpha, 0.5), 1., T.switch(T.lt(W/alpha, -0.5), -1., 0.) )
		m = 3 # number of bits
		n = 2**(m-1)-1

		b = round3(T.clip(W/alpha, -1., 1.)*n)/(n)
		def OneStep(alpha, b):
			# minimize alpha
			alpha_new  = (T.abs_(b*D*W).sum()/T.abs_(b*D).sum()).astype('float32') 
			# minimize b
			# b_new = T.switch(T.gt(W/alpha, 0.5), 1., T.switch(T.lt(W/alpha, -0.5), -1., 0.))
			b_new = round3(T.clip(W/alpha_new, -1., 1.)*n)/(n)
			delta = T.abs_(alpha_new-alpha)
			condition = T.lt(delta, 1e-6)
			return [alpha_new, b_new], theano.scan_module.until(condition)

		[out1, out2], updates = theano.scan(fn=OneStep ,outputs_info=[alpha, b],n_steps=10)
		Wb  = out1[-1]*out2[-1]		

	elif method=="LAQ_log":
		D = (T.sqrt(Wacc) + 1e-8) 
		b = T.sgn(Wb)
		alpha  = (T.abs_(b*D*W).sum()/T.abs_(b*D).sum()).astype('float32') 
		m = 3  # number of bits
		n = 2**(m-1)-1
		tmp = T.clip(W/alpha, -1., 1.)
		# log2(1/2*(2^(-n)+2^(-(n+1)))) - (-n-(n+1))/2 = 0.0849625
		b =  T.switch( T.ge(tmp, pow(2, -n)), T.pow(2, round3(T.log2(tmp)-0.0849625)), 
			T.switch( T.le(tmp, -pow(2,-n)), -T.pow(2, round3(T.log2(-tmp)-0.0849625)), 0.))
		b = T.switch(T.ge(b, pow(2, - (n-1))), b, T.switch(T.le(b, -pow(2, -(n-1))), b, T.sgn(b)*pow(2,-(n-1))))

		def OneStep(alpha, b):
			# minimize alpha
			alpha_new  = (T.abs_(b*D*W).sum()/T.abs_(b*D).sum()).astype('float32') 
			# minimize b
			tmp_new = T.clip(W/alpha_new, -1., 1.)
			b_new =  T.switch( T.ge(tmp_new, pow(2, -n)), T.pow(2, round3(T.log2(tmp_new)-0.0849625)), 
				T.switch( T.le(tmp_new, -pow(2, -n)), -T.pow(2, round3(T.log2(-tmp_new)-0.0849625)), 0.))		
			b_new = T.switch(T.ge(b_new, pow(2, - (n-1))), b_new, 
				T.switch(T.le(b_new, -pow(2, -(n-1))), b_new, T.sgn(b_new)*pow(2, -(n-1))))
		
			delta = T.abs_(alpha_new-alpha)
			condition = T.lt(delta, 1e-6)
			return [alpha_new, b_new], theano.scan_module.until(condition)

		[out1, out2], updates = theano.scan(fn=OneStep ,outputs_info=[alpha, b],n_steps=10)
		Wb  = out1[-1]*out2[-1]	

	return Wb
Пример #48
0
def hmc_updates(positions, stepsize, avg_acceptance_rate, final_pos, accept,
                target_acceptance_rate, stepsize_inc, stepsize_dec,
                stepsize_min, stepsize_max, avg_acceptance_slowness):
    """This function is executed after `n_steps` of HMC sampling
    (`hmc_move` function). It creates the updates dictionary used by
    the `simulate` function. It takes care of updating: the position
    (if the move is accepted), the stepsize (to track a given target
    acceptance rate) and the average acceptance rate (computed as a
    moving average).

    Parameters
    ----------
    positions: shared variable, theano matrix
        Shared theano matrix whose rows contain the old position
    stepsize: shared variable, theano scalar
        Shared theano scalar containing current step size
    avg_acceptance_rate: shared variable, theano scalar
        Shared theano scalar containing the current average acceptance rate
    final_pos: shared variable, theano matrix
        Shared theano matrix whose rows contain the new position
    accept: theano tensor
        Boolean-type variable (one entry per chain) indicating whether each
        proposed HMC move should be accepted.
    target_acceptance_rate: float
        The stepsize is modified in order to track this target acceptance rate.
    stepsize_inc: float
        Amount by which to increment stepsize when acceptance rate is too high.
    stepsize_dec: float
        Amount by which to decrement stepsize when acceptance rate is too low.
    stepsize_min: float
        Lower-bound on `stepsize`.
    stepsize_max: float
        Upper-bound on `stepsize`.
    avg_acceptance_slowness: float
        Average acceptance rate is computed as an exponential moving average.
        (1-avg_acceptance_slowness) is the weight given to the newest
        observation.

    Returns
    -------
    rval1: dictionary-like
        A dictionary of updates to be used by the `HMC_Sampler.simulate`
        function.  The updates target the position, stepsize and average
        acceptance rate.

    """

    ## POSITION UPDATES ##
    # broadcast `accept` (one flag per chain) to a tensor with the same
    # dimensions as final_pos.
    accept_matrix = accept.dimshuffle(0, *(('x', ) * (final_pos.ndim - 1)))
    # if accept is True, update to `final_pos` else stay put
    new_positions = TT.switch(accept_matrix, final_pos, positions)
    # end-snippet-5 start-snippet-7
    ## STEPSIZE UPDATES ##
    # if acceptance rate is too low, our sampler is too "noisy" and we reduce
    # the stepsize. If it is too high, our sampler is too conservative, we can
    # get away with a larger stepsize (resulting in better mixing).
    _new_stepsize = TT.switch(avg_acceptance_rate > target_acceptance_rate,
                              stepsize * stepsize_inc, stepsize * stepsize_dec)
    # maintain stepsize in [stepsize_min, stepsize_max]
    new_stepsize = TT.clip(_new_stepsize, stepsize_min, stepsize_max)

    # end-snippet-7 start-snippet-6
    ## ACCEPT RATE UPDATES ##
    # perform exponential moving average
    mean_dtype = theano.scalar.upcast(accept.dtype, avg_acceptance_rate.dtype)
    new_acceptance_rate = TT.add(avg_acceptance_slowness * avg_acceptance_rate,
                                 (1.0 - avg_acceptance_slowness) *
                                 accept.mean(dtype=mean_dtype))
    # end-snippet-6 start-snippet-8
    return [(positions, new_positions), (stepsize, new_stepsize),
            (avg_acceptance_rate, new_acceptance_rate)]
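# A self-contained, hedged illustration of the stepsize rule above: grow the step when
# the running acceptance rate exceeds the target, shrink it otherwise, then clip. The
# constants are illustrative, not the sampler's defaults.
import numpy
import theano
import theano.tensor as TT

stepsize = theano.shared(numpy.float64(0.01), name='stepsize')
avg_acceptance_rate = TT.dscalar('avg_acceptance_rate')

target_acceptance_rate, stepsize_inc, stepsize_dec = 0.9, 1.02, 0.98
stepsize_min, stepsize_max = 0.001, 0.25

_new_stepsize = TT.switch(avg_acceptance_rate > target_acceptance_rate,
                          stepsize * stepsize_inc, stepsize * stepsize_dec)
new_stepsize = TT.clip(_new_stepsize, stepsize_min, stepsize_max)

adapt = theano.function([avg_acceptance_rate], new_stepsize,
                        updates=[(stepsize, new_stepsize)])
print adapt(0.95), adapt(0.5)   # the stepsize grows, then shrinks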
Пример #49
0
def relu(x):
    # Using T.nnet.relu gives me NaNs. No idea why.
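    # With T.switch, a NaN input fails the (x > 0) test and comes out as 0, whereas
    # T.nnet.relu (0.5 * (x + abs(x))) propagates NaN, which likely explains the difference.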
    return T.switch(x > lib.floatX(0), x, lib.floatX(0))
Пример #50
0
    def updates(self):
        ups = {}
        add_updates = lambda b: safe_update(ups, b)

        base_lr = numpy.asarray(
            self.conf['base_lr_per_example'] / self.conf['batchsize'], floatX)
        annealing_coef = clip_ramp(self.iter,
                                   (self.conf['lr_anneal_start'], base_lr),
                                   (self.conf['lr_anneal_end'], 0.0),
                                   dtype=floatX)

        ups[self.iter] = self.iter + 1
        ups[self.annealing_coef] = annealing_coef

        #
        # Enforcing Sparsity
        #
        pos_h = self.rbm.mean_h_given_v(self.visible_batch)
        sparsity_cost = 0
        KL_eps = 1e-4
        if self.conf['sparsity_KL_featuretarget_weight']:
            p = self.conf['sparsity_KL_featuretarget_target']
            sparsity_cost = sparsity_cost + tensor.mul(
                self.conf['sparsity_KL_featuretarget_weight'], -tensor.sum(
                    p * tensor.log(tensor.mean(pos_h, axis=0) + KL_eps) +
                    (1 - p) *
                    tensor.log(1 - tensor.mean(pos_h, axis=0) + KL_eps)))
            assert sparsity_cost.ndim == 0
            assert sparsity_cost.dtype == 'float32'

        if self.conf['sparsity_KL_exampletarget_weight']:
            p = self.conf['sparsity_KL_exampletarget_target']
            sparsity_cost = sparsity_cost + tensor.mul(
                self.conf['sparsity_KL_exampletarget_weight'], -tensor.sum(
                    p * tensor.log(tensor.mean(pos_h, axis=1) + KL_eps) +
                    (1 - p) *
                    tensor.log(1 - tensor.mean(pos_h, axis=1) + KL_eps)))
            assert sparsity_cost.ndim == 0
            assert sparsity_cost.dtype == 'float32'

        #
        # Updates related to CD
        #
        # These updates are for CD-1, PCD/SML, and a stochastic interpolation
        # between them.
        #
        # The idea is to start from negative phase particles neg_v, run them
        # through a step of Gibbs, and put them into sampler.particles:
        #
        #    neg_v -> Gibbs -> sampler.particles.
        #
        # We control the kind of CD by adjusting what neg_v is: either it is the
        # visible_batch (for CD-1) or it is the old sampler.particles (PCD). We
        # can interpolate between the two algorithms by stochastically choosing
        # either the visible_batch[i] or the old particles[i] on a row-by-row
        # basis.
        #
        if self.conf['CD_anneal_start'] < self.conf['CD_anneal_end']:
            P_restart = clip_ramp(self.iter,
                                  (self.conf['CD_anneal_start'], 1.0),
                                  (self.conf['CD_anneal_end'], 0.0),
                                  dtype=floatX)
            reset_decisions = self.sampler.s_rng.uniform(
                size=(self.conf['batchsize'], )) < P_restart
            v0_ndim = self.visible_batch.ndim
            # broadcast reset_decisions over all but batch idx
            neg_v0 = tensor.switch(
                reset_decisions.dimshuffle(0, *(['x'] * (v0_ndim - 1))),
                self.visible_batch,  # reset the chain to data
                self.sampler.particles)  # continue old chain
        else:
            neg_v0 = self.sampler.particles
        neg_v1 = self.rbm.gibbs_step_for_v(neg_v0, self.sampler.s_rng)
        ups[self.sampler.particles] = neg_v1

        ## N.B. we are manually advancing the sampler, not calling
        ## Gibbs.updates()
        # add_updates(self.sampler.updates())
        learn_rates = [
            self.annealing_coef * self.lr_dict[p] for p in self.rbm.params()
        ]
        add_updates(
            self.rbm.cd_updates(pos_v=self.visible_batch,
                                neg_v=neg_v1,
                                lr=learn_rates,
                                other_cost=sparsity_cost))

        #
        # Gathering statistics of unit activity
        #
        neg_h = self.rbm.mean_h_given_v(neg_v1)
        self.pos_h_means = sharedX(numpy.zeros(self.rbm.h_shp) + 0.5, 'pos_h')
        self.neg_h_means = sharedX(numpy.zeros(self.rbm.h_shp) + 0.5, 'neg_h')
        ups[self.pos_h_means] = 0.1 * pos_h.mean(axis=0) + .9 * self.pos_h_means
        ups[self.neg_h_means] = 0.1 * neg_h.mean(axis=0) + .9 * self.neg_h_means

        # Clipping parameters to legal ranges
        for clipper in self.clippers:
            ups = clipper.filter_update(ups)
        return ups
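# A self-contained sketch (hypothetical shapes) of the row-wise CD/PCD interpolation
# used for neg_v0 above: each example independently either restarts its chain from the
# data or continues the persistent particle, with restart probability P_restart.
import numpy
import theano
import theano.tensor as tensor
from theano.sandbox.rng_mrg import MRG_RandomStreams

s_rng = MRG_RandomStreams(seed=123)
batchsize, n_vis = 4, 6

visible_batch = tensor.matrix('v')                                   # data rows
particles = theano.shared(numpy.zeros((batchsize, n_vis),
                                      dtype=theano.config.floatX))   # persistent chain
P_restart = 0.5                                                      # anneals from 1 to 0 above

reset_decisions = s_rng.uniform(size=(batchsize,)) < P_restart
neg_v0 = tensor.switch(reset_decisions.dimshuffle(0, 'x'),
                       visible_batch,   # this row restarts from data (CD-1 behaviour)
                       particles)       # this row continues its old chain (PCD behaviour)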
Пример #51
0
    def compute_cost(self,
                     features,
                     features_mask,
                     labels,
                     labels_mask,
                     speaker,
                     start_flag,
                     batch_size,
                     raw_audio=None):

        if speaker is None:
            assert not self.use_speaker

        target_features = features[1:]
        mask = features_mask[1:]

        cell_shape = (mask.shape[0], batch_size, self.rnn_h_dim)
        gat_shape = (mask.shape[0], batch_size, 2 * self.rnn_h_dim)
        cell_h1 = tensor.zeros(cell_shape, dtype=floatX)
        cell_h2 = tensor.zeros(cell_shape, dtype=floatX)
        cell_h3 = tensor.zeros(cell_shape, dtype=floatX)
        gat_h1 = tensor.zeros(gat_shape, dtype=floatX)
        gat_h2 = tensor.zeros(gat_shape, dtype=floatX)
        gat_h3 = tensor.zeros(gat_shape, dtype=floatX)

        if self.weak_feedback:
            input_features = features[:-1]

            if self.feedback_noise_level:
                noise = self.theano_rng.normal(size=input_features.shape,
                                               avg=0.,
                                               std=1.)
                input_features += self.noise_level_var * noise

            out_cell_h1, out_gat_h1 = self.out_to_h1.apply(input_features)

            to_normalize = [out_cell_h1, out_gat_h1]
            out_cell_h1, out_gat_h1 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            cell_h1 += out_cell_h1
            gat_h1 += out_gat_h1

        if self.full_feedback:
            assert self.weak_feedback
            out_cell_h2, out_gat_h2 = self.out_to_h2.apply(input_features)
            out_cell_h3, out_gat_h3 = self.out_to_h3.apply(input_features)

            to_normalize = [out_cell_h2, out_gat_h2, out_cell_h3, out_gat_h3]
            out_cell_h2, out_gat_h2, out_cell_h3, out_gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            cell_h2 += out_cell_h2
            gat_h2 += out_gat_h2
            cell_h3 += out_cell_h3
            gat_h3 += out_gat_h3

        if self.use_speaker:
            speaker = speaker[:, 0]
            emb_speaker = self.embed_speaker.apply(speaker)
            emb_speaker = tensor.shape_padleft(emb_speaker)

            spk_cell_h1, spk_gat_h1 = self.speaker_to_h1.apply(emb_speaker)
            spk_cell_h2, spk_gat_h2 = self.speaker_to_h2.apply(emb_speaker)
            spk_cell_h3, spk_gat_h3 = self.speaker_to_h3.apply(emb_speaker)

            to_normalize = [
                spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, spk_cell_h3,
                spk_gat_h3
            ]

            spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, \
                spk_cell_h3, spk_gat_h3, = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            cell_h1 = spk_cell_h1 + cell_h1
            cell_h2 = spk_cell_h2 + cell_h2
            cell_h3 = spk_cell_h3 + cell_h3
            gat_h1 = spk_gat_h1 + gat_h1
            gat_h2 = spk_gat_h2 + gat_h2
            gat_h3 = spk_gat_h3 + gat_h3

        initial_h1, last_h1, initial_h2, last_h2, initial_h3, last_h3, \
            initial_w, last_w, initial_k, last_k = \
            self.initial_states(batch_size)

        # If it's a new example, use initial states.
        input_h1 = tensor.switch(start_flag, initial_h1, last_h1)
        input_h2 = tensor.switch(start_flag, initial_h2, last_h2)
        input_h3 = tensor.switch(start_flag, initial_h3, last_h3)
        input_w = tensor.switch(start_flag, initial_w, last_w)
        input_k = tensor.switch(start_flag, initial_k, last_k)

        context_oh = self.encoder.apply(labels) * \
            tensor.shape_padright(labels_mask)

        u = tensor.shape_padleft(tensor.arange(labels.shape[1], dtype=floatX),
                                 2)

        def step(inp_h1_t, gat_h1_t, inp_h2_t, gat_h2_t, inp_h3_t, gat_h3_t,
                 h1_tm1, h2_tm1, h3_tm1, k_tm1, w_tm1, context_oh):

            attinp_h1, attgat_h1 = self.inp_to_h1.apply(w_tm1)
            inp_h1_t += attinp_h1
            gat_h1_t += attgat_h1

            h1_t = self.rnn1.apply(inp_h1_t, gat_h1_t, h1_tm1, iterate=False)

            a_t, b_t, k_t = self.h1_to_att.apply(h1_t)

            if self.attention_type == "softmax":
                a_t = tensor.nnet.softmax(a_t) + self.epsilon
            else:
                a_t = tensor.exp(a_t) + self.epsilon

            b_t = tensor.exp(b_t) + self.epsilon
            k_t = k_tm1 + self.attention_alignment * tensor.exp(k_t)

            a_t_ = a_t
            a_t = tensor.shape_padright(a_t)
            b_t = tensor.shape_padright(b_t)
            k_t_ = tensor.shape_padright(k_t)

            # batch size X att size X len context
            if self.attention_type == "softmax":
                # 0.3989422917366028 == 1 / sqrt(2 * pi), the Gaussian normalisation constant
                phi_t = 0.3989422917366028 * tensor.sum(
                    a_t * tensor.sqrt(b_t) * tensor.exp(-0.5 * b_t *
                                                        (k_t_ - u)**2),
                    axis=1)
            else:
                phi_t = tensor.sum(a_t * tensor.exp(-b_t * (k_t_ - u)**2),
                                   axis=1)

            # batch size X len context X num letters
            w_t = (tensor.shape_padright(phi_t) * context_oh).sum(axis=1)

            attinp_h2, attgat_h2 = self.inp_to_h2.apply(w_t)
            attinp_h3, attgat_h3 = self.inp_to_h3.apply(w_t)
            inp_h2_t += attinp_h2
            gat_h2_t += attgat_h2
            inp_h3_t += attinp_h3
            gat_h3_t += attgat_h3

            h1inp_h2, h1gat_h2 = self.h1_to_h2.apply(h1_t)
            h1inp_h3, h1gat_h3 = self.h1_to_h3.apply(h1_t)

            to_normalize = [h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3]
            h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            h2_t = self.rnn2.apply(inp_h2_t + h1inp_h2,
                                   gat_h2_t + h1gat_h2,
                                   h2_tm1,
                                   iterate=False)

            h2inp_h3, h2gat_h3 = self.h2_to_h3.apply(h2_t)

            to_normalize = [h2inp_h3, h2gat_h3]
            h2inp_h3, h2gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            h3_t = self.rnn3.apply(inp_h3_t + h1inp_h3 + h2inp_h3,
                                   gat_h3_t + h1gat_h3 + h2gat_h3,
                                   h3_tm1,
                                   iterate=False)

            return h1_t, h2_t, h3_t, k_t, w_t, phi_t, a_t_

        (h1, h2, h3, k, w, phi, pi_att), scan_updates = theano.scan(
            fn=step,
            sequences=[cell_h1, gat_h1, cell_h2, gat_h2, cell_h3, gat_h3],
            non_sequences=[context_oh],
            outputs_info=[
                input_h1, input_h2, input_h3, input_k, input_w, None, None
            ])

        h1_out = self.h1_to_readout.apply(h1)
        h2_out = self.h2_to_readout.apply(h2)
        h3_out = self.h3_to_readout.apply(h3)

        to_normalize = [h1_out, h2_out, h3_out]
        h1_out, h2_out, h3_out = \
            [_apply_norm(x, self.layer_norm) for x in to_normalize]

        readouts = h1_out + h2_out + h3_out

        if self.use_speaker:
            readouts += self.speaker_to_readout.apply(emb_speaker)

        readouts += self.att_to_readout.apply(w)

        predicted = self.readout_to_output.apply(readouts)

        if self.which_cost == 'MSE':
            if self.use_speaker:
                predicted += self.speaker_to_output.apply(emb_speaker)
            cost = tensor.sum((predicted - target_features)**2, axis=-1)

            next_x = predicted
            # Dummy value for coeff
            coeff = predicted
        elif self.which_cost == 'GMM':
            mu, sigma, coeff = predicted
            if self.use_speaker:
                spk_to_out = self.speaker_to_output.apply(emb_speaker)
                mu += spk_to_out[0]
                sigma += spk_to_out[1]
                coeff += spk_to_out[2]

            # When training there should not be sampling_bias
            sigma = tensor.exp(sigma) + self.epsilon

            coeff = tensor.nnet.softmax(coeff.reshape(
                (-1, self.k_gmm))).reshape(coeff.shape) + self.epsilon

            cost = cost_gmm(target_features, mu, sigma, coeff)
            next_x = sample_gmm(mu, sigma, coeff, self.theano_rng)

        # The 0. * start_flag term contributes nothing to the value but keeps
        # start_flag in the computation graph.
        cost = (cost * mask).sum() / (mask.sum() + 1e-5) + 0. * start_flag

        updates = []
        updates.append((last_h1, h1[-1]))
        updates.append((last_h2, h2[-1]))
        updates.append((last_h3, h3[-1]))
        updates.append((last_k, k[-1]))
        updates.append((last_w, w[-1]))

        cost_raw = None
        if self.raw_output:
            raw_mask = tensor.extra_ops.repeat(features_mask, 80, axis=0)
            raw_mask = raw_mask.dimshuffle(1, 0)

            # breakpointOp = PdbBreakpoint("Raw mask breakpoint")
            # condition = tensor.gt(raw_mask.shape[0], 0)
            # raw_mask = breakpointOp(condition, raw_mask)

            predicted_transposed = predicted.dimshuffle(1, 0, 2)

            last_h0, last_big_h0 = self.sampleRnn.initial_states(batch_size)
            raw_audio_reshaped = raw_audio.dimshuffle(1, 0, 2)
            raw_audio_reshaped = raw_audio_reshaped.reshape(
                (raw_audio_reshaped.shape[0], -1))
            cost_raw, ip_cost, all_params, ip_params, other_params, new_h0, new_big_h0 =\
                self.sampleRnn.apply(raw_audio_reshaped, predicted_transposed, last_h0, last_big_h0, start_flag, raw_mask)

            if self.sampleRnn.N_RNN == 1:
                new_h0 = tensor.unbroadcast(new_h0, 1)
                new_big_h0 = tensor.unbroadcast(new_big_h0, 1)

            updates.append((last_h0, new_h0))
            updates.append((last_big_h0, new_big_h0))
            # cost = cost + 80.*cost_raw
            alpha_ = numpy.float32(0.)
            beta_ = numpy.float32(1.)
            cost = alpha_ * cost + beta_ * cost_raw

        attention_vars = [next_x, k, w, coeff, phi, pi_att]

        return cost, scan_updates + updates, attention_vars, cost_raw
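The helpers cost_gmm and sample_gmm are not shown in this snippet. As a rough sketch only, a diagonal mixture-of-Gaussians negative log-likelihood of the kind cost_gmm presumably computes could look like the following; the function names, axis layout (mu/sigma as (..., dim, k), coeff as (..., k)) and broadcasting are assumptions, not taken from this codebase:

import numpy
import theano.tensor as tensor

def logsumexp_sketch(x):
    # numerically stable log-sum-exp over the last axis
    x_max = tensor.max(x, axis=-1)
    return tensor.log(tensor.sum(tensor.exp(x - tensor.shape_padright(x_max)),
                                 axis=-1)) + x_max

def gmm_nll_sketch(y, mu, sigma, coeff):
    # y: (..., dim) targets; returns a per-step negative log-likelihood
    y = tensor.shape_padright(y)                               # (..., dim, 1)
    log_comp = (-0.5 * ((y - mu) / sigma) ** 2
                - tensor.log(sigma)
                - 0.5 * numpy.log(2 * numpy.pi)).sum(axis=-2)  # (..., k)
    return -logsumexp_sketch(tensor.log(coeff) + log_comp)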
Example #52
0
    def out_shape(imgshape, ds, ignore_border=False, st=None):
        """Return the shape of the output from this op, for input of given
        shape and flags.

        :param imgshape: the shape of a tensor of images. The last two elements
            are interpreted as the number of rows, and the number of cols.
        :type imgshape: tuple, list, or similar of integer or
            scalar Theano variable.

        :param ds: downsample factor over rows and columns
                   this parameter indicates the size of the pooling region
        :type ds: list or tuple of two ints

        :param st: the stride size, i.e. the distance between pooling
                   regions. If it is None, it defaults to ds.
        :type st: list or tuple of two ints

        :param ignore_border: if ds doesn't divide imgshape, do we include an
            extra row/col of partial downsampling (False) or ignore it (True).
        :type ignore_border: bool

        :rtype: list
        :returns: the shape of the output from this op, for input of given
            shape.  This will have the same length as imgshape, but with last
            two elements reduced as per the downsampling & ignore_border flags.
        """
        if len(imgshape) < 2:
            raise TypeError('imgshape must have at least two elements '
                            '(rows, cols)')

        if st is None:
            st = ds
        r, c = imgshape[-2:]

        if ignore_border:
            out_r = (r - ds[0]) // st[0] + 1
            out_c = (c - ds[1]) // st[1] + 1
            if isinstance(r, theano.Variable):
                nr = tensor.maximum(out_r, 0)
            else:
                nr = numpy.maximum(out_r, 0)
            if isinstance(c, theano.Variable):
                nc = tensor.maximum(out_c, 0)
            else:
                nc = numpy.maximum(out_c, 0)
        else:
            if isinstance(r, theano.Variable):
                nr = tensor.switch(
                    tensor.ge(st[0], ds[0]), (r - 1) // st[0] + 1,
                    tensor.maximum(0, (r - 1 - ds[0]) // st[0] + 1) + 1)
            elif st[0] >= ds[0]:
                nr = (r - 1) // st[0] + 1
            else:
                nr = max(0, (r - 1 - ds[0]) // st[0] + 1) + 1

            if isinstance(c, theano.Variable):
                nc = tensor.switch(
                    tensor.ge(st[1], ds[1]), (c - 1) // st[1] + 1,
                    tensor.maximum(0, (c - 1 - ds[1]) // st[1] + 1) + 1)
            elif st[1] >= ds[1]:
                nc = (c - 1) // st[1] + 1
            else:
                nc = max(0, (c - 1 - ds[1]) // st[1] + 1) + 1

        rval = list(imgshape[:-2]) + [nr, nc]
        return rval
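For concreteness, with the stride left at its default (st = ds), the formulas above give the following; the shape (3, 7, 7) and pooling size (2, 2) are just an illustration, and the direct call assumes out_shape is usable as a plain function (it never touches self):

# ignore_border=True : (7 - 2) // 2 + 1 = 3
# ignore_border=False: st >= ds, so (7 - 1) // 2 + 1 = 4
print(out_shape((3, 7, 7), (2, 2), ignore_border=True))   # -> [3, 3, 3]
print(out_shape((3, 7, 7), (2, 2), ignore_border=False))  # -> [3, 4, 4]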
Example #53
0
 def RMSprop(self,
             cost,
             params,
             full_params,
             sampled_params,
             sidxs,
             epsilon=1e-6):
     grads = [T.grad(cost=cost, wrt=param) for param in params]
     sgrads = [T.grad(cost=cost, wrt=sparam) for sparam in sampled_params]
     updates = OrderedDict()
     if self.grad_cap > 0:
         norm = T.cast(
             T.sqrt(
                 T.sum([
                     T.sum([T.sum(g**2) for g in g_list])
                     for g_list in grads
                 ]) + T.sum([T.sum(g**2) for g in sgrads])),
             theano.config.floatX)
         grads = [[
             T.switch(T.ge(norm, self.grad_cap), g * self.grad_cap / norm,
                      g) for g in g_list
         ] for g_list in grads]
         sgrads = [
             T.switch(T.ge(norm, self.grad_cap), g * self.grad_cap / norm,
                      g) for g in sgrads
         ]
     for p_list, g_list in zip(params, grads):
         for p, g in zip(p_list, g_list):
             if self.adapt:
                 if self.adapt == 'adagrad':
                     g = self.adagrad(p, g, updates)
                 if self.adapt == 'rmsprop':
                     g = self.rmsprop(p, g, updates)
                 if self.adapt == 'adadelta':
                     g = self.adadelta(p, g, updates)
                 if self.adapt == 'adam':
                     g = self.adam(p, g, updates)
             if self.momentum > 0:
                 velocity = theano.shared(p.get_value(borrow=False) * 0.,
                                          borrow=True)
                 velocity2 = self.momentum * velocity - np.float32(
                     self.learning_rate) * (g + self.lmbd * p)
                 updates[velocity] = velocity2
                 updates[p] = p + velocity2
             else:
                 updates[p] = p * np.float32(1.0 - self.learning_rate *
                                             self.lmbd) - np.float32(
                                                 self.learning_rate) * g
     for i in range(len(sgrads)):
         g = sgrads[i]
         fullP = full_params[i]
         sample_idx = sidxs[i]
         sparam = sampled_params[i]
         if self.adapt:
             if self.adapt == 'adagrad':
                 g = self.adagrad(fullP, g, updates, sample_idx)
             if self.adapt == 'rmsprop':
                 g = self.rmsprop(fullP, g, updates, sample_idx)
             if self.adapt == 'adadelta':
                 g = self.adadelta(fullP, g, updates, sample_idx)
             if self.adapt == 'adam':
                 g = self.adam(fullP, g, updates, sample_idx)
         if self.lmbd > 0:
             delta = np.float32(
                 self.learning_rate) * (g + self.lmbd * sparam)
         else:
             delta = np.float32(self.learning_rate) * g
         if self.momentum > 0:
             velocity = theano.shared(fullP.get_value(borrow=False) * 0.,
                                      borrow=True)
             vs = velocity[sample_idx]
             velocity2 = self.momentum * vs - delta
             updates[velocity] = T.set_subtensor(vs, velocity2)
             updates[fullP] = T.inc_subtensor(sparam, velocity2)
         else:
             updates[fullP] = T.inc_subtensor(sparam, -delta)
     return updates
Example #54
0
 def clear_nan(x):
     return T.switch(T.isnan(x), np.float32(0.0), x)
Example #55
0
 def leaky(self, X):
     return T.switch(T.ge(X, 0), X, self.leak * X)
Example #56
0
def build_sampler(tparams, options, trng):
    x = tensor.matrix('x', dtype='int64')
    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    # word embedding (source)
    emb = tparams['Wemb'][x.flatten()]
    emb = emb.reshape([n_timesteps, n_samples, options['dim_word']])

    # encoder
    proj = get_layer(options['encoder'])[1](tparams,
                                            emb,
                                            options,
                                            prefix='encoder')
    ctx = proj[0][-1]
    ctx_mean = ctx
    init_state = get_layer('ff')[1](tparams,
                                    ctx_mean,
                                    options,
                                    prefix='ff_state',
                                    activ='tanh')

    print 'Building f_init...',
    outs = [init_state, ctx]
    f_init = theano.function([x], outs, name='f_init', profile=profile)
    print 'Done'

    # y: 1 x 1
    y = tensor.vector('y_sampler', dtype='int64')
    init_state = tensor.matrix('init_state', dtype='float32')

    # if it's the first word, emb should be all zero
    emb = tensor.switch(y[:, None] < 0,
                        tensor.alloc(0., 1, tparams['Wemb_dec'].shape[1]),
                        tparams['Wemb_dec'][y])

    # apply one step of gru layer
    proj = get_layer(options['decoder'])[1](tparams,
                                            emb,
                                            options,
                                            prefix='decoder',
                                            mask=None,
                                            context=ctx,
                                            one_step=True,
                                            init_state=init_state)
    next_state = proj
    ctxs = ctx

    # compute the output probability dist and sample
    logit_lstm = get_layer('ff')[1](tparams,
                                    next_state,
                                    options,
                                    prefix='ff_logit_lstm',
                                    activ='linear')
    logit_prev = get_layer('ff')[1](tparams,
                                    emb,
                                    options,
                                    prefix='ff_logit_prev',
                                    activ='linear')
    logit_ctx = get_layer('ff')[1](tparams,
                                   ctxs,
                                   options,
                                   prefix='ff_logit_ctx',
                                   activ='linear')
    logit = tensor.tanh(logit_lstm + logit_prev + logit_ctx)
    logit = get_layer('ff')[1](tparams,
                               logit,
                               options,
                               prefix='ff_logit',
                               activ='linear')
    next_probs = tensor.nnet.softmax(logit)
    next_sample = trng.multinomial(pvals=next_probs).argmax(1)

    # next word probability
    print 'Building f_next..',
    inps = [y, ctx, init_state]
    outs = [next_probs, next_sample, next_state]
    f_next = theano.function(inps, outs, name='f_next', profile=profile)
    print 'Done'

    return f_init, f_next
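A minimal sampling loop built on the two compiled functions above might look like this. The -1 value for the first word follows the tensor.switch on y above; the end-of-sentence index 0, the maxlen value, and the variable names are assumptions, and the real sampler in this codebase may differ:

import numpy

def sample_sketch(f_init, f_next, x_src, maxlen=30, eos_idx=0):
    # x_src: int64 matrix of shape (n_timesteps, 1) holding one source sentence
    next_state, ctx = f_init(x_src)
    next_w = -1 * numpy.ones((1,), dtype='int64')  # y < 0 -> zero embedding
    sample = []
    for _ in range(maxlen):
        probs, next_w, next_state = f_next(next_w, ctx, next_state)
        sample.append(next_w[0])
        if next_w[0] == eos_idx:  # assumed end-of-sentence index
            break
    return sample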
Example #57
0
def in_test_phase(x, alt):
    x = T.switch(_LEARNING_PHASE, alt, x)
    x._uses_learning_phase = True
    return x
Example #58
0
 def elu(self, X):
     return T.switch(T.ge(X, 0), X, self.elu_param * (T.exp(X) - 1))
Example #59
0
def switch(condition, then_expression, else_expression):
    '''condition: scalar tensor.
    '''
    return T.switch(condition, then_expression, else_expression)
Example #60
0
        best_ppl = np.inf
        validation_ppl_history = []

        gsums = [theano.shared(np.zeros_like(param.get_value(borrow=True))) for param in net.params]

    cost = net.cost(y) + L2_REG * net.L2_sqr

    gparams = T.grad(cost, net.params)
    updates = OrderedDict()

    # Compute norm of gradients
    norm = T.sqrt(T.sum([T.sum(gparam**2) for gparam in gparams]))

    # Adagrad: "Adaptive subgradient methods for online learning and stochastic optimization" (2011)
    for gparam, param, gsum in zip(gparams, net.params, gsums):
        gparam = T.switch(T.ge(norm, CLIPPING_THRESHOLD), gparam / norm * CLIPPING_THRESHOLD, gparam) # Clipping of gradients
        updates[gsum] = gsum + (gparam**2)
        updates[param] = param - lr * (gparam / (T.sqrt(updates[gsum] + 1e-6)))

    train_model = theano.function(inputs=[x, y, lr], outputs=cost, updates=updates)

    validate_model = theano.function(inputs=[x, y], outputs=net.cost(y))

    print("Training...")
    for epoch in range(starting_epoch, MAX_EPOCHS):
        t0 = time()
        total_neg_log_likelihood = 0
        total_num_output_samples = 0
        iteration = 0
        for X, Y in get_minibatch(data.TRAIN_FILE, MINIBATCH_SIZE, shuffle=True):
            total_neg_log_likelihood += train_model(X, Y, learning_rate)