def backward_V_step(rewards, is_end, next_Vpred, is_tmax, next_Vref,
                    *args  # you won't dare delete me
                    ):
    """scan inner computation step, going backwards in time
    params:
        rewards, is_alive, next_Vpred, time_i - sequences
        next_Vref - recurrent state value for next turn

    returns:
        current_Vref - recurrent state value at this turn

    current_Vref is computed thus:
        Once every n_steps or at session end:
            current_Vref = r + gamma*next_Vpred   # computation through next predicted state value
        Otherwise:
            current_Vref = r + gamma*next_Vref    # recurrent computation through next Qvalue
    """
    propagated_Vref = rewards + gamma_or_gammas * next_Vref  # propagates value from actual next action
    optimal_Vref = rewards + gamma_or_gammas * next_Vpred    # uses agent's prediction for next state

    # pick new_Vref if is_Tmax, else propagate existing one
    chosen_Vref = T.switch(is_tmax, optimal_Vref, propagated_Vref)

    # zero out references if session has ended already
    this_Vref = T.switch(is_end, rewards, chosen_Vref)

    return this_Vref
def get_gradients(self, model, data, **kwargs):
    cost = self.expr(model=model, data=data, **kwargs)

    params = list(model.get_params())
    grads = T.grad(cost, params, disconnected_inputs='ignore')
    gradients = OrderedDict(izip(params, grads))

    if self.gradient_clipping:
        norm_gs = 0.
        for grad in gradients.values():
            norm_gs += (grad ** 2).sum()
        not_finite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))
        norm_gs = T.sqrt(norm_gs)
        # reuse norm_gs as the rescaling factor: shrink by max_magnitude / norm
        # whenever the global norm exceeds max_magnitude, otherwise leave as-is
        norm_gs = T.switch(T.ge(norm_gs, self.max_magnitude),
                           self.max_magnitude / norm_gs,
                           1.)
        for param, grad in gradients.items():
            gradients[param] = T.switch(not_finite,
                                        .1 * param,
                                        grad * norm_gs)

    updates = OrderedDict()

    return gradients, updates
def pass_edges(input_idx_t, edge_t, edge_mask_t, counter_t, h_tm1, c_tm1, x): h_t = h_tm1 c_t = c_tm1 # select the input vector to use for this edge (source) x_t_i = x[input_idx_t, :] # zero out the input unless this is a leaf node x_t_0 = T.switch(T.eq(T.sum(edge_mask_t), 0), x_t_i, x_t_i*0) # concatenate with the input edge vector x_t_edge = T.concatenate([x_t_0, edge_t]) # compute attention weights, using a manual softmax attention_scores = T.dot(self.v_a, T.tanh(T.dot(self.W_h_a, h_tm1))) # (1, n_edges) # find the max of the unmasked values max_score = T.max(attention_scores + edge_mask_t * 10000.0) - 10000.0 # exponentiate the differences, masking first to avoid inf, and then to keep only relevant scores exp_scores = T.exp((attention_scores - max_score) * edge_mask_t) * edge_mask_t # take the sum, and add one if the mask is all zeros to avoid an inf exp_scores_sum = T.sum(exp_scores) + T.switch(T.eq(T.sum(edge_mask_t), 0), 1.0, 0.0) # normalize to compute the weights weighted_mask = exp_scores / exp_scores_sum i_t = T.nnet.sigmoid(T.dot(x_t_edge, self.W_x_i) + T.sum(T.dot(self.W_h_i.T, (weighted_mask * h_tm1)).T, axis=0) + self.b_h_i) f_t = T.nnet.sigmoid(T.dot(x_t_edge, self.W_x_f) + T.sum(T.dot(self.W_h_f.T, (weighted_mask * h_tm1)).T, axis=0) + self.b_h_f) o_t = T.nnet.sigmoid(T.dot(x_t_edge, self.W_x_o) + T.sum(T.dot(self.W_h_o.T, (weighted_mask * h_tm1)).T, axis=0) + self.b_h_o) u_t = T.tanh(T.dot(x_t_edge, self.W_x_u) + T.sum(T.dot(self.W_h_u.T, (weighted_mask * h_tm1)).T, axis=0) + self.b_h_u) c_temp = i_t * u_t + f_t * T.sum((weighted_mask * c_tm1).T, axis=0) h_temp = o_t * T.tanh(c_temp) h_t = T.set_subtensor(h_t[:, counter_t], h_temp) c_t = T.set_subtensor(c_t[:, counter_t], c_temp) return h_t, c_t
def mindist(translate, min_so_far, ro, rd):
    # ro: 3
    # translate: nbatch * 3
    # min_so_far: nbatch * width * height
    # rd: width * height * 3
    ro = ro + translate
    # d_o = T.dot(rd, ro)  # 640, 480
    # d_o = dotty(rd, ro, axis=1)
    d_o = T.tensordot(rd, ro, axes=[2, 1])
    o_o = T.sum(ro**2, axis=1)
    b = 2 * d_o
    c = o_o - 0.001  # FIXME, remove this squaring
    inner = b ** 2 - 4 * c  # 640 480
    does_not_intersect = inner < 0.0
    minus_b = -b
    # sqrt_inner = T.sqrt(T.maximum(0.0001, inner))
    eps = 1e-9
    background_dist = 10.0
    sqrt_inner = T.sqrt(T.maximum(eps, inner))
    root1 = (minus_b - sqrt_inner) / 2.0
    root2 = (minus_b + sqrt_inner) / 2.0
    depth = T.switch(does_not_intersect, background_dist,
                     T.switch(root1 > 0, root1,
                              T.switch(root2 > 0, root2, background_dist)))
    return T.min([min_so_far, depth], axis=0)
def backward_V_step(rewards, is_alive, next_Vpred, time_i, next_Vref, *args):

    propagated_Vref = T.switch(is_alive,
                               rewards + gamma_or_gammas * next_Vref,  # assumes optimal next action
                               0.)

    if n_steps is None:
        this_Vref = propagated_Vref
    else:
        Vref_at_tmax = T.switch(is_alive,
                                rewards + gamma_or_gammas * next_Vpred,
                                0.)

        this_Vref = T.switch(T.eq(time_i % n_steps, 0),  # if Tmax
                             Vref_at_tmax,       # use special case values
                             propagated_Vref)    # else use generic ones

    return this_Vref
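# A small numeric illustration of the two cases above (all values arbitrary):
# with gamma_or_gammas = 0.99, rewards = 1.0 and is_alive = 1,
#   at a Tmax step (time_i % n_steps == 0), next_Vpred = 10.0
#       -> this_Vref = 1.0 + 0.99 * 10.0 = 10.9
#   at any other step, next_Vref = 8.0
#       -> this_Vref = 1.0 + 0.99 * 8.0 = 8.92
# Dead steps (is_alive = 0) get this_Vref = 0 in both branches.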
def __init__(self, random_state=None, low=0.0, high=1.0):
    super(Uniform, self).__init__(low=low, high=high,
                                  random_state=random_state,
                                  optimizer=None)

    # pdf
    self.pdf_ = T.switch(
        T.or_(T.lt(self.X, self.low), T.ge(self.X, self.high)),
        0.,
        1. / (self.high - self.low)).ravel()
    self.make_(self.pdf_, "pdf")

    # -log pdf
    self.nnlf_ = T.switch(
        T.or_(T.lt(self.X, self.low), T.ge(self.X, self.high)),
        np.inf,
        T.log(self.high - self.low)).ravel()
    self.make_(self.nnlf_, "nnlf")

    # cdf
    self.cdf_ = T.switch(
        T.lt(self.X, self.low),
        0.,
        T.switch(
            T.lt(self.X, self.high),
            (self.X - self.low) / (self.high - self.low),
            1.)).ravel()
    self.make_(self.cdf_, "cdf")

    # ppf
    self.ppf_ = self.p * (self.high - self.low) + self.low
    self.make_(self.ppf_, "ppf", args=[self.p])
def s_logprior(self, s_params, strength=10.0):
    # -- I don't know what distribution this would be
    #    but I think it makes a nice shape
    s_alpha, s_cond_x, s_cond_y = self.unpack(s_params)
    n_alpha_min = self._alpha_from_l(self._lenscales_min)
    n_alpha_max = self._alpha_from_l(self._lenscales_max)
    #return strength * (alpha - alpha_min) ** 2
    log0 = -10000
    width = n_alpha_max - n_alpha_min
    #alpha_mean = 0.5 * (alpha_max + alpha_min)
    energy = strength * 0.5 * (s_alpha - n_alpha_max) ** 2 / width ** 2
    lenscale_logprior = TT.switch(s_alpha < n_alpha_min,
                                  log0,
                                  TT.switch(s_alpha < n_alpha_max,
                                            -energy,
                                            log0)).sum()
    if self._conditional:
        diff_x = s_cond_x
        diff_y = s_cond_y - 1
        rval = (lenscale_logprior
                + TT.dot(diff_x, diff_x)
                + TT.dot(diff_y, diff_y))
    else:
        rval = lenscale_logprior
    assert rval.ndim == 0
    return rval
def T_subspacel1_slow_shrinkage(a, L, lam_sparse, lam_slow, small_value=.001):
    amp = T.sqrt(a[::2, :]**2 + a[1::2, :]**2 + small_value)
    #damp = amp[:,1:] - amp[:,:-1]

    # compose slow shrinkage with subspace l1 shrinkage

    # slow shrinkage
    div = T.zeros_like(amp)
    d1 = amp[:, 1:] - amp[:, :-1]
    d2 = d1[:, 1:] - d1[:, :-1]
    div = T.set_subtensor(div[:, 1:-1], -d2)
    div = T.set_subtensor(div[:, 0], -d1[:, 0])
    div = T.set_subtensor(div[:, -1], d1[:, -1])
    slow_amp_shrinkage = 1 - (lam_slow / L) * (div / amp)
    slow_amp_value = T.switch(T.gt(slow_amp_shrinkage, 0), slow_amp_shrinkage, 0)
    slow_shrinkage_prox_a = slow_amp_value * a[::2, :]
    slow_shrinkage_prox_b = slow_amp_value * a[1::2, :]

    # subspace l1 shrinkage
    amp_slow_shrinkage_prox = T.sqrt(slow_shrinkage_prox_a**2 + slow_shrinkage_prox_b**2)
    #amp_shrinkage = 1. - (lam_slow*lam_sparse/L)*amp_slow_shrinkage_prox
    amp_shrinkage = 1. - (lam_sparse / L) / amp_slow_shrinkage_prox
    amp_value = T.switch(T.gt(amp_shrinkage, 0.), amp_shrinkage, 0.)
    subspacel1_prox = T.zeros_like(a)
    subspacel1_prox = T.set_subtensor(subspacel1_prox[::2, :], amp_value * slow_shrinkage_prox_a)
    subspacel1_prox = T.set_subtensor(subspacel1_prox[1::2, :], amp_value * slow_shrinkage_prox_b)
    return subspacel1_prox
def create_cost_fun(self):
    # create a cost function that
    # takes each prediction at every timestep
    # and guesses next timestep's value:
    what_to_predict = self.input_mat[:, 1:]

    # because some sentences are shorter, we
    # place masks where the sentences end:
    # (for_how_long is zero indexed, e.g. an example going over `[2,3)`
    # has this value set to 0; here we subtract 1):
    for_how_long = self.for_how_long - 1

    # all sentences start at T=0:
    starting_when = T.zeros_like(self.for_how_long)

    self.lstm_cost = masked_loss(self.lstm_predictions,
                                 what_to_predict,
                                 for_how_long,
                                 starting_when).sum()

    zero_entropy = T.zeros_like(self.entropy)
    real_entropy = T.switch(self.mask_matrix, self.entropy, zero_entropy)

    zero_key_entropy = T.zeros_like(self.key_entropy)
    real_key_entropy = T.switch(self.mask_matrix, self.key_entropy, zero_key_entropy)

    self.final_cost = masked_loss(self.final_predictions,
                                  what_to_predict,
                                  for_how_long,
                                  starting_when).sum() + \
        self.entropy_reg * real_entropy.sum() + \
        self.key_entropy_reg * real_key_entropy.sum()
def train_fprop(self, X, wts=None, bs=None):
    ''' Performs forward propagation for training, which could be different
    from the vanilla fprop we would use for testing, due to extra bells and
    whistles such as dropout, corruption, etc '''

    if wts is None and bs is None:
        wts = self.wts_
        bs = self.bs_

    if 'dropout' in self.loss_terms:
        input_p = self.loss_params['input_p']
        hidden_p = self.loss_params['hidden_p']

        # compute the first activation separately in case we have no hidden
        # layer;
        act = self.activs[0](
            T.dot(self.dropout(X, input_p), wts[0]) + bs[0])
        if len(wts) > 1:  # len(wts) = 1 corresponds to softmax regression
            for i, (w, b, activ) in enumerate(zip(wts[1:], bs[1:], self.activs[1:])):
                act = activ(T.dot(self.dropout(act, hidden_p), w) + b)

        eps = 1e-6
        act = T.switch(act < eps, eps, act)
        act = T.switch(act > (1. - eps), (1. - eps), act)

        return act
    else:
        return self.fprop(X, wts, bs)
def T_subspacel1_slow_shrinkage_conv(a, L, lam_sparse, lam_slow, imshp,kshp,featshp,stride=(1,1),small_value=.001): featshp = (imshp[0],kshp[0],featshp[2],featshp[3]) # num images, features, szy, szx features = T.reshape(T.transpose(a),featshp,ndim=4) amp = T.sqrt(features[:,::2,:,:]**2 + features[:,1::2,:,:]**2 + small_value) #damp = amp[:,1:] - amp[:,:-1] # compose slow shrinkage with subspace l1 shrinkage # slow shrinkage div = T.zeros_like(amp) d1 = amp[1:,:,:,:] - amp[:-1,:,:,:] d2 = d1[1:,:,:,:] - d1[:-1,:,:,:] div = T.set_subtensor(div[1:-1,:,:,:], -d2) div = T.set_subtensor(div[0,:,:,:], -d1[0,:,:,:]) div = T.set_subtensor(div[-1,:,:,:], d1[-1,:,:,:]) slow_amp_shrinkage = 1 - (lam_slow / L) * (div / amp) slow_amp_value = T.switch(T.gt(slow_amp_shrinkage, 0), slow_amp_shrinkage, 0) slow_shrinkage_prox_a = slow_amp_value * features[:, ::2, :,:] slow_shrinkage_prox_b = slow_amp_value * features[:,1::2, :,:] # subspace l1 shrinkage amp_slow_shrinkage_prox = T.sqrt(slow_shrinkage_prox_a ** 2 + slow_shrinkage_prox_b ** 2) #amp_shrinkage = 1. - (lam_slow*lam_sparse/L)*amp_slow_shrinkage_prox amp_shrinkage = 1. - (lam_sparse / L) / amp_slow_shrinkage_prox amp_value = T.switch(T.gt(amp_shrinkage, 0.), amp_shrinkage, 0.) subspacel1_prox = T.zeros_like(features) subspacel1_prox = T.set_subtensor(subspacel1_prox[:, ::2, :,:], amp_value * slow_shrinkage_prox_a) subspacel1_prox = T.set_subtensor(subspacel1_prox[:,1::2, :,:], amp_value * slow_shrinkage_prox_b) reshape_subspacel1_prox = T.transpose(T.reshape(subspacel1_prox,(featshp[0],featshp[1]*featshp[2]*featshp[3]),ndim=2)) return reshape_subspacel1_prox
def castray(ro, rd, shape_params, nprims, width, height):
    tmin = 1.0
    tmax = 20.0
    precis = 0.002
    m = -1.0
    # There are a sequence of distances, d1, d2, ..., dn,
    # then there's the accumulated distances d1, d1+d2, d1+d2+d3, ...
    # What we actually want in the output is, for each ray, the distance to the surface.
    # So we want something like 0, 20, 25, 27, 28, 28, 28, 28, 28
    # OK
    max_num_steps = 25

    # distcolors = map(ro + rd * 0, width, height)  # FIXME, reshape instead of mul by 0
    distcolors = mapedit(ro + rd * 0, shape_params, nprims, width, height)
    dists = distcolors
    steps = T.switch(dists < precis, T.zeros_like(dists), T.ones_like(dists))
    accum_dists = T.reshape(dists, (width, height, 1))

    for i in range(max_num_steps - 1):
        # distcolors = map(ro + rd * accum_dists, width, height)  # FIXME, reshape instead of mul by 0
        distcolors = mapedit(ro + rd * accum_dists, shape_params, nprims, width, height)  # FIXME, reshape instead of mul by 0
        dists = distcolors
        steps = steps + T.switch(dists < precis, T.zeros_like(dists), T.ones_like(dists))
        accum_dists = accum_dists + T.reshape(dists, (width, height, 1))

    last_depth = T.reshape(accum_dists, (width, height))
    depthmap = T.switch(last_depth < tmax, last_depth / tmax, T.zeros_like(last_depth))
    color = 1.0 - steps / float(max_num_steps)
    # Distance marched along ray and delta between last two steps
    return depthmap
def mcmc(ll, *frvs):
    full_observations = dict(observations)
    full_observations.update(
        dict([(rv, s) for rv, s in zip(free_RVs, frvs)]))

    loglik = -full_log_likelihood(full_observations)

    proposals = free_RVs_prop
    H = tensor.add(*[tensor.sum(tensor.sqr(p)) for p in proposals]) / 2. + loglik

    # -- this should be an inner loop
    g = []
    g.append(tensor.grad(loglik, frvs))

    proposals = [(p - epsilon * gg[0] / 2.) for p, gg in zip(proposals, g)]

    rvsp = [(rvs + epsilon * rvp) for rvs, rvp in zip(frvs, proposals)]

    full_observations = dict(observations)
    full_observations.update(
        dict([(rv, s) for rv, s in zip(free_RVs, rvsp)]))
    new_loglik = -full_log_likelihood(full_observations)

    gnew = []
    gnew.append(tensor.grad(new_loglik, rvsp))
    proposals = [(p - epsilon * gn[0] / 2.) for p, gn in zip(proposals, gnew)]
    # --

    Hnew = tensor.add(*[tensor.sum(tensor.sqr(p)) for p in proposals]) / 2. + new_loglik

    dH = Hnew - H
    accept = tensor.or_(dH < 0., U < tensor.exp(-dH))

    return [tensor.switch(accept, -new_loglik, ll)] + \
        [tensor.switch(accept, p, f) for p, f in zip(rvsp, frvs)], \
        {}, theano.scan_module.until(accept)
def convert_method(self, method_string):
    if method_string == 'sigmoid':
        return Tensor.nnet.sigmoid
    elif method_string == 'tanh':
        return Tensor.tanh
    elif method_string == 'scaled_tanh':
        return lambda x: 1.7159 * Tensor.tanh(0.66 * x)
    elif method_string == 'soft_sigmoid':
        return soft_sigmoid
    elif method_string == 'relu':
        return lambda x: x * (x > 0)
    elif method_string == 'relu2':
        return lambda x: Tensor.switch(Tensor.lt(x, -1), -1, x) * Tensor.switch(Tensor.gt(x, 1), 1, x) / x
    elif method_string == 'leakyrelu':
        return lambda x: x * (x > 0) + 0.01 * x * (x < 0)
    elif method_string == 'shiftedrelu':
        return lambda x: x * (x > -1)
    elif method_string == 'hard_sigmoid':
        return Tensor.nnet.hard_sigmoid
    elif method_string == 'none':
        return lambda x: x
    else:
        raise Exception('method unknown')
def out_shape(imgshape, ds, ignore_border=False):
    """Return the shape of the output from this op, for input of given
    shape and flags.

    :param imgshape: the shape of a tensor of images. The last two elements
        are interpreted as the number of rows, and the number of cols.
    :type imgshape: tuple, list, or similar of integer or
        scalar Theano variable.

    :param ds: downsample factor over rows and columns
    :type ds: list or tuple of two ints

    :param ignore_border: if ds doesn't divide imgshape, do we include an
        extra row/col of partial downsampling (False) or ignore it (True).
    :type ignore_border: bool

    :rtype: list
    :returns: the shape of the output from this op, for input of given
        shape. This will have the same length as imgshape, but with last
        two elements reduced as per the downsampling & ignore_border flags.
    """
    if len(imgshape) < 2:
        raise TypeError("imgshape must have at least two elements "
                        "(rows, cols)")
    r, c = imgshape[-2:]
    rval = list(imgshape[:-2]) + [r // ds[0], c // ds[1]]

    if not ignore_border:
        if isinstance(r, theano.Variable):
            rval[-2] = tensor.switch(r % ds[0], rval[-2] + 1, rval[-2])
        elif r % ds[0]:
            rval[-2] += 1
        if isinstance(c, theano.Variable):
            rval[-1] = tensor.switch(c % ds[1], rval[-1] + 1, rval[-1])
        elif c % ds[1]:
            rval[-1] += 1
    return rval
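# A minimal usage sketch of out_shape with concrete integer shapes (the
# shapes are arbitrary and chosen only for illustration):
print(out_shape((3, 64, 11, 11), (2, 2), ignore_border=True))   # [3, 64, 5, 5]
print(out_shape((3, 64, 11, 11), (2, 2), ignore_border=False))  # [3, 64, 6, 6]
# With ignore_border=False the leftover row/col of the 11x11 input produces
# one extra, partially downsampled, output row/col.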
def multiple_switch(*args):
    """
    .. todo::

        WRITEME properly

    Applies a cascade of ifelse. The output will be a Theano expression
    which evaluates:

    .. code-block:: none

        if args0:
            then arg1
        elif arg2:
            then arg3
        elif arg4:
            then arg5
        ....
    """
    if len(args) == 3:
        return T.switch(*args)
    else:
        return T.switch(args[0],
                        args[1],
                        multiple_switch(*args[2:]))
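# A short sketch of the cascade (assumes `import theano.tensor as T`); the
# five-argument call below expands into two nested T.switch calls:
x_val = T.scalar('x_val')
sign = multiple_switch(x_val < 0., -1., x_val > 0., 1., 0.)
print(sign.eval({x_val: -3.}), sign.eval({x_val: 0.}), sign.eval({x_val: 2.}))
# -> -1.0 0.0 1.0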
def theano_sentence_prediction(self, Sentence, Chars, WordLengths):

    input_lstm_res_f = self.input_lstm_forward_layer.function(Sentence, Chars, WordLengths)
    input_lstm_res_b = self.input_lstm_backward_layer.function(Sentence, Chars, WordLengths)
    input_combined = T.concatenate((input_lstm_res_f, input_lstm_res_b), axis=1)

    # Make pairwise features. This is really just "tensor product with
    # concatenation instead of multiplication". Is there a command for that?
    full_matrix, _ = theano.scan(fn=self.__pairwise_features,
                                 outputs_info=None,
                                 sequences=input_combined,
                                 non_sequences=[input_combined, Sentence.shape[0]])

    # define the stream up front so the loop below can also use it
    srng = RandomStreams(seed=12345)

    if len(self.lstm_layers) > 0 and self.lstm_layers[0].training:
        full_matrix = T.switch(srng.binomial(size=(Sentence.shape[0], Sentence.shape[0] + 1, self.hidden_dimension * 4), p=0.5), full_matrix, 0)
    else:
        full_matrix = 0.5 * full_matrix

    full_matrix = self.transition_layer.function(full_matrix)

    for layer in self.lstm_layers:
        if layer.training:
            print("hah-train")
            full_matrix = T.switch(srng.binomial(size=(Sentence.shape[0], Sentence.shape[0] + 1, self.hidden_dimension * 4), p=0.5), full_matrix, 0)
        else:
            print("heh-notrain")
            full_matrix = 0.5 * full_matrix

        full_matrix = layer.function(full_matrix)

    final_matrix = self.output_convolution.function(full_matrix)

    return T.nnet.softmax(final_matrix)
def hmc_updates(positions, stepsize, avg_acceptance_rate, final_pos, accept,
                target_acceptance_rate, stepsize_inc, stepsize_dec,
                stepsize_min, stepsize_max, avg_acceptance_slowness):
    # broadcast `accept` scalar to tensor with the same dimensions as final_pos.
    accept_matrix = accept.dimshuffle(0, *(('x',) * (final_pos.ndim - 1)))
    # if accept is True, update to `final_pos` else stay put
    new_positions = TT.switch(accept_matrix, final_pos, positions)

    ## STEPSIZE UPDATES ##
    # if acceptance rate is too low, our sampler is too "noisy" and we reduce
    # the stepsize. If it is too high, our sampler is too conservative, we can
    # get away with a larger stepsize (resulting in better mixing).
    _new_stepsize = TT.switch(avg_acceptance_rate > target_acceptance_rate,
                              stepsize * stepsize_inc, stepsize * stepsize_dec)
    # maintain stepsize in [stepsize_min, stepsize_max]
    new_stepsize = TT.clip(_new_stepsize, stepsize_min, stepsize_max)

    # perform exponential moving average
    mean_dtype = theano.scalar.upcast(accept.dtype, avg_acceptance_rate.dtype)
    new_acceptance_rate = TT.add(
        avg_acceptance_slowness * avg_acceptance_rate,
        (1.0 - avg_acceptance_slowness) * accept.mean(dtype=mean_dtype))

    return [(positions, new_positions),
            (stepsize, new_stepsize),
            (avg_acceptance_rate, new_acceptance_rate)]
def step(self, inputs, inputs_mask, h_tm1, c_tm1, h_mask, *non_sequences):
    if self.gate.use_attention:
        # attended is the
        #   src_sequence_length x batch_size x attention_dims
        # matrix which we have attention on.
        #
        # attended_dot_u is the h_t-independent part of the final
        # attention vectors, which is precomputed for efficiency.
        #
        # attention_mask is a binary mask over the valid elements of
        # attended, which in practice is the same as the mask passed to
        # the encoder that created attended. Size
        #   src_sequence_length x batch_size
        h_t, c_t, attention = self.gate(
            inputs, h_tm1 * h_mask.astype(theano.config.floatX), c_tm1,
            attended=non_sequences[0],
            attended_dot_u=non_sequences[1],
            attention_mask=non_sequences[2])
        return (T.switch(inputs_mask.dimshuffle(0, 'x'), h_t, h_tm1),
                T.switch(inputs_mask.dimshuffle(0, 'x'), c_t, c_tm1),
                attention)
    else:
        h_t, c_t = self.gate(
            inputs, h_tm1 * h_mask.astype(theano.config.floatX), c_tm1)
        return (T.switch(inputs_mask.dimshuffle(0, 'x'), h_t, h_tm1),
                T.switch(inputs_mask.dimshuffle(0, 'x'), c_t, c_tm1))
def cd_updates(self): """ Return a dictionary of shared variable updates that implements contrastive divergence learning by stochastic gradient descent with an annealed learning rate. """ ups = {} if self.persistent_chains: grads = self.contrastive_grads() ups.update(dict(self.sampler.updates())) else: cd1_sampler, final_p, cd1_updates = self.rbm.CD1_sampler(self.visible_batch, self.batchsize) self._last_cd1_sampler = cd1_sampler # hacked in here for the unit test #ignore the cd1_sampler grads = self.contrastive_grads(neg_v = final_p) ups.update(dict(cd1_updates)) # contrastive divergence updates # TODO: sgd_updates is a particular optization algo (others are possible) # parametrize so that algo is plugin # the normalization normVF might be sgd-specific though... # TODO: when sgd has an annealing schedule, this should # go through that mechanism. lr = TT.clip( self.learn_rate * TT.cast(self.lr_anneal_start / (self.iter+1), floatX), 0.0, #min self.learn_rate) #max ups.update(dict(sgd_updates( self.rbm.params(), grads, stepsizes=[a*lr for a in self.learn_rate_multipliers]))) ups[self.iter] = self.iter + 1 # add trainer updates (replace CD update of U) ups[self.rbm.U], ups[self.normVF] = self.normalize_U(ups[self.rbm.U]) #l1_updates: if (self.l1_penalty_start > 0) and (self.l1_penalty != 0.0): ups[self.effective_l1_penalty] = TT.switch( self.iter >= self.l1_penalty_start, self.l1_penalty, 0.0) if getattr(self,'p_lr', None): ups[self.p_lr] = TT.switch(self.iter > self.p_training_start, self.p_training_lr, 0) new_P = ups[self.rbm.P] * self.p_mask no_pos_P = TT.switch(new_P<0, new_P, 0) ups[self.rbm.P] = - no_pos_P / no_pos_P.sum(axis=0) #normalize to that columns sum 1 return ups
def __call__(self, input_):
    m = input_.mean()
    v = input_.std()

    new_m = T.switch(T.eq(self.m, 0.),
                     m,
                     (np.float32(1.) - self.rate) * self.m + self.rate * m)
    new_var = T.switch(T.eq(self.var, 0.),
                       v,
                       (np.float32(1.) - self.rate) * self.var + self.rate * v)

    updates = [(self.m, new_m), (self.var, new_var)]

    input_centered = (
        (input_ - new_m) / T.maximum(1., T.sqrt(new_var)))

    input_ = T.zeros_like(input_) + input_

    outs = OrderedDict(
        x=input_,
        x_centered=input_centered,
        m=new_m,
        var=new_var
    )

    return outs, updates
def compute_updates(self, training_cost, params):
    updates = []

    grads = T.grad(training_cost, params)
    grads = OrderedDict(zip(params, grads))

    # Clip stuff
    c = numpy.float32(self.cutoff)
    clip_grads = []

    norm_gs = T.sqrt(sum(T.sum(g ** 2) for p, g in grads.items()))
    normalization = T.switch(T.ge(norm_gs, c), c / norm_gs, np.float32(1.))
    notfinite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))

    for p, g in grads.items():
        clip_grads.append((p, T.switch(notfinite, numpy.float32(.1) * p, g * normalization)))

    grads = OrderedDict(clip_grads)

    if self.updater == 'adagrad':
        updates = Adagrad(grads, self.lr)
    elif self.updater == 'sgd':
        raise Exception("Sgd not implemented!")
    elif self.updater == 'adadelta':
        updates = Adadelta(grads)
    elif self.updater == 'rmsprop':
        updates = RMSProp(grads, self.lr)
    elif self.updater == 'adam':
        updates = Adam(grads)
    else:
        raise Exception("Updater not understood!")

    return updates
def _activation(self, Y, L, M, W):
    """Returns the activation for a given input.

    Derived from the generative model formulation of hierarchical Poisson
    mixtures, the formula for the activation in the network reads as follows:

    I_c =
     \sum_d \log(W_{cd})y_d + \log(M_{lc})        for labeled data
     \sum_d \log(W_{cd})y_d + \log(\sum_k M_{kc}) for unlabeled data

    s_c = softmax(I_c)
    """
    # first: complete inference to find label

    # Input integration:
    I = T.tensordot(Y, T.log(W), axes=[1, 1])

    # recurrent term:
    vM = M[L]
    L_index = T.eq(L, -1).nonzero()
    vM = T.set_subtensor(vM[L_index], T.sum(M, axis=0))

    # numeric trick to prevent overflow in the exp-function
    max_exponent = 86. - T.ceil(T.log(I.shape[1].astype('float32')))
    scale = T.switch(
        T.gt(T.max(I, axis=1, keepdims=True), max_exponent),
        T.max(I, axis=1, keepdims=True) - max_exponent,
        0.)

    # numeric approximation to prevent underflow in the exp-function:
    # map too low values of I to a fixed minimum value
    min_exponent = -87. + T.ceil(T.log(I.shape[1].astype('float32')))
    I = T.switch(
        T.lt(I - scale, min_exponent),
        scale + min_exponent,
        I)

    # activation: recurrent softmax with overflow protection
    s = vM * T.exp(I - scale) / T.sum(vM * T.exp(I - scale), axis=1, keepdims=True)

    return s
def adamgc(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8, max_magnitude=5.0, infDecay=0.1):
    updates = []
    grads = T.grad(cost, params)

    norm = norm_gs(params, grads)
    sqrtnorm = T.sqrt(norm)
    not_finite = T.or_(T.isnan(sqrtnorm), T.isinf(sqrtnorm))
    adj_norm_gs = T.switch(T.ge(sqrtnorm, max_magnitude), max_magnitude / sqrtnorm, 1.)

    i = shared(floatX(0.))
    i_t = i + 1.
    fix1 = 1. - (1. - b1)**i_t
    fix2 = 1. - (1. - b2)**i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)
    for p, g in zip(params, grads):
        g = T.switch(not_finite, infDecay * p, g * adj_norm_gs)
        m = shared(p.get_value() * 0.)
        v = shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)

        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))
    return updates, norm
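# A minimal usage sketch, assuming the module-level helpers `norm_gs`, `shared`
# and `floatX` used above plus numpy/theano; the softmax-regression model here
# is hypothetical and chosen only for illustration:
import numpy as np
import theano
import theano.tensor as T

W = theano.shared(np.zeros((10, 3), dtype=theano.config.floatX), name='W')
feats = T.matrix('feats')
labels = T.ivector('labels')
cost = T.nnet.categorical_crossentropy(T.nnet.softmax(T.dot(feats, W)), labels).mean()
updates, grad_norm = adamgc(cost, [W])
train_step = theano.function([feats, labels], [cost, grad_norm], updates=updates)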
def _get_targets(y, log_y_hat, y_mask, y_hat_mask):
    '''
    Returns the target values according to the CTC cost with respect to y_hat.
    Note that this is part of the gradient with respect to the softmax output
    and not with respect to the input of the original softmax function.
    All computations are done in log scale
    '''
    num_classes = log_y_hat.shape[2] - 1
    blanked_y, blanked_y_mask = _add_blanks(
        y=y,
        blank_symbol=num_classes,
        y_mask=y_mask)

    log_alpha, log_beta = _log_forward_backward(blanked_y,
                                                log_y_hat, blanked_y_mask,
                                                y_hat_mask, num_classes)
    # explicitly not using a mask to prevent inf - inf
    y_prob = _class_batch_to_labeling_batch(blanked_y, log_y_hat,
                                            y_hat_mask=None)
    marginals = log_alpha + log_beta - y_prob
    max_marg = marginals.max(2)
    max_marg = T.switch(T.le(max_marg, -np.inf), 0, max_marg)
    log_Z = T.log(T.exp(marginals - max_marg[:, :, None]).sum(2))
    log_Z = log_Z + max_marg
    log_Z = T.switch(T.le(log_Z, -np.inf), 0, log_Z)
    targets = _labeling_batch_to_class_batch(blanked_y,
                                             T.exp(marginals -
                                                   log_Z[:, :, None]),
                                             num_classes + 1)
    return targets
def dlogp(inputs, gradients):
    g_logp, = gradients
    cov, delta = inputs

    g_logp.tag.test_value = floatX(1.)
    n, k = delta.shape

    chol_cov = cholesky(cov)
    diag = tt.nlinalg.diag(chol_cov)
    ok = tt.all(diag > 0)

    chol_cov = tt.switch(ok, chol_cov, tt.fill(chol_cov, 1))
    delta_trans = solve_lower(chol_cov, delta.T).T

    inner = n * tt.eye(k) - tt.dot(delta_trans.T, delta_trans)
    g_cov = solve_upper(chol_cov.T, inner)
    g_cov = solve_upper(chol_cov.T, g_cov.T)

    tau_delta = solve_upper(chol_cov.T, delta_trans.T)
    g_delta = tau_delta.T

    g_cov = tt.switch(ok, g_cov, -np.nan)
    g_delta = tt.switch(ok, g_delta, -np.nan)

    return [-0.5 * g_cov * g_logp, -g_delta * g_logp]
def __init__(self, rng, f='ReLU', g=lambda x: x, params=None):
    if f == 'ReLU':
        if hasattr(T.nnet, 'relu'):
            self.f = T.nnet.relu
        else:
            self.f = lambda x: T.switch(x < 0, 0, x)
        self.g = lambda x: x
    elif f == 'PReLU':
        # Avoids dying ReLU units
        if hasattr(T.nnet, 'relu'):
            self.f = lambda x: T.nnet.relu(x, alpha=0.01)
        else:
            # fallback with the same fixed slope of 0.01 as above
            self.f = lambda x: T.switch(x <= 0, 0.01 * x, x)
        self.g = lambda x: x
    elif f == 'tanh':
        self.f = T.tanh
        self.g = T.arctanh
    elif f == 'sigmoid':
        self.f = T.nnet.sigmoid
        self.g = lambda x: x
    elif f == 'softmax':
        self.f = T.nnet.softmax
        self.g = lambda x: x
    elif f == 'softplus':
        self.f = T.nnet.softplus
        self.g = lambda x: x
    elif f == 'identity':
        self.f = lambda x: x
        self.g = lambda x: x
    else:
        self.f = f
        self.g = g
    self.params = [] if params is None else params
def mixture_model(random_seed=1234):
    """Sample mixture model to use in benchmarks"""
    np.random.seed(random_seed)
    size = 1000
    w_true = np.array([0.35, 0.4, 0.25])
    mu_true = np.array([0., 2., 5.])
    sigma = np.array([0.5, 0.5, 1.])
    component = np.random.choice(mu_true.size, size=size, p=w_true)
    x = np.random.normal(mu_true[component], sigma[component], size=size)

    with pm.Model() as model:
        w = pm.Dirichlet('w', a=np.ones_like(w_true))
        mu = pm.Normal('mu', mu=0., sd=10., shape=w_true.shape)
        enforce_order = pm.Potential('enforce_order',
                                     tt.switch(mu[0] - mu[1] <= 0, 0., -np.inf) +
                                     tt.switch(mu[1] - mu[2] <= 0, 0., -np.inf))
        tau = pm.Gamma('tau', alpha=1., beta=1., shape=w_true.shape)
        pm.NormalMixture('x_obs', w=w, mu=mu, tau=tau, observed=x)

    # Initialization can be poorly specified, this is a hack to make it work
    start = {
        'mu': mu_true.copy(),
        'tau_log__': np.log(1. / sigma**2),
        'w_stickbreaking__': np.array([-0.03, 0.44])
    }
    return model, start
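# A brief usage sketch (assumes PyMC3 is installed; the draw count is arbitrary):
model, start = mixture_model()
with model:
    trace = pm.sample(1000, start=start)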
def theano_digitize(x, bins):
    """
    Equivalent to numpy digitize.

    Parameters
    ----------
    x : Theano tensor or array_like
        The array or matrix to be digitized
    bins : array_like
        The bins with which x should be digitized

    Returns
    -------
    A Theano tensor
        The indices of the bins to which each value in input array belongs.
    """
    binned = T.zeros_like(x) + len(bins)
    for i in range(len(bins)):
        bin = bins[i]
        if i == 0:
            binned = T.switch(T.lt(x, bin), i, binned)
        else:
            ineq = T.and_(T.ge(x, bins[i - 1]), T.lt(x, bin))
            binned = T.switch(ineq, i, binned)
    binned = T.switch(T.isnan(x), len(bins), binned)
    return binned
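# A quick comparison against numpy.digitize (assumes numpy/theano importable;
# unlike numpy, this helper maps NaNs to len(bins)):
import numpy as np
import theano
import theano.tensor as T

bins = np.array([0., 1., 2.])
xs = T.vector('xs')
values = np.array([-0.5, 0.3, 1.7, 5.0], dtype=theano.config.floatX)
print(theano_digitize(xs, bins).eval({xs: values}))  # -> 0, 1, 2, 3
print(np.digitize(values, bins))                     # -> [0 1 2 3]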
def tnormal_icdf(size, avg, std, lbound, ubound, theano_rng, dtype): """ Alternative Method: sample = -Phi_inv(Phi(-lbound)*(1-u) + Phi(-ubound)*u) """ def Phi(x): erfarg = (x - avg) / (std * SQRT2) rval = 0.5 * (1. + T.erf(erfarg)) return rval.astype(dtype) def Phi_inv(y, eps=3e-8): """ eps was calibrated for cublas.erfinv using float32 """ temp = 2. * y - 1. erfinv_input = T.clip(temp, -1+eps, 1-eps) rval = avg + std * SQRT2 * T.erfinv(erfinv_input) return rval.astype(dtype) # center lower and upper bounds based on mean u = theano_rng.uniform(size=size, dtype=dtype) # Inverse CDF method. When method becomes numerically unstable, we simply # return the bounds based on whether avg < lbound, or ubound < avg. cdf_range = Phi(ubound) - Phi(lbound) sample = T.switch( T.or_( T.lt(cdf_range, 3e-8), T.gt(cdf_range, 1-3e-8)), T.switch( T.lt(avg, lbound), lbound, ubound), Phi_inv(Phi(lbound) + u * cdf_range)) return sample
def __call__(self, x):
    dropped_units = self.rng.binomial(
        n=1, p=self.dropout_rate,
        size=x.shape if self.shape is None else self.shape)
    return tt.switch(dropped_units, 0, x)
def build(self, dropout, char_dim, char_lstm_dim, char_bidirect, word_dim, word_lstm_dim, word_bidirect, lr_method, pre_emb, crf, cap_dim, training=True, **kwargs): """ Build the network. """ # Training parameters n_words = len(self.id_to_word) n_chars = len(self.id_to_char) n_tags = len(self.id_to_tag) # Number of capitalization features if cap_dim: n_cap = 4 # Network variables is_train = T.iscalar('is_train') word_ids = T.ivector(name='word_ids') char_for_ids = T.imatrix(name='char_for_ids') char_rev_ids = T.imatrix(name='char_rev_ids') char_pos_ids = T.ivector(name='char_pos_ids') tag_ids = T.ivector(name='tag_ids') if cap_dim: cap_ids = T.ivector(name='cap_ids') # Sentence length s_len = (word_ids if word_dim else char_pos_ids).shape[0] # Final input (all word features) input_dim = 0 inputs = [] # # Word inputs # if word_dim: input_dim += word_dim word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer') word_input = word_layer.link(word_ids) inputs.append(word_input) # Initialize with pretrained embeddings if pre_emb and training: new_weights = word_layer.embeddings.get_value() print 'Loading pretrained embeddings from %s...' % pre_emb pretrained = {} emb_invalid = 0 for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')): line = line.rstrip().split() if len(line) == word_dim + 1: pretrained[line[0]] = np.array( [float(x) for x in line[1:]]).astype(np.float32) else: emb_invalid += 1 if emb_invalid > 0: print 'WARNING: %i invalid lines' % emb_invalid c_found = 0 c_lower = 0 c_zeros = 0 # Lookup table initialization for i in xrange(n_words): word = self.id_to_word[i] if word in pretrained: new_weights[i] = pretrained[word] c_found += 1 elif word.lower() in pretrained: new_weights[i] = pretrained[word.lower()] c_lower += 1 elif re.sub('\d', '0', word.lower()) in pretrained: new_weights[i] = pretrained[re.sub( '\d', '0', word.lower())] c_zeros += 1 word_layer.embeddings.set_value(new_weights) print 'Loaded %i pretrained embeddings.' % len(pretrained) print( '%i / %i (%.4f%%) words have been initialized with ' 'pretrained embeddings.') % ( c_found + c_lower + c_zeros, n_words, 100. 
* (c_found + c_lower + c_zeros) / n_words) print( '%i found directly, %i after lowercasing, ' '%i after lowercasing + zero.') % (c_found, c_lower, c_zeros) # # Chars inputs # if char_dim: input_dim += char_lstm_dim char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer') char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=True, name='char_lstm_for') char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=True, name='char_lstm_rev') char_lstm_for.link(char_layer.link(char_for_ids)) char_lstm_rev.link(char_layer.link(char_rev_ids)) char_for_output = char_lstm_for.h.dimshuffle( (1, 0, 2))[T.arange(s_len), char_pos_ids] char_rev_output = char_lstm_rev.h.dimshuffle( (1, 0, 2))[T.arange(s_len), char_pos_ids] inputs.append(char_for_output) if char_bidirect: inputs.append(char_rev_output) input_dim += char_lstm_dim # # Capitalization feature # if cap_dim: input_dim += cap_dim cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer') inputs.append(cap_layer.link(cap_ids)) # Prepare final input inputs = T.concatenate(inputs, axis=1) if len(inputs) != 1 else inputs[0] # # Dropout on final input # if dropout: dropout_layer = DropoutLayer(p=dropout) input_train = dropout_layer.link(inputs) input_test = (1 - dropout) * inputs inputs = T.switch(T.neq(is_train, 0), input_train, input_test) # LSTM for words word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_for') word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_rev') word_lstm_for.link(inputs) word_lstm_rev.link(inputs[::-1, :]) word_for_output = word_lstm_for.h word_rev_output = word_lstm_rev.h[::-1, :] if word_bidirect: final_output = T.concatenate([word_for_output, word_rev_output], axis=1) tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim, name='tanh_layer', activation='tanh') final_output = tanh_layer.link(final_output) else: final_output = word_for_output # Sentence to Named Entity tags - Score final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer', activation=(None if crf else 'softmax')) tags_scores = final_layer.link(final_output) # No CRF if not crf: cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean() # CRF else: transitions = shared((n_tags + 2, n_tags + 2), 'transitions') small = -1000 b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32) e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32) observations = T.concatenate( [tags_scores, small * T.ones((s_len, 2))], axis=1) observations = T.concatenate([b_s, observations, e_s], axis=0) # Score from tags real_path_score = tags_scores[T.arange(s_len), tag_ids].sum() # Score from transitions b_id = theano.shared(value=np.array([n_tags], dtype=np.int32)) e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32)) padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0) real_path_score += transitions[padded_tags_ids[T.arange(s_len + 1)], padded_tags_ids[T.arange(s_len + 1) + 1]].sum() all_paths_scores = forward(observations, transitions) cost = -(real_path_score - all_paths_scores) # Network parameters params = [] if word_dim: self.add_component(word_layer) params.extend(word_layer.params) if char_dim: self.add_component(char_layer) self.add_component(char_lstm_for) params.extend(char_layer.params) params.extend(char_lstm_for.params) if char_bidirect: self.add_component(char_lstm_rev) params.extend(char_lstm_rev.params) self.add_component(word_lstm_for) params.extend(word_lstm_for.params) if word_bidirect: self.add_component(word_lstm_rev) 
params.extend(word_lstm_rev.params) if cap_dim: self.add_component(cap_layer) params.extend(cap_layer.params) self.add_component(final_layer) params.extend(final_layer.params) if crf: self.add_component(transitions) params.append(transitions) if word_bidirect: self.add_component(tanh_layer) params.extend(tanh_layer.params) # Prepare train and eval inputs eval_inputs = [] if word_dim: eval_inputs.append(word_ids) if char_dim: eval_inputs.append(char_for_ids) if char_bidirect: eval_inputs.append(char_rev_ids) eval_inputs.append(char_pos_ids) if cap_dim: eval_inputs.append(cap_ids) train_inputs = eval_inputs + [tag_ids] # Parse optimization method parameters if "-" in lr_method: lr_method_name = lr_method[:lr_method.find('-')] lr_method_parameters = {} for x in lr_method[lr_method.find('-') + 1:].split('-'): split = x.split('_') assert len(split) == 2 lr_method_parameters[split[0]] = float(split[1]) else: lr_method_name = lr_method lr_method_parameters = {} # Compile training function print 'Compiling...' if training: updates = Optimization(clip=5.0).get_updates( lr_method_name, cost, params, **lr_method_parameters) f_train = theano.function(inputs=train_inputs, outputs=cost, updates=updates, givens=({ is_train: np.cast['int32'](1) } if dropout else {})) else: f_train = None # Compile evaluation function if not crf: f_eval = theano.function(inputs=eval_inputs, outputs=tags_scores, givens=({ is_train: np.cast['int32'](0) } if dropout else {})) else: f_eval = theano.function(inputs=eval_inputs, outputs=forward( observations, transitions, viterbi=True, return_alpha=False, return_best_sequence=True), givens=({ is_train: np.cast['int32'](0) } if dropout else {})) return f_train, f_eval
def main(args): trial = int(args['trial']) pkl_name = 'vrnn_gmm_%d' % trial channel_name = 'valid_nll_upper_bound' data_path = args['data_path'] save_path = args['save_path'] monitoring_freq = int(args['monitoring_freq']) force_saving_freq = int(args['force_saving_freq']) reset_freq = int(args['reset_freq']) epoch = int(args['epoch']) batch_size = int(args['batch_size']) m_batch_size = int(args['m_batch_size']) x_dim = int(args['x_dim']) z_dim = int(args['z_dim']) rnn_dim = int(args['rnn_dim']) k = int(args['num_k']) lr = float(args['lr']) debug = int(args['debug']) print "trial no. %d" % trial print "batch size %d" % batch_size print "learning rate %f" % lr print "saving pkl file '%s'" % pkl_name print "to the save path '%s'" % save_path q_z_dim = 500 p_z_dim = 500 p_x_dim = 500 x2s_dim = 500 z2s_dim = 500 target_dim = x_dim * k file_name = 'blizzard_unseg_tbptt' normal_params = np.load(data_path + file_name + '_normal.npz') X_mean = normal_params['X_mean'] X_std = normal_params['X_std'] model = Model() train_data = Blizzard_tbptt(name='train', path=data_path, frame_size=x_dim, file_name=file_name, X_mean=X_mean, X_std=X_std) valid_data = Blizzard_tbptt(name='valid', path=data_path, frame_size=x_dim, file_name=file_name, X_mean=X_mean, X_std=X_std) x = train_data.theano_vars() m_x = valid_data.theano_vars() if debug: x.tag.test_value = np.zeros((15, batch_size, x_dim), dtype=theano.config.floatX) m_x.tag.test_value = np.zeros((15, m_batch_size, x_dim), dtype=theano.config.floatX) init_W = InitCell('rand') init_U = InitCell('ortho') init_b = InitCell('zeros') init_b_sig = InitCell('const', mean=0.6) x_1 = FullyConnectedLayer(name='x_1', parent=['x_t'], parent_dim=[x_dim], nout=x2s_dim, unit='relu', init_W=init_W, init_b=init_b) x_2 = FullyConnectedLayer(name='x_2', parent=['x_1'], parent_dim=[x2s_dim], nout=x2s_dim, unit='relu', init_W=init_W, init_b=init_b) x_3 = FullyConnectedLayer(name='x_3', parent=['x_2'], parent_dim=[x2s_dim], nout=x2s_dim, unit='relu', init_W=init_W, init_b=init_b) x_4 = FullyConnectedLayer(name='x_4', parent=['x_3'], parent_dim=[x2s_dim], nout=x2s_dim, unit='relu', init_W=init_W, init_b=init_b) z_1 = FullyConnectedLayer(name='z_1', parent=['z_t'], parent_dim=[z_dim], nout=z2s_dim, unit='relu', init_W=init_W, init_b=init_b) z_2 = FullyConnectedLayer(name='z_2', parent=['z_1'], parent_dim=[z2s_dim], nout=z2s_dim, unit='relu', init_W=init_W, init_b=init_b) z_3 = FullyConnectedLayer(name='z_3', parent=['z_2'], parent_dim=[z2s_dim], nout=z2s_dim, unit='relu', init_W=init_W, init_b=init_b) z_4 = FullyConnectedLayer(name='z_4', parent=['z_3'], parent_dim=[z2s_dim], nout=z2s_dim, unit='relu', init_W=init_W, init_b=init_b) rnn = LSTM(name='rnn', parent=['x_4', 'z_4'], parent_dim=[x2s_dim, z2s_dim], nout=rnn_dim, unit='tanh', init_W=init_W, init_U=init_U, init_b=init_b) phi_1 = FullyConnectedLayer(name='phi_1', parent=['x_4', 's_tm1'], parent_dim=[x2s_dim, rnn_dim], nout=q_z_dim, unit='relu', init_W=init_W, init_b=init_b) phi_2 = FullyConnectedLayer(name='phi_2', parent=['phi_1'], parent_dim=[q_z_dim], nout=q_z_dim, unit='relu', init_W=init_W, init_b=init_b) phi_3 = FullyConnectedLayer(name='phi_3', parent=['phi_2'], parent_dim=[q_z_dim], nout=q_z_dim, unit='relu', init_W=init_W, init_b=init_b) phi_4 = FullyConnectedLayer(name='phi_4', parent=['phi_3'], parent_dim=[q_z_dim], nout=q_z_dim, unit='relu', init_W=init_W, init_b=init_b) phi_mu = FullyConnectedLayer(name='phi_mu', parent=['phi_4'], parent_dim=[q_z_dim], nout=z_dim, unit='linear', init_W=init_W, init_b=init_b) 
phi_sig = FullyConnectedLayer(name='phi_sig', parent=['phi_4'], parent_dim=[q_z_dim], nout=z_dim, unit='softplus', cons=1e-4, init_W=init_W, init_b=init_b_sig) prior_1 = FullyConnectedLayer(name='prior_1', parent=['s_tm1'], parent_dim=[rnn_dim], nout=p_z_dim, unit='relu', init_W=init_W, init_b=init_b) prior_2 = FullyConnectedLayer(name='prior_2', parent=['prior_1'], parent_dim=[p_z_dim], nout=p_z_dim, unit='relu', init_W=init_W, init_b=init_b) prior_3 = FullyConnectedLayer(name='prior_3', parent=['prior_2'], parent_dim=[p_z_dim], nout=p_z_dim, unit='relu', init_W=init_W, init_b=init_b) prior_4 = FullyConnectedLayer(name='prior_4', parent=['prior_3'], parent_dim=[p_z_dim], nout=p_z_dim, unit='relu', init_W=init_W, init_b=init_b) prior_mu = FullyConnectedLayer(name='prior_mu', parent=['prior_4'], parent_dim=[p_z_dim], nout=z_dim, unit='linear', init_W=init_W, init_b=init_b) prior_sig = FullyConnectedLayer(name='prior_sig', parent=['prior_4'], parent_dim=[p_z_dim], nout=z_dim, unit='softplus', cons=1e-4, init_W=init_W, init_b=init_b_sig) theta_1 = FullyConnectedLayer(name='theta_1', parent=['z_4', 's_tm1'], parent_dim=[z2s_dim, rnn_dim], nout=p_x_dim, unit='relu', init_W=init_W, init_b=init_b) theta_2 = FullyConnectedLayer(name='theta_2', parent=['theta_1'], parent_dim=[p_x_dim], nout=p_x_dim, unit='relu', init_W=init_W, init_b=init_b) theta_3 = FullyConnectedLayer(name='theta_3', parent=['theta_2'], parent_dim=[p_x_dim], nout=p_x_dim, unit='relu', init_W=init_W, init_b=init_b) theta_4 = FullyConnectedLayer(name='theta_4', parent=['theta_3'], parent_dim=[p_x_dim], nout=p_x_dim, unit='relu', init_W=init_W, init_b=init_b) theta_mu = FullyConnectedLayer(name='theta_mu', parent=['theta_4'], parent_dim=[p_x_dim], nout=target_dim, unit='linear', init_W=init_W, init_b=init_b) theta_sig = FullyConnectedLayer(name='theta_sig', parent=['theta_4'], parent_dim=[p_x_dim], nout=target_dim, unit='softplus', cons=1e-4, init_W=init_W, init_b=init_b_sig) coeff = FullyConnectedLayer(name='coeff', parent=['theta_4'], parent_dim=[p_x_dim], nout=k, unit='softmax', init_W=init_W, init_b=init_b) nodes = [ rnn, x_1, x_2, x_3, x_4, z_1, z_2, z_3, z_4, phi_1, phi_2, phi_3, phi_4, phi_mu, phi_sig, prior_1, prior_2, prior_3, prior_4, prior_mu, prior_sig, theta_1, theta_2, theta_3, theta_4, theta_mu, theta_sig, coeff ] params = OrderedDict() for node in nodes: if node.initialize() is not None: params.update(node.initialize()) params = init_tparams(params) step_count = sharedX(0, name='step_count') last_rnn = np.zeros((batch_size, rnn_dim * 2), dtype=theano.config.floatX) rnn_tm1 = sharedX(last_rnn, name='rnn_tm1') shared_updates = OrderedDict() shared_updates[step_count] = step_count + 1 s_0 = T.switch(T.eq(T.mod(step_count, reset_freq), 0), rnn.get_init_state(batch_size), rnn_tm1) x_shape = x.shape x_in = x.reshape((x_shape[0] * x_shape[1], -1)) x_1_in = x_1.fprop([x_in], params) x_2_in = x_2.fprop([x_1_in], params) x_3_in = x_3.fprop([x_2_in], params) x_4_in = x_4.fprop([x_3_in], params) x_4_in = x_4_in.reshape((x_shape[0], x_shape[1], -1)) def inner_fn(x_t, s_tm1): phi_1_t = phi_1.fprop([x_t, s_tm1], params) phi_2_t = phi_2.fprop([phi_1_t], params) phi_3_t = phi_3.fprop([phi_2_t], params) phi_4_t = phi_4.fprop([phi_3_t], params) phi_mu_t = phi_mu.fprop([phi_4_t], params) phi_sig_t = phi_sig.fprop([phi_4_t], params) prior_1_t = prior_1.fprop([s_tm1], params) prior_2_t = prior_2.fprop([prior_1_t], params) prior_3_t = prior_3.fprop([prior_2_t], params) prior_4_t = prior_4.fprop([prior_3_t], params) prior_mu_t = 
prior_mu.fprop([prior_4_t], params) prior_sig_t = prior_sig.fprop([prior_4_t], params) z_t = Gaussian_sample(phi_mu_t, phi_sig_t) z_1_t = z_1.fprop([z_t], params) z_2_t = z_2.fprop([z_1_t], params) z_3_t = z_3.fprop([z_2_t], params) z_4_t = z_4.fprop([z_3_t], params) s_t = rnn.fprop([[x_t, z_4_t], [s_tm1]], params) return s_t, phi_mu_t, phi_sig_t, prior_mu_t, prior_sig_t, z_4_t ((s_temp, phi_mu_temp, phi_sig_temp, prior_mu_temp, prior_sig_temp, z_4_temp), updates) =\ theano.scan(fn=inner_fn, sequences=[x_4_in], outputs_info=[s_0, None, None, None, None, None]) for k, v in updates.iteritems(): k.default_update = v shared_updates[rnn_tm1] = s_temp[-1] s_temp = concatenate([s_0[None, :, :], s_temp[:-1]], axis=0) theta_1_temp = theta_1.fprop([z_4_temp, s_temp], params) theta_2_temp = theta_2.fprop([theta_1_temp], params) theta_3_temp = theta_3.fprop([theta_2_temp], params) theta_4_temp = theta_4.fprop([theta_3_temp], params) theta_mu_temp = theta_mu.fprop([theta_4_temp], params) theta_sig_temp = theta_sig.fprop([theta_4_temp], params) coeff_temp = coeff.fprop([theta_4_temp], params) kl_temp = KLGaussianGaussian(phi_mu_temp, phi_sig_temp, prior_mu_temp, prior_sig_temp) x_shape = x.shape x_in = x.reshape((x_shape[0] * x_shape[1], -1)) theta_mu_in = theta_mu_temp.reshape((x_shape[0] * x_shape[1], -1)) theta_sig_in = theta_sig_temp.reshape((x_shape[0] * x_shape[1], -1)) coeff_in = coeff_temp.reshape((x_shape[0] * x_shape[1], -1)) recon = GMM(x_in, theta_mu_in, theta_sig_in, coeff_in) recon_term = recon.mean() kl_term = kl_temp.mean() nll_upper_bound = recon_term + kl_term nll_upper_bound.name = 'nll_upper_bound' m_x_1_temp = x_1.fprop([m_x], params) m_x_2_temp = x_2.fprop([m_x_1_temp], params) m_x_3_temp = x_3.fprop([m_x_2_temp], params) m_x_4_temp = x_4.fprop([m_x_3_temp], params) m_s_0 = rnn.get_init_state(m_batch_size) ((m_s_temp, m_phi_mu_temp, m_phi_sig_temp, m_prior_mu_temp, m_prior_sig_temp, m_z_4_temp), m_updates) =\ theano.scan(fn=inner_fn, sequences=[m_x_4_temp], outputs_info=[m_s_0, None, None, None, None, None]) for k, v in m_updates.iteritems(): k.default_update = v m_s_temp = concatenate([m_s_0[None, :, :], m_s_temp[:-1]], axis=0) m_theta_1_temp = theta_1.fprop([m_z_4_temp, m_s_temp], params) m_theta_2_temp = theta_2.fprop([m_theta_1_temp], params) m_theta_3_temp = theta_3.fprop([m_theta_2_temp], params) m_theta_4_temp = theta_4.fprop([m_theta_3_temp], params) m_theta_mu_temp = theta_mu.fprop([m_theta_4_temp], params) m_theta_sig_temp = theta_sig.fprop([m_theta_4_temp], params) m_coeff_temp = coeff.fprop([m_theta_4_temp], params) m_kl_temp = KLGaussianGaussian(m_phi_mu_temp, m_phi_sig_temp, m_prior_mu_temp, m_prior_sig_temp) m_x_shape = m_x.shape m_x_in = m_x.reshape((m_x_shape[0] * m_x_shape[1], -1)) m_theta_mu_in = m_theta_mu_temp.reshape((m_x_shape[0] * m_x_shape[1], -1)) m_theta_sig_in = m_theta_sig_temp.reshape( (m_x_shape[0] * m_x_shape[1], -1)) m_coeff_in = m_coeff_temp.reshape((m_x_shape[0] * m_x_shape[1], -1)) m_recon = GMM(m_x_in, m_theta_mu_in, m_theta_sig_in, m_coeff_in) m_recon_term = m_recon.mean() m_kl_term = m_kl_temp.mean() m_nll_upper_bound = m_recon_term + m_kl_term m_nll_upper_bound.name = 'nll_upper_bound' m_recon_term.name = 'recon_term' m_kl_term.name = 'kl_term' max_x = m_x.max() mean_x = m_x.mean() min_x = m_x.min() max_x.name = 'max_x' mean_x.name = 'mean_x' min_x.name = 'min_x' max_theta_mu = m_theta_mu_in.max() mean_theta_mu = m_theta_mu_in.mean() min_theta_mu = m_theta_mu_in.min() max_theta_mu.name = 'max_theta_mu' mean_theta_mu.name = 'mean_theta_mu' 
min_theta_mu.name = 'min_theta_mu' max_theta_sig = m_theta_sig_in.max() mean_theta_sig = m_theta_sig_in.mean() min_theta_sig = m_theta_sig_in.min() max_theta_sig.name = 'max_theta_sig' mean_theta_sig.name = 'mean_theta_sig' min_theta_sig.name = 'min_theta_sig' max_phi_sig = m_phi_sig_temp.max() mean_phi_sig = m_phi_sig_temp.mean() min_phi_sig = m_phi_sig_temp.min() max_phi_sig.name = 'max_phi_sig' mean_phi_sig.name = 'mean_phi_sig' min_phi_sig.name = 'min_phi_sig' max_prior_sig = m_prior_sig_temp.max() mean_prior_sig = m_prior_sig_temp.mean() min_prior_sig = m_prior_sig_temp.min() max_prior_sig.name = 'max_prior_sig' mean_prior_sig.name = 'mean_prior_sig' min_prior_sig.name = 'min_prior_sig' model.inputs = [x] model.params = params model.nodes = nodes model.set_updates(shared_updates) optimizer = Adam(lr=lr) monitor_fn = theano.function( inputs=[m_x], outputs=[ m_nll_upper_bound, m_recon_term, m_kl_term, max_phi_sig, mean_phi_sig, min_phi_sig, max_prior_sig, mean_prior_sig, min_prior_sig, max_theta_sig, mean_theta_sig, min_theta_sig, max_x, mean_x, min_x, max_theta_mu, mean_theta_mu, min_theta_mu ], on_unused_input='ignore') extension = [ GradientClipping(batch_size=batch_size, check_nan=1), EpochCount(epoch), Monitoring(freq=monitoring_freq, monitor_fn=monitor_fn, ddout=[ m_nll_upper_bound, m_recon_term, m_kl_term, max_phi_sig, mean_phi_sig, min_phi_sig, max_prior_sig, mean_prior_sig, min_prior_sig, max_theta_sig, mean_theta_sig, min_theta_sig, max_x, mean_x, min_x, max_theta_mu, mean_theta_mu, min_theta_mu ], data=[ Iterator(train_data, m_batch_size, start=0, end=112640), Iterator(valid_data, m_batch_size, start=2040064, end=2152704) ]), Picklize(freq=monitoring_freq, force_save_freq=force_saving_freq, path=save_path), EarlyStopping(freq=monitoring_freq, force_save_freq=force_saving_freq, path=save_path, channel=channel_name), WeightNorm() ] mainloop = Training(name=pkl_name, data=Iterator(train_data, batch_size, start=0, end=2040064), model=model, optimizer=optimizer, cost=nll_upper_bound, outputs=[nll_upper_bound], extension=extension) mainloop.run()
def clip_norm(g, c, n):
    if c > 0:
        g = T.switch(T.ge(n, c), g * c / n, g)
    return g
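# A minimal sketch of clipping a gradient by its global L2 norm (assumes
# `import theano.tensor as T`; the threshold 5.0 is arbitrary):
grad = T.vector('grad')
grad_norm = T.sqrt(T.sum(T.sqr(grad)))
clipped = clip_norm(grad, 5.0, grad_norm)  # rescaled to norm 5 whenever grad_norm >= 5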
def elu(z):
    # https://arxiv.org/pdf/1511.07289v1.pdf
    return T.switch(T.ge(z, 0), z, T.exp(z) - 1)
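# A quick evaluation sketch (assumes numpy/theano are importable):
import numpy as np
import theano
import theano.tensor as T

z_in = T.vector('z_in')
print(elu(z_in).eval({z_in: np.array([-2., 0., 1.5], dtype=theano.config.floatX)}))
# -> approximately [-0.865, 0., 1.5]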
def step_rnn(x_t, mask, h_tm1, W, h0):
    h_tm1 = T.switch(mask, h0, h_tm1)
    return [ACTIVATION[Ah](x_t + h_tm1.dot(W))]
def logpow(x, m):
    """
    Calculates log(x**m) since m*log(x) will fail when m, x = 0.
    """
    return switch(eq(x, 0) & eq(m, 0), 0, m * log(x))
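# A corner-case sketch (assumes `from theano.tensor import switch, eq, log` at
# module level, since logpow relies on those bare names):
import theano.tensor as T

xv, mv = T.scalars('xv', 'mv')
print(logpow(xv, mv).eval({xv: 0., mv: 0.}))  # 0.0 rather than nan from 0*log(0)
print(logpow(xv, mv).eval({xv: 2., mv: 3.}))  # 3*log(2) ~= 2.079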
def apply(self, input_):
    res = T.switch(input_ > 0, input_, self.leaky_init * input_)
    return T.switch(T.isnan(res), 0, res)
def leakly_relu(x):
    return T.switch(T.gt(x, 0.), x, x * np.float32(0.2))
def __init__(self, rng, input, is_train, n_in, n_hidden, n_out, p=0.5, dropout=False, input_p=0.1): #, batch_size=20): #Need input dropout layer if input_p!=None: self.input_layer = drop(input, rng=rng, p=input_p) self.input_layer = T.switch(T.neq(is_train, 0), self.input_layer, input) else: self.input_layer=input param_to_scale = [] #To scale weights to square length of 15 self.layer_0 = HiddenLayer( rng=rng, input=self.input_layer, n_in=n_in, n_out=n_hidden[0], activation=prelu, is_train=is_train, p=p, dropout=dropout ) self.params = self.layer_0.params param_to_scale = param_to_scale + [self.layer_0.params[0]] #Add more layers accordingly layer_number = 1 if len(n_hidden)>1: for layer in n_hidden[1:]: current_hidden_layer = HiddenLayer( rng=rng, input=getattr(self, "layer_" + str(layer_number-1)).output, n_in=n_hidden[layer_number-1], n_out=n_hidden[layer_number], activation=prelu, is_train=is_train, p=p, dropout=dropout ) setattr(self, "layer_" + str(layer_number), current_hidden_layer) self.params = self.params + getattr(self, "layer_" + str(layer_number)).params param_to_scale = param_to_scale + [getattr(self, "layer_" + str(layer_number)).params[0]] layer_number = layer_number + 1 # The logistic regression layer gets as input the hidden units # of the hidden layer self.linearRegressionLayer = LinearRegression( input=getattr(self, "layer_" + str(layer_number-1)).output, n_in=n_hidden[layer_number-1], n_out=n_out, rng=rng #,batch_size=batch_size ) self.params = self.params + self.linearRegressionLayer.params #L1 and L2 regularization self.L1 = ( abs(self.layer_0.W).sum() + abs(self.linearRegressionLayer.W).sum() ) self.L2_sqr = ( (self.layer_0.W ** 2).sum() + (self.linearRegressionLayer.W ** 2).sum() ) # # self.negative_log_likelihood = ( # self.logRegressionLayer.negative_log_likelihood # ) # # self.errors = self.logRegressionLayer.errors # self.pred = self.logRegressionLayer.pred # self.diff = self.logRegressionLayer.diff self.param_to_scale = param_to_scale self.errors = self.linearRegressionLayer.errors self.loss = self.linearRegressionLayer.loss self.NRMSE = self.linearRegressionLayer.NRMSE self.pred = self.linearRegressionLayer.pred self.input = input #KEEP IN MIND THIS IS DIFFERENT THAN self.input_layer!!!
def build_sampler(tparams, options, trng): x = tensor.matrix('x', dtype='int64') xr = x[::-1] n_timesteps = x.shape[0] n_samples = x.shape[1] # word embedding (source), forward and backward emb = tparams['Wemb'][x.flatten()] emb = emb.reshape([n_timesteps, n_samples, options['dim_word']]) embr = tparams['Wemb'][xr.flatten()] embr = embr.reshape([n_timesteps, n_samples, options['dim_word']]) # encoder proj = get_layer(options['encoder'])[1](tparams, emb, options, prefix='encoder') projr = get_layer(options['encoder'])[1](tparams, embr, options, prefix='encoder_r') # concatenate forward and backward rnn hidden states ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim - 1) # get the input for decoder rnn initializer mlp ctx_mean = ctx.mean(0) # ctx_mean = concatenate([proj[0][-1],projr[0][-1]], axis=proj[0].ndim-2) init_state = get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_state', activ='tanh') print 'Building f_init...', outs = [init_state, ctx] f_init = theano.function([x], outs, name='f_init', profile=profile) print 'Done' # x: 1 x 1 y = tensor.vector('y_sampler', dtype='int64') init_state = tensor.matrix('init_state', dtype='float32') # if it's the first word, emb should be all zero and it is indicated by -1 emb = tensor.switch(y[:, None] < 0, tensor.alloc(0., 1, tparams['Wemb_dec'].shape[1]), tparams['Wemb_dec'][y]) # apply one step of conditional gru with attention proj = get_layer(options['decoder'])[1](tparams, emb, options, prefix='decoder', mask=None, context=ctx, one_step=True, init_state=init_state) # get the next hidden state next_state = proj[0] # get the weighted averages of context for this target word y ctxs = proj[1] logit_lstm = get_layer('ff')[1](tparams, next_state, options, prefix='ff_logit_lstm', activ='linear') logit_prev = get_layer('ff')[1](tparams, emb, options, prefix='ff_logit_prev', activ='linear') logit_ctx = get_layer('ff')[1](tparams, ctxs, options, prefix='ff_logit_ctx', activ='linear') logit = tensor.tanh(logit_lstm + logit_prev + logit_ctx) logit = get_layer('ff')[1](tparams, logit, options, prefix='ff_logit', activ='linear') # compute the softmax probability next_probs = tensor.nnet.softmax(logit) # sample from softmax distribution to get the sample next_sample = trng.multinomial(pvals=next_probs).argmax(1) # compile a function to do the whole thing above, next word probability, # sampled word for the next target, next hidden state to be used print 'Building f_next..', inps = [y, ctx, init_state] outs = [next_probs, next_sample, next_state] f_next = theano.function(inps, outs, name='f_next', profile=profile) print 'Done' return f_init, f_next
def apply(self, input_):
    input_ = T.switch(T.isnan(input_), 0, input_)
    return T.switch(input_ > 0, input_, 0.01 * input_)
def __init__(self, is_train, rng, input=1, n_in=1, n_out = 500,W=None, b=None, activation=T.tanh, p=0.5): # type: (object, object, object, object, object, object, object, object, object) -> object """ Hidden unit activation is given by: activation(dot(input,W) + b) :type rng: numpy.random.RandomState :param rng: a random number generator used to initialize weights :type is_train: theano.iscalar :param is_train: indicator pseudo-boolean (int) for switching between training and prediction :type input: theano.tensor.dmatrix :param input: a symbolic tensor of shape (n_examples, n_in) :type n_in: int :param n_in: dimensionality of input :type n_out: int :param n_out: number of hidden units :type activation: theano.Op or function :param activation: Non linearity to be applied in the hidden layer :type p: float or double :param p: probability of NOT dropping out a unit """ self.input = input if W is None: W_values = numpy.asarray( rng.uniform( low=-numpy.sqrt(6. / (n_in + n_out)), high=numpy.sqrt(6. / (n_in + n_out)), size=(n_in, n_out) ), dtype=theano.config.floatX ) if activation == theano.tensor.nnet.sigmoid: W_values *= 4 W = theano.shared(value=W_values, name='W', borrow=True) if b is None: b_values = numpy.zeros((n_out,), dtype=theano.config.floatX) b = theano.shared(value=b_values, name='b', borrow=True) self.W = W self.b = b lin_output = T.dot(input, self.W) + self.b output = activation(lin_output) # multiply output and drop -> in an approximation the scaling effects cancel out train_output = drop(output,p) #is_train is a pseudo boolean theano variable for switching between training and prediction self.output = T.switch(T.neq(is_train, 0), train_output, p*output) # parameters of the model self.params = [self.W, self.b]
def relu(x): return T.switch(T.gt(x, 0.), x, 0.)
def hinge_c(x, y): return T.switch(T.lt(1 - x * y, 0), 0 * x, 1 - x * y)
def train( dim_word=100, # word vector dimensionality dim=1000, # the number of LSTM units encoder='gru', decoder='gru_cond', patience=10, # early stopping patience max_epochs=5000, finish_after=10000000, # finish after this many updates dispFreq=100, decay_c=0., # L2 regularization penalty alpha_c=0., # alignment regularization clip_c=-1., # gradient clipping threshold lrate=0.01, # learning rate n_words_src=100000, # source vocabulary size n_words=100000, # target vocabulary size maxlen=100, # maximum length of the description optimizer='rmsprop', batch_size=16, valid_batch_size=16, saveto='model.npz', validFreq=1000, saveFreq=1000, # save the parameters after every saveFreq updates sampleFreq=100, # generate some samples after every sampleFreq datasets=[ '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok', '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok' ], valid_datasets=[ '../data/dev/newstest2011.en.tok', '../data/dev/newstest2011.fr.tok' ], dictionaries=[ '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl', '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl' ], use_dropout=False, reload_=False): # Model options model_options = locals().copy() # load dictionaries and invert them worddicts = [None] * len(dictionaries) worddicts_r = [None] * len(dictionaries) for ii, dd in enumerate(dictionaries): with open(dd, 'rb') as f: worddicts[ii] = pkl.load(f) worddicts_r[ii] = dict() for kk, vv in worddicts[ii].iteritems(): worddicts_r[ii][vv] = kk # reload options if reload_ and os.path.exists(saveto): with open('%s.pkl' % saveto, 'rb') as f: models_options = pkl.load(f) print 'Loading data' train = TextIterator(datasets[0], datasets[1], dictionaries[0], dictionaries[1], n_words_source=n_words_src, n_words_target=n_words, batch_size=batch_size, maxlen=maxlen) valid = TextIterator(valid_datasets[0], valid_datasets[1], dictionaries[0], dictionaries[1], n_words_source=n_words_src, n_words_target=n_words, batch_size=valid_batch_size, maxlen=maxlen) print 'Building model' params = init_params(model_options) # reload parameters if reload_ and os.path.exists(saveto): params = load_params(saveto, params) tparams = init_tparams(params) trng, use_noise, \ x, x_mask, y, y_mask, \ opt_ret, \ cost = \ build_model(tparams, model_options) inps = [x, x_mask, y, y_mask] print 'Buliding sampler' f_init, f_next = build_sampler(tparams, model_options, trng) # before any regularizer print 'Building f_log_probs...', f_log_probs = theano.function(inps, cost, profile=profile) print 'Done' cost = cost.mean() # apply L2 regularization on weights if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv**2).sum() weight_decay *= decay_c cost += weight_decay # regularize the alpha weights if alpha_c > 0. and not model_options['decoder'].endswith('simple'): alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c') alpha_reg = alpha_c * ( (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] - opt_ret['dec_alphas'].sum(0))**2).sum(1).mean() cost += alpha_reg # after all regularizers - compile the computational graph for cost print 'Building f_cost...', f_cost = theano.function(inps, cost, profile=profile) print 'Done' print 'Computing gradient...', grads = tensor.grad(cost, wrt=itemlist(tparams)) print 'Done' # apply gradient clipping here if clip_c > 0.: g2 = 0. 
for g in grads: g2 += (g**2).sum() new_grads = [] for g in grads: new_grads.append( tensor.switch(g2 > (clip_c**2), g / tensor.sqrt(g2) * clip_c, g)) grads = new_grads # compile the optimizer, the actual computational graph is compiled here lr = tensor.scalar(name='lr') print 'Building optimizers...', f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) print 'Done' print 'Optimization' history_errs = [] # reload history if reload_ and os.path.exists(saveto): history_errs = list(numpy.load(saveto)['history_errs']) best_p = None bad_count = 0 if validFreq == -1: validFreq = len(train[0]) / batch_size if saveFreq == -1: saveFreq = len(train[0]) / batch_size if sampleFreq == -1: sampleFreq = len(train[0]) / batch_size uidx = 0 estop = False for eidx in xrange(max_epochs): n_samples = 0 for x, y in train: n_samples += len(x) uidx += 1 use_noise.set_value(1.) x, x_mask, y, y_mask = prepare_data(x, y, maxlen=maxlen, n_words_src=n_words_src, n_words=n_words) if x is None: print 'Minibatch with zero sample under length ', maxlen uidx -= 1 continue ud_start = time.time() # compute cost, grads and copy grads to shared variables cost = f_grad_shared(x, x_mask, y, y_mask) # do the update on parameters f_update(lrate) ud = time.time() - ud_start # check for bad numbers, usually we remove non-finite elements # and continue training - but not done here if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected' return 1., 1., 1. # verbose if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud # save the best model so far if numpy.mod(uidx, saveFreq) == 0: print 'Saving...', if best_p is not None: params = best_p else: params = unzip(tparams) numpy.savez(saveto, history_errs=history_errs, **params) pkl.dump(model_options, open('%s.pkl' % saveto, 'wb')) print 'Done' # generate some samples with the model and display them if numpy.mod(uidx, sampleFreq) == 0: # FIXME: random selection? for jj in xrange(numpy.minimum(5, x.shape[1])): stochastic = True sample, score = gen_sample(tparams, f_init, f_next, x[:, jj][:, None], model_options, trng=trng, k=1, maxlen=30, stochastic=stochastic, argmax=False) print 'Source ', jj, ': ', for vv in x[:, jj]: if vv == 0: break if vv in worddicts_r[0]: print worddicts_r[0][vv], else: print 'UNK', print print 'Truth ', jj, ' : ', for vv in y[:, jj]: if vv == 0: break if vv in worddicts_r[1]: print worddicts_r[1][vv], else: print 'UNK', print print 'Sample ', jj, ': ', if stochastic: ss = sample else: score = score / numpy.array([len(s) for s in sample]) ss = sample[score.argmin()] for vv in ss: if vv == 0: break if vv in worddicts_r[1]: print worddicts_r[1][vv], else: print 'UNK', print # validate model on validation set and early stop if necessary if numpy.mod(uidx, validFreq) == 0: use_noise.set_value(0.) valid_errs = pred_probs(f_log_probs, prepare_data, model_options, valid) valid_err = valid_errs.mean() history_errs.append(valid_err) if uidx == 0 or valid_err <= numpy.array(history_errs).min(): best_p = unzip(tparams) bad_counter = 0 if len(history_errs) > patience and valid_err >= \ numpy.array(history_errs)[:-patience].min(): bad_counter += 1 if bad_counter > patience: print 'Early Stop!' estop = True break if numpy.isnan(valid_err): ipdb.set_trace() print 'Valid ', valid_err # finish after this many updates if uidx >= finish_after: print 'Finishing after %d iterations!' 
% uidx estop = True break print 'Seen %d samples' % n_samples if estop: break if best_p is not None: zipp(best_p, tparams) use_noise.set_value(0.) valid_err = pred_probs(f_log_probs, prepare_data, model_options, valid).mean() print 'Valid ', valid_err params = copy.copy(best_p) numpy.savez(saveto, zipped_params=best_p, history_errs=history_errs, **params) return valid_err
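# The global-norm gradient clipping used inside train() above, isolated into
# a small helper for clarity (a sketch; `clip_c` plays the same role as the
# train() argument of that name).
import theano
import theano.tensor as tensor

def clip_by_global_norm(grads, clip_c):
    g2 = 0.
    for g in grads:
        g2 += (g ** 2).sum()
    return [tensor.switch(g2 > (clip_c ** 2),
                          g / tensor.sqrt(g2) * clip_c,
                          g) for g in grads]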
def quantization(W,Wacc,method, Wb): if method == "FPN": Wb = W elif method == "LAB": L = (T.sqrt(Wacc) + 1e-8) Wb = hard_sigmoid(W) Wb = round3(Wb) Wb = T.cast(T.switch(Wb,1.,-1.), theano.config.floatX) alpha = (T.abs_(L*W).sum()/L.sum()).astype('float32') Wb = alpha*Wb elif method=="LATa": D = (T.sqrt(Wacc) + 1e-8) b = T.sgn(Wb) # compute the threshold, converge within 10 iterations alpha = (T.abs_(b*D*W).sum()/T.abs_(b*D).sum()).astype('float32') b = T.switch(T.gt(W/alpha, 0.5), 1., T.switch(T.lt(W/alpha, -0.5), -1., 0.) ) def OneStep(alpha, b): # minimize alpha alpha_new = (T.abs_(b*D*W).sum()/T.abs_(b*D).sum()).astype('float32') # minimize b b_new = T.switch(T.gt(W/alpha_new, 0.5), 1., T.switch(T.lt(W/alpha_new, -0.5), -1., 0.)) delta = T.abs_(alpha_new-alpha) condition = T.lt(delta, 1e-6) return [alpha_new, b_new], theano.scan_module.until(condition) [out1, out2], updates = theano.scan(fn=OneStep ,outputs_info=[alpha, b],n_steps=10) Wb = out1[-1]*out2[-1] elif method=="LATe": D = (T.sqrt(Wacc) + 1e-8) thres = findalpha(D, W) alpha = thres*2 Wt = T.switch(T.gt(W, thres), 1., T.switch(T.lt(W, -thres), -1., 0.) ) Wb = alpha*Wt elif method=="LAT2e": D = (T.sqrt(Wacc) + 1e-8) thres1, thres2 = findalpha2(D, W) alpha1 = thres1*2 Wt1 = T.switch(T.gt(W, thres1), 1., 0.) alpha2 = thres2*2 Wt2 = T.switch(T.lt(W, -thres2), -1., 0.) Wb = alpha1*Wt1 + alpha2*Wt2 elif method=="LAT2a": D = (T.sqrt(Wacc) + 1e-8) b1 = T.ge(Wb,0) alpha1 = (T.abs_(b1*D*W).sum()/T.abs_(b1*D).sum()).astype('float32') b1 = T.switch(T.gt(W/alpha1, 0.5), 1., 0.) # Wb1 = alpha1*mask1*Wb b2 = T.lt(Wb,0) alpha2 = (T.abs_(b2*D*W).sum()/T.abs_(b2*D).sum()).astype('float32') b2 = T.switch(T.lt(W/alpha2, -0.5), -1., 0.) def OneStep(alpha1, b1, alpha2, b2): alpha1_new = (T.abs_(b1*D*W).sum()/T.abs_(b1*D).sum()).astype('float32') b1_new = T.switch(T.gt(W/alpha1_new, 0.5), 1., 0.) alpha2_new = (T.abs_(b2*D*W).sum()/T.abs_(b2*D).sum()).astype('float32') b2_new = T.switch(T.lt(W/alpha2_new, -0.5), -1., 0.) delta1 = T.abs_(alpha1_new-alpha1) delta2 = T.abs_(alpha2_new-alpha2) condition = T.lt(delta1, 1e-6) and T.lt(delta2, 1e-6) return [alpha1_new, b1_new, alpha2_new, b2_new], theano.scan_module.until(condition) [out1, out2, out3, out4], updates = theano.scan(fn=OneStep ,outputs_info=[alpha1, b1, alpha2, b2],n_steps=10) Wb = out1[-1]*out2[-1] + out3[-1]*out4[-1] elif method=="LAQ_linear": D = (T.sqrt(Wacc) + 1e-8) b = T.sgn(Wb) alpha = (T.abs_(b*D*W).sum()/T.abs_(b*D).sum()).astype('float32') # b = T.switch(T.gt(W/alpha, 0.5), 1., T.switch(T.lt(W/alpha, -0.5), -1., 0.) ) m = 3 # number of bits n = 2**(m-1)-1 b = round3(T.clip(W/alpha, -1., 1.)*n)/(n) def OneStep(alpha, b): # minimize alpha alpha_new = (T.abs_(b*D*W).sum()/T.abs_(b*D).sum()).astype('float32') # minimize b # b_new = T.switch(T.gt(W/alpha, 0.5), 1., T.switch(T.lt(W/alpha, -0.5), -1., 0.)) b_new = round3(T.clip(W/alpha_new, -1., 1.)*n)/(n) delta = T.abs_(alpha_new-alpha) condition = T.lt(delta, 1e-6) return [alpha_new, b_new], theano.scan_module.until(condition) [out1, out2], updates = theano.scan(fn=OneStep ,outputs_info=[alpha, b],n_steps=10) Wb = out1[-1]*out2[-1] elif method=="LAQ_log": D = (T.sqrt(Wacc) + 1e-8) b = T.sgn(Wb) alpha = (T.abs_(b*D*W).sum()/T.abs_(b*D).sum()).astype('float32') m = 3 # number of bits n = 2**(m-1)-1 tmp = T.clip(W/alpha, -1., 1.) 
# log2(1/2*(2^(-n)+2^(-(n+1)))) - (-n-(n+1))/2 = 0.0849625 b = T.switch( T.ge(tmp, pow(2, -n)), T.pow(2, round3(T.log2(tmp)-0.0849625)), T.switch( T.le(tmp, -pow(2,-n)), -T.pow(2, round3(T.log2(-tmp)-0.0849625)), 0.)) b = T.switch(T.ge(b, pow(2, - (n-1))), b, T.switch(T.le(b, -pow(2, -(n-1))), b, T.sgn(b)*pow(2,-(n-1)))) def OneStep(alpha, b): # minimize alpha alpha_new = (T.abs_(b*D*W).sum()/T.abs_(b*D).sum()).astype('float32') # minimize b tmp_new = T.clip(W/alpha_new, -1., 1.) b_new = T.switch( T.ge(tmp_new, pow(2, -n)), T.pow(2, round3(T.log2(tmp_new)-0.0849625)), T.switch( T.le(tmp_new, -pow(2, -n)), -T.pow(2, round3(T.log2(-tmp_new)-0.0849625)), 0.)) b_new = T.switch(T.ge(b_new, pow(2, - (n-1))), b_new, T.switch(T.le(b_new, -pow(2, -(n-1))), b_new, T.sgn(b_new)*pow(2, -(n-1)))) delta = T.abs_(alpha_new-alpha) condition = T.lt(delta, 1e-6) return [alpha_new, b_new], theano.scan_module.until(condition) [out1, out2], updates = theano.scan(fn=OneStep ,outputs_info=[alpha, b],n_steps=10) Wb = out1[-1]*out2[-1] return Wb
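# The LAT*/LAQ branches above all follow the same alternating-minimization
# pattern: fix b, solve for alpha in closed form, re-threshold b, and stop
# early via theano.scan_module.until. A stripped-down sketch of that loop
# (W and D are assumed to be existing symbolic tensors, as above). Note that
# combining two symbolic stopping criteria, as the LAT2a branch attempts with
# Python's `and`, calls for the symbolic T.and_ instead; Python's `and` does
# not build a symbolic conjunction.
import theano
import theano.tensor as T

def ternarize(W, D, n_steps=10, tol=1e-6):
    b0 = T.sgn(W)
    alpha0 = (T.abs_(b0 * D * W).sum() / T.abs_(b0 * D).sum()).astype('float32')

    def one_step(alpha, b):
        # closed-form alpha for fixed b, then re-threshold b for fixed alpha
        alpha_new = (T.abs_(b * D * W).sum() / T.abs_(b * D).sum()).astype('float32')
        b_new = T.switch(T.gt(W / alpha_new, 0.5), 1.,
                         T.switch(T.lt(W / alpha_new, -0.5), -1., 0.))
        converged = T.lt(T.abs_(alpha_new - alpha), tol)
        return [alpha_new, b_new], theano.scan_module.until(converged)

    [alphas, bs], _ = theano.scan(one_step, outputs_info=[alpha0, b0],
                                  n_steps=n_steps)
    return alphas[-1] * bs[-1]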
def hmc_updates(positions, stepsize, avg_acceptance_rate, final_pos, accept,
                target_acceptance_rate, stepsize_inc, stepsize_dec,
                stepsize_min, stepsize_max, avg_acceptance_slowness):
    """This function is executed after `n_steps` of HMC sampling
    (`hmc_move` function). It creates the updates dictionary used by the
    `simulate` function. It takes care of updating: the position (if the
    move is accepted), the stepsize (to track a given target acceptance
    rate) and the average acceptance rate (computed as a moving average).

    Parameters
    ----------
    positions: shared variable, theano matrix
        Shared theano matrix whose rows contain the old position
    stepsize: shared variable, theano scalar
        Shared theano scalar containing current step size
    avg_acceptance_rate: shared variable, theano scalar
        Shared theano scalar containing the current average acceptance rate
    final_pos: shared variable, theano matrix
        Shared theano matrix whose rows contain the new position
    accept: theano scalar
        Boolean-type variable indicating whether the proposed HMC move
        should be accepted.
    target_acceptance_rate: float
        The stepsize is modified in order to track this target acceptance
        rate.
    stepsize_inc: float
        Amount by which to increment stepsize when acceptance rate is too
        high.
    stepsize_dec: float
        Amount by which to decrement stepsize when acceptance rate is too
        low.
    stepsize_min: float
        Lower-bound on `stepsize`.
    stepsize_max: float
        Upper-bound on `stepsize`.
    avg_acceptance_slowness: float
        Average acceptance rate is computed as an exponential moving
        average. (1-avg_acceptance_slowness) is the weight given to the
        newest observation.

    Returns
    -------
    rval1: dictionary-like
        A dictionary of updates to be used by the `HMC_Sampler.simulate`
        function. The updates target the position, stepsize and average
        acceptance rate.
    """
    ## POSITION UPDATES ##
    # broadcast `accept` scalar to tensor with the same dimensions as
    # final_pos.
    accept_matrix = accept.dimshuffle(0, *(('x', ) * (final_pos.ndim - 1)))
    # if accept is True, update to `final_pos` else stay put
    new_positions = TT.switch(accept_matrix, final_pos, positions)
    # end-snippet-5 start-snippet-7
    ## STEPSIZE UPDATES ##
    # if acceptance rate is too low, our sampler is too "noisy" and we reduce
    # the stepsize. If it is too high, our sampler is too conservative, we can
    # get away with a larger stepsize (resulting in better mixing).
    _new_stepsize = TT.switch(avg_acceptance_rate > target_acceptance_rate,
                              stepsize * stepsize_inc, stepsize * stepsize_dec)
    # maintain stepsize in [stepsize_min, stepsize_max]
    new_stepsize = TT.clip(_new_stepsize, stepsize_min, stepsize_max)
    # end-snippet-7 start-snippet-6
    ## ACCEPT RATE UPDATES ##
    # perform exponential moving average
    mean_dtype = theano.scalar.upcast(accept.dtype, avg_acceptance_rate.dtype)
    new_acceptance_rate = TT.add(avg_acceptance_slowness * avg_acceptance_rate,
                                 (1.0 - avg_acceptance_slowness) *
                                 accept.mean(dtype=mean_dtype))
    # end-snippet-6 start-snippet-8
    return [(positions, new_positions),
            (stepsize, new_stepsize),
            (avg_acceptance_rate, new_acceptance_rate)]
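# A small numeric check of the row-wise accept/reject switch used above:
# `accept` is broadcast across the non-batch dimensions so each chain either
# takes its proposed position or keeps its old one (a sketch with toy shapes).
import numpy as np
import theano
import theano.tensor as TT

positions = TT.matrix('positions')
final_pos = TT.matrix('final_pos')
accept = TT.vector('accept')

accept_matrix = accept.dimshuffle(0, *(('x',) * (final_pos.ndim - 1)))
new_positions = TT.switch(accept_matrix, final_pos, positions)
f = theano.function([accept, final_pos, positions], new_positions)

old = np.zeros((2, 3), dtype=theano.config.floatX)
new = np.ones((2, 3), dtype=theano.config.floatX)
acc = np.array([1, 0], dtype=theano.config.floatX)
print(f(acc, new, old))  # first row accepted (ones), second row kept (zeros)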
def relu(x): # Using T.nnet.relu gives me NaNs. No idea why. return T.switch(x > lib.floatX(0), x, lib.floatX(0))
def updates(self): ups = {} add_updates = lambda b: safe_update(ups, b) base_lr = numpy.asarray( self.conf['base_lr_per_example'] / self.conf['batchsize'], floatX) annealing_coef = clip_ramp(self.iter, (self.conf['lr_anneal_start'], base_lr), (self.conf['lr_anneal_end'], 0.0), dtype=floatX) ups[self.iter] = self.iter + 1 ups[self.annealing_coef] = annealing_coef # # Enforcing Sparsity # pos_h = self.rbm.mean_h_given_v(self.visible_batch) sparsity_cost = 0 KL_eps = 1e-4 if self.conf['sparsity_KL_featuretarget_weight']: p = self.conf['sparsity_KL_featuretarget_target'] sparsity_cost = sparsity_cost + tensor.mul( self.conf['sparsity_KL_featuretarget_weight'], -tensor.sum( p * tensor.log(tensor.mean(pos_h, axis=0) + KL_eps) + (1 - p) * tensor.log(1 - tensor.mean(pos_h, axis=0) + KL_eps))) assert sparsity_cost.ndim == 0 assert sparsity_cost.dtype == 'float32' if self.conf['sparsity_KL_exampletarget_weight']: p = self.conf['sparsity_KL_exampletarget_target'] sparsity_cost = sparsity_cost + tensor.mul( self.conf['sparsity_KL_exampletarget_weight'], -tensor.sum( p * tensor.log(tensor.mean(pos_h, axis=1) + KL_eps) + (1 - p) * tensor.log(1 - tensor.mean(pos_h, axis=1) + KL_eps))) assert sparsity_cost.ndim == 0 assert sparsity_cost.dtype == 'float32' # # Updates related to CD # # These updates are for CD-1, PCD/SML, and a stochastic interpolation # between them. # # The idea is to start from negative phase particles neg_v, run them # through a step of Gibbs, and put them into sampler.particles: # # neg_v -> Gibbs -> sampler.particles. # # We control the kind of CD by adjusting what neg_v is: either it is the # visible_batch (for CD-1) or it is the old sampler.particles (PCD). We # can interpolate between the two algorithms by stochastically choosing # either the visible_batch[i] or the old particles[i] on a row-by-row # basis. # if self.conf['CD_anneal_start'] < self.conf['CD_anneal_end']: P_restart = clip_ramp(self.iter, (self.conf['CD_anneal_start'], 1.0), (self.conf['CD_anneal_end'], 0.0), dtype=floatX) reset_decisions = self.sampler.s_rng.uniform( size=(self.conf['batchsize'], )) < P_restart v0_ndim = self.visible_batch.ndim # broadcast reset_decisions over all but batch idx neg_v0 = tensor.switch( reset_decisions.dimshuffle(0, *(['x'] * (v0_ndim - 1))), self.visible_batch, # reset the chain to data self.sampler.particles) # continue old chain else: neg_v0 = self.sampler.particles neg_v1 = self.rbm.gibbs_step_for_v(neg_v0, self.sampler.s_rng) ups[self.sampler.particles] = neg_v1 ## N.B. we are manually advancing the sampler, not calling ## Gibbs.updates() # add_updates(self.sampler.updates()) learn_rates = [ self.annealing_coef * self.lr_dict[p] for p in self.rbm.params() ] add_updates( self.rbm.cd_updates(pos_v=self.visible_batch, neg_v=neg_v1, lr=learn_rates, other_cost=sparsity_cost)) # # Gathering statistics of unit activity # neg_h = self.rbm.mean_h_given_v(neg_v1) self.pos_h_means = sharedX(numpy.zeros(self.rbm.h_shp) + 0.5, 'pos_h') self.neg_h_means = sharedX(numpy.zeros(self.rbm.h_shp) + 0.5, 'neg_h') ups[self. pos_h_means] = 0.1 * pos_h.mean(axis=0) + .9 * self.pos_h_means ups[self. neg_h_means] = 0.1 * neg_h.mean(axis=0) + .9 * self.neg_h_means # Clipping parameters to legal ranges for clipper in self.clippers: ups = clipper.filter_update(ups) return ups
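# The feature-wise sparsity penalty in updates() above is a KL-style term
# that pushes each hidden unit's mean activation toward a target p. Isolated
# sketch of just that term (p = 0.05 is an illustrative value, not taken
# from the configuration above):
import theano
import theano.tensor as tensor

pos_h = tensor.matrix('pos_h')   # stands in for rbm.mean_h_given_v(visible_batch)
p = 0.05                         # target mean activation per hidden unit
KL_eps = 1e-4
q = tensor.mean(pos_h, axis=0)
sparsity_cost = -tensor.sum(p * tensor.log(q + KL_eps)
                            + (1 - p) * tensor.log(1 - q + KL_eps))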
def compute_cost(self, features, features_mask, labels, labels_mask, speaker, start_flag, batch_size, raw_audio=None): if speaker is None: assert not self.use_speaker target_features = features[1:] mask = features_mask[1:] cell_shape = (mask.shape[0], batch_size, self.rnn_h_dim) gat_shape = (mask.shape[0], batch_size, 2 * self.rnn_h_dim) cell_h1 = tensor.zeros(cell_shape, dtype=floatX) cell_h2 = tensor.zeros(cell_shape, dtype=floatX) cell_h3 = tensor.zeros(cell_shape, dtype=floatX) gat_h1 = tensor.zeros(gat_shape, dtype=floatX) gat_h2 = tensor.zeros(gat_shape, dtype=floatX) gat_h3 = tensor.zeros(gat_shape, dtype=floatX) if self.weak_feedback: input_features = features[:-1] if self.feedback_noise_level: noise = self.theano_rng.normal(size=input_features.shape, avg=0., std=1.) input_features += self.noise_level_var * noise out_cell_h1, out_gat_h1 = self.out_to_h1.apply(input_features) to_normalize = [out_cell_h1, out_gat_h1] out_cell_h1, out_gat_h1 = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] cell_h1 += out_cell_h1 gat_h1 += out_gat_h1 if self.full_feedback: assert self.weak_feedback out_cell_h2, out_gat_h2 = self.out_to_h2.apply(input_features) out_cell_h3, out_gat_h3 = self.out_to_h3.apply(input_features) to_normalize = [out_cell_h2, out_gat_h2, out_cell_h3, out_gat_h3] out_cell_h2, out_gat_h2, out_cell_h3, out_gat_h3 = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] cell_h2 += out_cell_h2 gat_h2 += out_gat_h2 cell_h3 += out_cell_h3 gat_h3 += out_gat_h3 if self.use_speaker: speaker = speaker[:, 0] emb_speaker = self.embed_speaker.apply(speaker) emb_speaker = tensor.shape_padleft(emb_speaker) spk_cell_h1, spk_gat_h1 = self.speaker_to_h1.apply(emb_speaker) spk_cell_h2, spk_gat_h2 = self.speaker_to_h2.apply(emb_speaker) spk_cell_h3, spk_gat_h3 = self.speaker_to_h3.apply(emb_speaker) to_normalize = [ spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, spk_cell_h3, spk_gat_h3 ] spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, \ spk_cell_h3, spk_gat_h3, = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] cell_h1 = spk_cell_h1 + cell_h1 cell_h2 = spk_cell_h2 + cell_h2 cell_h3 = spk_cell_h3 + cell_h3 gat_h1 = spk_gat_h1 + gat_h1 gat_h2 = spk_gat_h2 + gat_h2 gat_h3 = spk_gat_h3 + gat_h3 initial_h1, last_h1, initial_h2, last_h2, initial_h3, last_h3, \ initial_w, last_w, initial_k, last_k = \ self.initial_states(batch_size) # If it's a new example, use initial states. 
input_h1 = tensor.switch(start_flag, initial_h1, last_h1) input_h2 = tensor.switch(start_flag, initial_h2, last_h2) input_h3 = tensor.switch(start_flag, initial_h3, last_h3) input_w = tensor.switch(start_flag, initial_w, last_w) input_k = tensor.switch(start_flag, initial_k, last_k) context_oh = self.encoder.apply(labels) * \ tensor.shape_padright(labels_mask) u = tensor.shape_padleft(tensor.arange(labels.shape[1], dtype=floatX), 2) def step(inp_h1_t, gat_h1_t, inp_h2_t, gat_h2_t, inp_h3_t, gat_h3_t, h1_tm1, h2_tm1, h3_tm1, k_tm1, w_tm1, context_oh): attinp_h1, attgat_h1 = self.inp_to_h1.apply(w_tm1) inp_h1_t += attinp_h1 gat_h1_t += attgat_h1 h1_t = self.rnn1.apply(inp_h1_t, gat_h1_t, h1_tm1, iterate=False) a_t, b_t, k_t = self.h1_to_att.apply(h1_t) if self.attention_type == "softmax": a_t = tensor.nnet.softmax(a_t) + self.epsilon else: a_t = tensor.exp(a_t) + self.epsilon b_t = tensor.exp(b_t) + self.epsilon k_t = k_tm1 + self.attention_alignment * tensor.exp(k_t) a_t_ = a_t a_t = tensor.shape_padright(a_t) b_t = tensor.shape_padright(b_t) k_t_ = tensor.shape_padright(k_t) # batch size X att size X len context if self.attention_type == "softmax": # numpy.sqrt(1/(2*numpy.pi)) is the weird number phi_t = 0.3989422917366028 * tensor.sum( a_t * tensor.sqrt(b_t) * tensor.exp(-0.5 * b_t * (k_t_ - u)**2), axis=1) else: phi_t = tensor.sum(a_t * tensor.exp(-b_t * (k_t_ - u)**2), axis=1) # batch size X len context X num letters w_t = (tensor.shape_padright(phi_t) * context_oh).sum(axis=1) attinp_h2, attgat_h2 = self.inp_to_h2.apply(w_t) attinp_h3, attgat_h3 = self.inp_to_h3.apply(w_t) inp_h2_t += attinp_h2 gat_h2_t += attgat_h2 inp_h3_t += attinp_h3 gat_h3_t += attgat_h3 h1inp_h2, h1gat_h2 = self.h1_to_h2.apply(h1_t) h1inp_h3, h1gat_h3 = self.h1_to_h3.apply(h1_t) to_normalize = [h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3] h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3 = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] h2_t = self.rnn2.apply(inp_h2_t + h1inp_h2, gat_h2_t + h1gat_h2, h2_tm1, iterate=False) h2inp_h3, h2gat_h3 = self.h2_to_h3.apply(h2_t) to_normalize = [h2inp_h3, h2gat_h3] h2inp_h3, h2gat_h3 = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] h3_t = self.rnn3.apply(inp_h3_t + h1inp_h3 + h2inp_h3, gat_h3_t + h1gat_h3 + h2gat_h3, h3_tm1, iterate=False) return h1_t, h2_t, h3_t, k_t, w_t, phi_t, a_t_ (h1, h2, h3, k, w, phi, pi_att), scan_updates = theano.scan( fn=step, sequences=[cell_h1, gat_h1, cell_h2, gat_h2, cell_h3, gat_h3], non_sequences=[context_oh], outputs_info=[ input_h1, input_h2, input_h3, input_k, input_w, None, None ]) h1_out = self.h1_to_readout.apply(h1) h2_out = self.h2_to_readout.apply(h2) h3_out = self.h3_to_readout.apply(h3) to_normalize = [h1_out, h2_out, h3_out] h1_out, h2_out, h3_out = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] readouts = h1_out + h2_out + h3_out if self.use_speaker: readouts += self.speaker_to_readout.apply(emb_speaker) readouts += self.att_to_readout.apply(w) predicted = self.readout_to_output.apply(readouts) if self.which_cost == 'MSE': if self.use_speaker: predicted += self.speaker_to_output.apply(emb_speaker) cost = tensor.sum((predicted - target_features)**2, axis=-1) next_x = predicted # Dummy value for coeff coeff = predicted elif self.which_cost == 'GMM': mu, sigma, coeff = predicted if self.use_speaker: spk_to_out = self.speaker_to_output.apply(emb_speaker) mu += spk_to_out[0] sigma += spk_to_out[1] coeff += spk_to_out[2] # When training there should not be sampling_bias sigma = tensor.exp(sigma) + self.epsilon coeff = 
tensor.nnet.softmax(coeff.reshape( (-1, self.k_gmm))).reshape(coeff.shape) + self.epsilon cost = cost_gmm(target_features, mu, sigma, coeff) next_x = sample_gmm(mu, sigma, coeff, self.theano_rng) cost = (cost * mask).sum() / (mask.sum() + 1e-5) + 0. * start_flag updates = [] updates.append((last_h1, h1[-1])) updates.append((last_h2, h2[-1])) updates.append((last_h3, h3[-1])) updates.append((last_k, k[-1])) updates.append((last_w, w[-1])) cost_raw = None if self.raw_output: raw_mask = tensor.extra_ops.repeat(features_mask, 80, axis=0) raw_mask = raw_mask.dimshuffle(1, 0) # breakpointOp = PdbBreakpoint("Raw mask breakpoint") # condition = tensor.gt(raw_mask.shape[0], 0) # raw_mask = breakpointOp(condition, raw_mask) predicted_transposed = predicted.dimshuffle(1, 0, 2) last_h0, last_big_h0 = self.sampleRnn.initial_states(batch_size) raw_audio_reshaped = raw_audio.dimshuffle(1, 0, 2) raw_audio_reshaped = raw_audio_reshaped.reshape( (raw_audio_reshaped.shape[0], -1)) cost_raw, ip_cost, all_params, ip_params, other_params, new_h0, new_big_h0 =\ self.sampleRnn.apply(raw_audio_reshaped, predicted_transposed, last_h0, last_big_h0, start_flag, raw_mask) if self.sampleRnn.N_RNN == 1: new_h0 = tensor.unbroadcast(new_h0, 1) new_big_h0 = tensor.unbroadcast(new_big_h0, 1) updates.append((last_h0, new_h0)) updates.append((last_big_h0, new_big_h0)) # cost = cost + 80.*cost_raw alpha_ = numpy.float32(0.) beta_ = numpy.float32(1.) cost = alpha_ * cost + beta_ * cost_raw attention_vars = [next_x, k, w, coeff, phi, pi_att] return cost, scan_updates + updates, attention_vars, cost_raw
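# The start_flag mechanism above in one place: persistent state lives in a
# shared variable, is swapped for the initial state whenever a new example
# starts, and is written back after the scan. A sketch with a plain scalar
# recurrence standing in for the GRU stack (toy shapes, assumed names).
import numpy
import theano
import theano.tensor as tensor
floatX = theano.config.floatX

batch_size, dim = 4, 8
last_h = theano.shared(numpy.zeros((batch_size, dim), dtype=floatX), name='last_h')
initial_h = tensor.zeros((batch_size, dim), dtype=floatX)
start_flag = tensor.scalar('start_flag')
inputs = tensor.tensor3('inputs')  # (time, batch, dim)

# if it's a new example, use the initial state, else continue from last batch
input_h = tensor.switch(start_flag, initial_h, last_h)
h, _ = theano.scan(lambda x_t, h_tm1: tensor.tanh(x_t + h_tm1),
                   sequences=inputs, outputs_info=input_h)
step_fn = theano.function([inputs, start_flag], h[-1],
                          updates=[(last_h, h[-1])])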
def out_shape(imgshape, ds, ignore_border=False, st=None):
    """Return the shape of the output from this op, for input of given
    shape and flags.

    :param imgshape: the shape of a tensor of images. The last two elements
        are interpreted as the number of rows, and the number of cols.
    :type imgshape: tuple, list, or similar of integer or scalar Theano
        variable.

    :param ds: downsample factor over rows and columns; this parameter
        indicates the size of the pooling region.
    :type ds: list or tuple of two ints

    :param st: the stride size. This is the distance between the pooling
        regions. If it is set to None, it equals ds.
    :type st: list or tuple of two ints

    :param ignore_border: if ds doesn't divide imgshape, do we include an
        extra row/col of partial downsampling (False) or ignore it (True).
    :type ignore_border: bool

    :rtype: list
    :returns: the shape of the output from this op, for input of given
        shape. This will have the same length as imgshape, but with the last
        two elements reduced as per the downsampling & ignore_border flags.
    """
    if len(imgshape) < 2:
        raise TypeError('imgshape must have at least two elements '
                        '(rows, cols)')

    if st is None:
        st = ds
    r, c = imgshape[-2:]

    if ignore_border:
        out_r = (r - ds[0]) // st[0] + 1
        out_c = (c - ds[1]) // st[1] + 1
        if isinstance(r, theano.Variable):
            nr = tensor.maximum(out_r, 0)
        else:
            nr = numpy.maximum(out_r, 0)
        if isinstance(c, theano.Variable):
            nc = tensor.maximum(out_c, 0)
        else:
            nc = numpy.maximum(out_c, 0)
    else:
        if isinstance(r, theano.Variable):
            nr = tensor.switch(
                tensor.ge(st[0], ds[0]), (r - 1) // st[0] + 1,
                tensor.maximum(0, (r - 1 - ds[0]) // st[0] + 1) + 1)
        elif st[0] >= ds[0]:
            nr = (r - 1) // st[0] + 1
        else:
            nr = max(0, (r - 1 - ds[0]) // st[0] + 1) + 1
        if isinstance(c, theano.Variable):
            nc = tensor.switch(
                tensor.ge(st[1], ds[1]), (c - 1) // st[1] + 1,
                tensor.maximum(0, (c - 1 - ds[1]) // st[1] + 1) + 1)
        elif st[1] >= ds[1]:
            nc = (c - 1) // st[1] + 1
        else:
            nc = max(0, (c - 1 - ds[1]) // st[1] + 1) + 1
    rval = list(imgshape[:-2]) + [nr, nc]
    return rval
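# Concrete check of the shape arithmetic above with plain Python ints (no
# symbolic branch is taken when imgshape holds integers); assumes the
# numpy/theano imports of the surrounding module.
print(out_shape((3, 1, 7, 7), ds=(2, 2), ignore_border=True))            # -> [3, 1, 3, 3]
print(out_shape((3, 1, 7, 7), ds=(2, 2), ignore_border=False))           # -> [3, 1, 4, 4]
print(out_shape((3, 1, 7, 7), ds=(3, 3), st=(2, 2), ignore_border=True))  # -> [3, 1, 3, 3]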
def RMSprop(self, cost, params, full_params, sampled_params, sidxs, epsilon=1e-6): grads = [T.grad(cost=cost, wrt=param) for param in params] sgrads = [T.grad(cost=cost, wrt=sparam) for sparam in sampled_params] updates = OrderedDict() if self.grad_cap > 0: norm = T.cast( T.sqrt( T.sum([ T.sum([T.sum(g**2) for g in g_list]) for g_list in grads ]) + T.sum([T.sum(g**2) for g in sgrads])), theano.config.floatX) grads = [[ T.switch(T.ge(norm, self.grad_cap), g * self.grad_cap / norm, g) for g in g_list ] for g_list in grads] sgrads = [ T.switch(T.ge(norm, self.grad_cap), g * self.grad_cap / norm, g) for g in sgrads ] for p_list, g_list in zip(params, grads): for p, g in zip(p_list, g_list): if self.adapt: if self.adapt == 'adagrad': g = self.adagrad(p, g, updates) if self.adapt == 'rmsprop': g = self.rmsprop(p, g, updates) if self.adapt == 'adadelta': g = self.adadelta(p, g, updates) if self.adapt == 'adam': g = self.adam(p, g, updates) if self.momentum > 0: velocity = theano.shared(p.get_value(borrow=False) * 0., borrow=True) velocity2 = self.momentum * velocity - np.float32( self.learning_rate) * (g + self.lmbd * p) updates[velocity] = velocity2 updates[p] = p + velocity2 else: updates[p] = p * np.float32(1.0 - self.learning_rate * self.lmbd) - np.float32( self.learning_rate) * g for i in range(len(sgrads)): g = sgrads[i] fullP = full_params[i] sample_idx = sidxs[i] sparam = sampled_params[i] if self.adapt: if self.adapt == 'adagrad': g = self.adagrad(fullP, g, updates, sample_idx) if self.adapt == 'rmsprop': g = self.rmsprop(fullP, g, updates, sample_idx) if self.adapt == 'adadelta': g = self.adadelta(fullP, g, updates, sample_idx) if self.adapt == 'adam': g = self.adam(fullP, g, updates, sample_idx) if self.lmbd > 0: delta = np.float32( self.learning_rate) * (g + self.lmbd * sparam) else: delta = np.float32(self.learning_rate) * g if self.momentum > 0: velocity = theano.shared(fullP.get_value(borrow=False) * 0., borrow=True) vs = velocity[sample_idx] velocity2 = self.momentum * vs - delta updates[velocity] = T.set_subtensor(vs, velocity2) updates[fullP] = T.inc_subtensor(sparam, velocity2) else: updates[fullP] = T.inc_subtensor(sparam, -delta) return updates
def clear_nan(x): return T.switch(T.isnan(x), np.float32(0.0), x)
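# Quick check of clear_nan on a concrete array (float32 to match the
# np.float32 replacement value used above):
import numpy as np
import theano
import theano.tensor as T

v = T.vector('v', dtype='float32')
f = theano.function([v], clear_nan(v))
print(f(np.array([1.0, np.nan, -2.5], dtype='float32')))  # -> [ 1.   0.  -2.5]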
def leaky(self, X): return T.switch(T.ge(X, 0), X, self.leak * X)
def build_sampler(tparams, options, trng): x = tensor.matrix('x', dtype='int64') n_timesteps = x.shape[0] n_samples = x.shape[1] # word embedding (source) emb = tparams['Wemb'][x.flatten()] emb = emb.reshape([n_timesteps, n_samples, options['dim_word']]) # encoder proj = get_layer(options['encoder'])[1](tparams, emb, options, prefix='encoder') ctx = proj[0][-1] ctx_mean = ctx init_state = get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_state', activ='tanh') print 'Building f_init...', outs = [init_state, ctx] f_init = theano.function([x], outs, name='f_init', profile=profile) print 'Done' # y: 1 x 1 y = tensor.vector('y_sampler', dtype='int64') init_state = tensor.matrix('init_state', dtype='float32') # if it's the first word, emb should be all zero emb = tensor.switch(y[:, None] < 0, tensor.alloc(0., 1, tparams['Wemb_dec'].shape[1]), tparams['Wemb_dec'][y]) # apply one step of gru layer proj = get_layer(options['decoder'])[1](tparams, emb, options, prefix='decoder', mask=None, context=ctx, one_step=True, init_state=init_state) next_state = proj ctxs = ctx # compute the output probability dist and sample logit_lstm = get_layer('ff')[1](tparams, next_state, options, prefix='ff_logit_lstm', activ='linear') logit_prev = get_layer('ff')[1](tparams, emb, options, prefix='ff_logit_prev', activ='linear') logit_ctx = get_layer('ff')[1](tparams, ctxs, options, prefix='ff_logit_ctx', activ='linear') logit = tensor.tanh(logit_lstm + logit_prev + logit_ctx) logit = get_layer('ff')[1](tparams, logit, options, prefix='ff_logit', activ='linear') next_probs = tensor.nnet.softmax(logit) next_sample = trng.multinomial(pvals=next_probs).argmax(1) # next word probability print 'Building f_next..', inps = [y, ctx, init_state] outs = [next_probs, next_sample, next_state] f_next = theano.function(inps, outs, name='f_next', profile=profile) print 'Done' return f_init, f_next
def in_test_phase(x, alt):
    # return `x` at test time and `alt` during training
    # (_LEARNING_PHASE is nonzero while training)
    x = T.switch(_LEARNING_PHASE, alt, x)
    x._uses_learning_phase = True
    return x
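# Sketch of how the switch above is exercised. `_LEARNING_PHASE` is a
# module-level symbolic flag defined elsewhere in this backend; a stand-in
# scalar is used here so the check is self-contained.
import theano
import theano.tensor as T

learning_phase = T.iscalar('learning_phase')   # stand-in for _LEARNING_PHASE
test_val = T.vector('test_val')
train_val = T.vector('train_val')
out = T.switch(learning_phase, train_val, test_val)  # same switch as in_test_phase
f = theano.function([test_val, train_val, learning_phase], out,
                    allow_input_downcast=True)
print(f([1., 1.], [0., 0.], 0))  # -> [1., 1.]  (test phase keeps `x`)
print(f([1., 1.], [0., 0.], 1))  # -> [0., 0.]  (training uses `alt`)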
def elu(self, X): return T.switch(T.ge(X, 0), X, self.elu_param * (T.exp(X) - 1))
def switch(condition, then_expression, else_expression):
    '''condition: scalar tensor.
    Returns `then_expression` if `condition` is nonzero,
    otherwise `else_expression`.
    '''
    return T.switch(condition, then_expression, else_expression)
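# Minimal use of the wrapper above: pick between two expressions based on a
# scalar condition (input values are illustrative).
import theano
import theano.tensor as T

cond = T.iscalar('cond')
a = T.vector('a')
b = T.vector('b')
f = theano.function([cond, a, b], switch(cond, a, b), allow_input_downcast=True)
# f(1, [1., 2.], [3., 4.]) -> [1., 2.];  f(0, [1., 2.], [3., 4.]) -> [3., 4.]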
best_ppl = np.inf validation_ppl_history = [] gsums = [theano.shared(np.zeros_like(param.get_value(borrow=True))) for param in net.params] cost = net.cost(y) + L2_REG * net.L2_sqr gparams = T.grad(cost, net.params) updates = OrderedDict() # Compute norm of gradients norm = T.sqrt(T.sum([T.sum(gparam**2) for gparam in gparams])) # Adagrad: "Adaptive subgradient methods for online learning and stochastic optimization" (2011) for gparam, param, gsum in zip(gparams, net.params, gsums): gparam = T.switch(T.ge(norm, CLIPPING_THRESHOLD), gparam / norm * CLIPPING_THRESHOLD, gparam) # Clipping of gradients updates[gsum] = gsum + (gparam**2) updates[param] = param - lr * (gparam / (T.sqrt(updates[gsum] + 1e-6))) train_model = theano.function(inputs=[x, y, lr], outputs=cost, updates=updates) validate_model = theano.function(inputs=[x, y], outputs=net.cost(y)) print("Training...") for epoch in range(starting_epoch, MAX_EPOCHS): t0 = time() total_neg_log_likelihood = 0 total_num_output_samples = 0 iteration = 0 for X, Y in get_minibatch(data.TRAIN_FILE, MINIBATCH_SIZE, shuffle=True): total_neg_log_likelihood += train_model(X, Y, learning_rate)