def errors(self, y):
    """Return the mean mismatch rate between binarised predictions and `y`.

    `self.y_pred` is thresholded at 0.5: entries below 0.5 are set to 0,
    then entries of the intermediate result that are >= 0.5 are set to 1.
    The returned symbolic scalar is the mean of the element-wise
    "not equal" comparison between that binarised tensor and `y`.

    :param y: symbolic target variable; must have the same number of
        dimensions as the binarised prediction.
    :raises TypeError: if `y.ndim` differs from the prediction's `ndim`.
    """
    below = (self.y_pred < 0.5).nonzero()
    binarised = T.set_subtensor(self.y_pred[below], 0)
    at_or_above = (binarised >= 0.5).nonzero()
    binarised = T.set_subtensor(binarised[at_or_above], 1)

    if y.ndim == binarised.ndim:
        return T.mean(T.neq(binarised, y))

    raise TypeError(
        'y should have the same shape as self.y_pred',
        ('y', y.type, 'y_pred', binarised.type)
    )
def get_output(self,y,y_mask,init_state,train=False):
    """Run `self._step` over the one-step-delayed targets and return the logits.

    The first two axes of the layer input, its mask, `y` and `y_mask` are
    swapped so that `theano.scan` iterates over what was axis 1
    (presumably inputs arrive as (batch, time, ...) -- confirm with caller).
    `y` and `y_mask` are then delayed by one step along the scanned axis:
    position 0 is zero-filled and position t receives the value from t - 1.

    :param y: 3-D symbolic target tensor.
    :param y_mask: 2-D symbolic mask matching the first two axes of `y`.
    :param init_state: initial value of the recurrent output of `self._step`.
    :param train: forwarded to `self.get_input`.
    :return: the second scan output with its first two axes swapped back.
    """
    def _delay_one_step(seq):
        # Zero at step 0, original value of step t - 1 at step t.
        delayed = T.zeros_like(seq)
        return T.set_subtensor(delayed[1:], seq[:-1])

    context = self.get_input(train)
    context_mask = self.previous.x_mask
    context = context.dimshuffle((1, 0, 2))
    context_mask = context_mask.dimshuffle((1, 0))

    targets = _delay_one_step(y.dimshuffle((1, 0, 2)))
    targets_mask = _delay_one_step(y_mask.dimshuffle((1, 0)))

    # Projections of the delayed targets are computed once, outside the scan.
    proj_z = T.dot(targets, self.W_z) + self.b_z
    proj_r = T.dot(targets, self.W_r) + self.b_r
    proj_h = T.dot(targets, self.W_h) + self.b_h

    (hidden_seq, logit), scan_updates = theano.scan(
        self._step,
        sequences=[targets, proj_z, proj_r, proj_h, targets_mask],
        outputs_info=[init_state, None],
        non_sequences=[context, context_mask])

    return logit.dimshuffle((1, 0, 2))
def fprop_step_mask(self, state_below, mask, state_before, U):
    """
    Scan function for case using masks

    The state is a single matrix whose first `self.dim` columns hold the
    hidden part and whose remaining columns hold the cell part.

    Parameters
    ----------
    state_below : TheanoTensor
        Input contribution to the four gate pre-activations; sliced in
        blocks of `self.dim` columns (input, forget, output, candidate).
    mask : TheanoTensor
        Per-row mask; rows with 0 keep `state_before` unchanged.
    state_before : TheanoTensor
        Previous combined [hidden; cell] state.
    U : TheanoTensor
        Recurrent weights applied to the hidden part of `state_before`.

    Returns
    -------
    TheanoTensor
        The new combined state, blended with `state_before` by `mask`.
    """
    dim = self.dim
    prev_cell = state_before[:, dim:]
    pre_act = state_below + tensor.dot(state_before[:, :dim], U)

    in_gate = tensor.nnet.sigmoid(pre_act[:, :dim])
    forget_gate = tensor.nnet.sigmoid(pre_act[:, dim:2*dim])
    out_gate = tensor.nnet.sigmoid(pre_act[:, 2*dim:3*dim])
    candidate = tensor.tanh(pre_act[:, 3*dim:])

    # Write the new cell first; the hidden part is derived from it.
    new_state = tensor.set_subtensor(
        state_before[:, dim:],
        forget_gate * prev_cell + in_gate * candidate)
    new_state = tensor.set_subtensor(
        new_state[:, :dim],
        out_gate * tensor.tanh(new_state[:, dim:]))

    # Only update the state for non-masked rows; masked rows carry the
    # previous state forward.
    keep = mask[:, None]
    return keep * new_state + (1 - keep) * state_before
def call(self, X):
    """Attenuate everything outside a (scaled) box in each frame.

    `X` must be a list `[frame, position]`.  The last three axes of
    `frame` are (channels, height, width); the last axis of `position`
    holds box coordinates whose columns 0/2 are compared against the
    horizontal grid and columns 1/3 against the vertical grid, both grids
    spanning [-1, 1].  The box is grown by `self.scale / 2` times its
    extent on every side and the coordinates are clamped to [-1, 1].
    Pixels inside the box keep their value; all others are multiplied by
    `self.alpha`.

    :raises Exception: if `X` is not a list of exactly two elements.
    :return: tensor with the same shape as `frame`.
    """
    if type(X) is not list or len(X) != 2:
        raise Exception("SquareAttention must be called on a list of two tensors. Got: " + str(X))
    frame, position = X

    # Collapse all leading axes (e.g. time) into one.
    frameShape = K.shape(frame)
    positionShape = K.shape(position)
    (chans, height, width) = frameShape[-3:]
    targetDim = positionShape[-1]
    frame = K.reshape(frame, (-1, chans, height, width))
    position = K.reshape(position, (-1, targetDim))

    # Half-extents are taken from the box before any coordinate is moved.
    half_w = THT.abs_(position[:, 2] - position[:, 0]) * self.scale / 2.0
    half_h = THT.abs_(position[:, 3] - position[:, 1]) * self.scale / 2.0

    # (column, signed shift, clamp op, bound), applied in this order.
    # Note that each clamp acts on the whole position tensor.
    adjustments = (
        (0, -half_w, THT.maximum, -1.0),
        (2, half_w, THT.minimum, 1.0),
        (1, -half_h, THT.maximum, -1.0),
        (3, half_h, THT.minimum, 1.0),
    )
    for col, shift, clamp, bound in adjustments:
        moved = THT.set_subtensor(position[:, col], position[:, col] + shift)
        position = clamp(moved, bound)

    rX = Data.linspace(-1.0, 1.0, width)
    rY = Data.linspace(-1.0, 1.0, height)
    x_lo, y_lo, x_hi, y_hi = [position[:, c].dimshuffle(0, 'x') for c in range(4)]
    FX = THT.gt(rX, x_lo) * THT.le(rX, x_hi)
    FY = THT.gt(rY, y_lo) * THT.le(rY, y_hi)

    # Outer product of the row and column indicators gives the box mask;
    # entries equal to 0 are then raised to alpha, entries > 0 stay as is.
    m = FY.dimshuffle(0, 1, 'x') * FX.dimshuffle(0, 'x', 1)
    m = m + self.alpha - THT.gt(m, 0.) * self.alpha
    frame = frame * m.dimshuffle(0, 'x', 1, 2)

    # Restore the original leading axes.
    return K.reshape(frame, frameShape)
def pad(inp, padding):
    """Zero-pad a 4-D or 5-D symbolic tensor.

    `padding` is a sequence of `[before, after]` pairs.  For 4-D input it
    must contain two pairs, applied to axes 2 and 3.  For 5-D input it
    must contain three pairs, applied to axes 3, 4 and 1 respectively.
    If every padding value is 0 the input is returned unchanged.

    :raises NotImplementedError: if `inp` is neither 4-D nor 5-D.
    """
    if all(padval == 0 for padval in pyk.flatten(padding)):
        return inp

    def _span(pair):
        # Destination slice of the original data inside the padded axis;
        # a zero "after" pad must map to None, since -0 would be empty.
        return slice(pair[0], -pair[1] if pair[1] != 0 else None)

    if inp.ndim == 4:
        canvas = T.zeros(shape=(inp.shape[0],
                                inp.shape[1],
                                inp.shape[2] + sum(padding[0]),
                                inp.shape[3] + sum(padding[1])))
        y_span, x_span = [_span(padval) for padval in padding]
        return T.set_subtensor(canvas[:, :, y_span, x_span], inp)

    if inp.ndim == 5:
        canvas = T.zeros(shape=(inp.shape[0],
                                inp.shape[1] + sum(padding[2]),
                                inp.shape[2],
                                inp.shape[3] + sum(padding[0]),
                                inp.shape[4] + sum(padding[1])))
        y_span, x_span, z_span = [_span(padval) for padval in padding]
        return T.set_subtensor(canvas[:, z_span, :, y_span, x_span], inp)

    raise NotImplementedError("Padding is only implemented for 4 and 5 dimensional tensors.")
def mask_k_maxpooling(variable, variable_shape ,axis, k):
    """Build a 0/1 mask marking the k largest entries along `axis`.

    Params:
        variable: tensor2D
        variable_shape: concrete (rows, cols) shape of `variable`
        axis: 0 or 1, the dimension along which maxima are searched
        k: number of maxima to mark per column (axis 0) or row (axis 1)
    ------
    Return:
        mask: tensor2D of dtype floatX, 1 at the selected positions and
              0 elsewhere.  For any other `axis` the mask stays all zero.

    Example (axis=1, k=1):
        variable:  1 2 3        mask:  0 0 1
                   2 7 1  --->         0 1 0
                   1 2 1               0 1 0
    """
    # Already-selected entries are overwritten with this value so that
    # the next argmax picks a different position.
    fill_value = -999999999
    remaining = variable
    mask = T.zeros(variable_shape, dtype=theano.config.floatX)

    for _ in range(k):
        top = T.argmax(remaining, axis=axis)
        if axis == 0:
            where = (top, range(0, variable_shape[1]))
        elif axis == 1:
            where = (range(0, variable_shape[0]), top)
        else:
            continue
        mask = T.set_subtensor(mask[where], 1)
        remaining = T.set_subtensor(remaining[where], fill_value)

    return mask
def pass_edges(input_idx_t, edge_t, edge_mask_t, counter_t, h_tm1, c_tm1, x):
    """One scan step: attention-weighted LSTM update written to column `counter_t`.

    Relies on `self` from the enclosing scope for all weights.

    :param input_idx_t: row of `x` used as the input vector for this step.
    :param edge_t: edge feature vector, concatenated to the input vector.
    :param edge_mask_t: mask over the columns of `h_tm1` / `c_tm1`; an
        all-zero mask means the input vector is used as is, otherwise the
        input vector is zeroed.
    :param counter_t: column of the state matrices that receives the result.
    :param h_tm1: previous hidden-state matrix.
    :param c_tm1: previous cell-state matrix.
    :param x: matrix of input vectors.
    :return: `(h_t, c_t)`, the state matrices with column `counter_t` replaced.
    """
    no_incoming = T.eq(T.sum(edge_mask_t), 0)

    # Select the source input vector; it is zeroed unless the mask is empty.
    x_t_i = x[input_idx_t, :]
    x_t_0 = T.switch(no_incoming, x_t_i, x_t_i*0)
    x_t_edge = T.concatenate([x_t_0, edge_t])

    # Manual masked softmax over the attention scores.
    attention_scores = T.dot(self.v_a, T.tanh(T.dot(self.W_h_a, h_tm1)))
    # Unmasked scores are lifted by 10000 so the max is taken over them only.
    max_score = T.max(attention_scores + edge_mask_t * 10000.0) - 10000.0
    # Mask inside the exponent to avoid inf, and outside to drop masked scores.
    exp_scores = T.exp((attention_scores - max_score) * edge_mask_t) * edge_mask_t
    # Add one to the denominator when the mask is all zeros (avoids 0 / 0).
    exp_scores_sum = T.sum(exp_scores) + T.switch(no_incoming, 1.0, 0.0)
    weighted_mask = exp_scores / exp_scores_sum

    attended_h = weighted_mask * h_tm1

    def _gate(activation, W_x, W_h, bias):
        # Shared form of the four gate pre-activations.
        recurrent = T.sum(T.dot(W_h.T, attended_h).T, axis=0)
        return activation(T.dot(x_t_edge, W_x) + recurrent + bias)

    i_t = _gate(T.nnet.sigmoid, self.W_x_i, self.W_h_i, self.b_h_i)
    f_t = _gate(T.nnet.sigmoid, self.W_x_f, self.W_h_f, self.b_h_f)
    o_t = _gate(T.nnet.sigmoid, self.W_x_o, self.W_h_o, self.b_h_o)
    u_t = _gate(T.tanh, self.W_x_u, self.W_h_u, self.b_h_u)

    c_temp = i_t * u_t + f_t * T.sum((weighted_mask * c_tm1).T, axis=0)
    h_temp = o_t * T.tanh(c_temp)

    h_t = T.set_subtensor(h_tm1[:, counter_t], h_temp)
    c_t = T.set_subtensor(c_tm1[:, counter_t], c_temp)
    return h_t, c_t
def fprop(self, XH):
    """Compute one LSTM step from parent inputs and recurrent states.

    `XH` is a pair `[state_belows, state_befores]`.  Each recurrent state
    is laid out as [hidden; cell], so only its leading columns (as given
    by the sizes in `self.recurrent`) enter the recurrent term.  The
    state at index 0 of `state_befores` is this layer's own state and is
    the one that gets updated.

    Pre-activation columns are used as: `[:nout]` candidate,
    `[nout:2*nout]` input gate, `[2*nout:3*nout]` forget gate and
    `[3*nout:]` output gate.

    :raises AttributeError: if the number of inputs or states does not
        match the number of parents or recurrents.
    :return: the updated [hidden; cell] state, named `self.name`.
    """
    X, H = XH
    if len(X) != len(self.parent):
        raise AttributeError("The number of inputs doesn't match "
                             "with the number of parents.")
    if len(H) != len(self.recurrent):
        raise AttributeError("The number of inputs doesn't match "
                             "with the number of recurrents.")

    nout = self.nout
    own_state = H[0]

    pre_act = T.zeros((X[0].shape[0], 4 * nout))
    for x, (parname, parout) in izip(X, self.parent.items()):
        weight = self.params['W_' + parname + '__' + self.name]
        pre_act = pre_act + T.dot(x[:, :parout], weight)
    for h, (recname, recout) in izip(H, self.recurrent.items()):
        weight = self.params['U_' + recname + '__' + self.name]
        pre_act = pre_act + T.dot(h[:, :recout], weight)
    pre_act = pre_act + self.params['b_' + self.name]

    in_gate = T.nnet.sigmoid(pre_act[:, nout:2 * nout])
    forget_gate = T.nnet.sigmoid(pre_act[:, 2 * nout:3 * nout])
    out_gate = T.nnet.sigmoid(pre_act[:, 3 * nout:])

    # Cell first, then the hidden part computed from the new cell.
    new_state = T.set_subtensor(
        own_state[:, nout:],
        forget_gate * own_state[:, nout:] +
        in_gate * self.nonlin(pre_act[:, :nout]))
    new_state = T.set_subtensor(
        new_state[:, :nout],
        out_gate * self.nonlin(new_state[:, nout:]))
    new_state.name = self.name
    return new_state
def crop_images(data, image_shape, border_width=8, mode=0):
    """
    Function used to crop the images by a certain border width.

    data : input data, theano 4D tensor
    image_shape : 4-tuple, (batch_size, num_channels, image_rows, image_cols)
    border_width : border width to be cropped, default value 8
    mode : binary, 0 for random, 1 for centered crop.

    Returns a symbolic tensor of shape
    (batch_size, num_channels, image_rows - border_width,
    image_cols - border_width).

    Notes
    -----
    * In random mode the offset of each image is drawn with
      `numpy.random.randint` while the graph is being built, and the same
      offset is used for rows and columns of that image.
    * The centered offset uses floor division, so it stays an integer
      slice bound regardless of the interpreter's `/` semantics.
    * `border_width == 0` in random mode uses offset 0 instead of failing
      inside `numpy.random.randint(0)`.
    """
    batch_size, num_channels = image_shape[0], image_shape[1]
    row_step = image_shape[2] - border_width
    col_step = image_shape[3] - border_width
    random_crop = (mode == 0)

    output = T.alloc(0., batch_size, num_channels, row_step, col_step)
    for i in range(batch_size):
        if random_crop:
            begin_idx = numpy.random.randint(border_width) if border_width > 0 else 0
        else:
            begin_idx = border_width // 2
        output = T.set_subtensor(
            output[i, :, :, :],
            data[i, :,
                 begin_idx:(begin_idx + row_step),
                 begin_idx:(begin_idx + col_step)])
    return output
def output(self, input=None, dropout_active=True, *args, **kwargs):
    """Return the layer output: optional dropout, wrap-around padding, convolution.

    :param input: symbolic input; when None, the output of
        `self.input_layer` is used (called with the same arguments).
    :param dropout_active: apply dropout when True and `self.dropout > 0`.
    :return: `self.nonlinearity` applied to the biased convolution result.
    """
    # Identity test: `== None` would be evaluated element-wise by
    # array-like inputs instead of answering "was an input supplied?".
    if input is None:
        input = self.input_layer.output(dropout_active=dropout_active, *args, **kwargs)

    if dropout_active and (self.dropout > 0.):
        retain_prob = 1 - self.dropout
        mask = layers.srng.binomial(input.shape, p=retain_prob, dtype='int32').astype('float32')
        # Apply the mask and rescale now, so that no weight rescaling is
        # needed at test time.
        input = input / retain_prob * mask

    # Pad axis 1 so the valid convolution amounts to a circular one:
    # (filter_size - stride) leading values are copied to the end.
    wrap = self.filter_size - self.stride
    extent = input.shape[1]
    input_padded = T.zeros((input.shape[0], extent + wrap, input.shape[2], input.shape[3]))
    input_padded = T.set_subtensor(input_padded[:, :extent, :, :], input)
    input_padded = T.set_subtensor(input_padded[:, extent:, :, :], input[:, :wrap, :, :])

    contiguous_input = gpu_contiguous(input_padded)
    contiguous_filters = gpu_contiguous(self.W)
    conved = self.filter_acts_op(contiguous_input, contiguous_filters)

    if self.untie_biases:
        conved += self.b.dimshuffle(0, 1, 2, 'x')
    else:
        conved += self.b.dimshuffle(0, 'x', 'x', 'x')

    return self.nonlinearity(conved)
def update_log_p(skip_idxs,zeros,active,log_p_curr,log_p_prev):
    """Advance a log-probability vector by one step over a growing active prefix.

    Presumably one step of a CTC-style forward recursion -- confirm with
    the caller.  What the code does:

    * keeps the entries of `skip_idxs` that are smaller than `active`;
    * grows the active length to
      `min(max(active + 1, max(kept skip idxs, -1) + 3), len(log_p_curr))`;
    * exponentiates `log_p_prev[:active]` relative to its maximum, adds
      each entry to its right neighbour and to the position two to the
      right of every kept skip index, and goes back to log space;
    * adds `log_p_curr` over the new active prefix.

    :return: `(active_next, log_p_next)`, where `log_p_next` is `zeros`
        with its first `active_next` entries replaced.
    """
    active_skip_idxs = skip_idxs[(skip_idxs < active).nonzero()]

    # The appended -1 keeps T.max defined when no skip index is active.
    furthest_skip = T.max(T.concatenate([active_skip_idxs, [-1]]))
    grown = T.maximum(active + 1, furthest_skip + 2 + 1)
    active_next = T.cast(T.minimum(grown, log_p_curr.shape[0]), 'int32')

    # Work in probability space relative to the max to limit underflow.
    common_factor = T.max(log_p_prev[:active])
    p_prev = T.exp(log_p_prev[:active] - common_factor)

    acc = zeros[:active_next]
    # copy over
    acc = T.set_subtensor(acc[:active], p_prev)
    # previous transitions
    acc = T.inc_subtensor(acc[1:], acc[:-1])
    # skip transitions
    acc = T.inc_subtensor(acc[active_skip_idxs + 2], p_prev[active_skip_idxs])

    updated_log_p_prev = T.log(acc) + common_factor
    log_p_next = T.set_subtensor(
        zeros[:active_next],
        log_p_curr[:active_next] + updated_log_p_prev)
    return active_next, log_p_next
def __init__(self, input):
    """3-in-1 max pooling along axis 0: window of 3, stride of 2.

    Output row i is the maximum of input rows 2i - 1, 2i and 2i + 1
    (where they exist).

    :param input: previous layer; must expose `output` (symbolic tensor)
        and `output_shape` (sequence whose first entry is the row count).

    Floor division is used for the halved sizes so they remain integers
    (and valid slice bounds) irrespective of the `/` semantics in effect.
    """
    self.output_shape = input.output_shape[0] // 2, input.output_shape[1]
    self.origlayer = input

    half = input.output.shape[0] // 2
    # Start from the even rows ...
    pooled = input.output[::2]
    # ... fold in the odd row to the right of each even row ...
    pooled = T.set_subtensor(
        pooled[:half],
        T.maximum(pooled[:half], input.output[1::2]))
    # ... and the odd row to the left of every even row but the first.
    pooled = T.set_subtensor(
        pooled[1:],
        T.maximum(pooled[1:], input.output[1:-1:2]))
    self.output = pooled
def _step(c, c_m, hidden, c_matrix):
    """One scan step of a binary tree LSTM over a batch of samples.

    Relies on `self`, `n_samples` and `_slice` from the enclosing scope.

    :param c: integer matrix with one row per sample; columns are
        (node index, left child index, right child index).
    :param c_m: per-sample mask; samples with 0 keep their stored values.
    :param hidden: hidden states, indexed as [node, sample, :].
    :param c_matrix: cell states, indexed as [node, sample, :].
    :return: `(hidden, c_matrix)` with the entries at
        [node index, sample] replaced by the new masked states.
    """
    node_idx, left_child_idx, right_child_idx = c[:, 0], c[:, 1], c[:, 2]
    all_samples = T.arange(n_samples)

    left_h = hidden[left_child_idx, all_samples, :]
    right_h = hidden[right_child_idx, all_samples, :]
    recursive = T.dot(left_h, self.W) + T.dot(right_h, self.U) + self.b

    # Blocks 0..4 of the pre-activation: input gate, one forget gate per
    # child, output gate, candidate.
    i = T.nnet.sigmoid(_slice(recursive, 0, self.dim_proj))
    f1 = T.nnet.sigmoid(_slice(recursive, 1, self.dim_proj))
    f2 = T.nnet.sigmoid(_slice(recursive, 2, self.dim_proj))
    o = T.nnet.sigmoid(_slice(recursive, 3, self.dim_proj))
    c_prime = T.tanh(_slice(recursive, 4, self.dim_proj))

    new_c = (
        i * c_prime +
        f1 * c_matrix[left_child_idx, all_samples, :] +
        f2 * c_matrix[right_child_idx, all_samples, :]
    )

    keep = c_m[:, None]
    drop = 1.0 - c_m[:, None]
    new_c_masked = keep * new_c + drop * c_matrix[node_idx, all_samples, :]
    new_h = o * T.tanh(new_c_masked)
    new_h_masked = keep * new_h + drop * hidden[node_idx, all_samples, :]

    updated_hidden = T.set_subtensor(hidden[node_idx, all_samples], new_h_masked)
    updated_cells = T.set_subtensor(c_matrix[node_idx, all_samples], new_c_masked)
    return updated_hidden, updated_cells
def T_subspacel1_slow_shrinkage(a,L,lam_sparse,lam_slow,small_value=.001):
    """Compose a "slow" amplitude shrinkage with a subspace-L1 shrinkage.

    Rows of `a` are treated as pairs (even row, odd row); the amplitude
    of a pair is `sqrt(even**2 + odd**2 + small_value)`.

    1. Slow shrinkage: a second-difference term of the amplitude along
       axis 1 (first differences at the two borders) scales both rows of
       each pair by `max(1 - (lam_slow / L) * div / amp, 0)`.
    2. Subspace-L1 shrinkage: the result is scaled by
       `max(1 - (lam_sparse / L) / amp', 0)`, where `amp'` is the pair
       amplitude after step 1 (without `small_value`).

    :return: tensor shaped like `a` holding the shrunk coefficients.
    """
    even_rows = a[::2, :]
    odd_rows = a[1::2, :]
    amp = T.sqrt(even_rows**2 + odd_rows**2 + small_value)

    # slow shrinkage
    first_diff = amp[:, 1:] - amp[:, :-1]
    second_diff = first_diff[:, 1:] - first_diff[:, :-1]
    div = T.zeros_like(amp)
    div = T.set_subtensor(div[:, 1:-1], -second_diff)
    div = T.set_subtensor(div[:, 0], -first_diff[:, 0])
    div = T.set_subtensor(div[:, -1], first_diff[:, -1])

    slow_factor = 1 - (lam_slow/L)*(div/amp)
    slow_factor = T.switch(T.gt(slow_factor, 0), slow_factor, 0)
    slow_even = slow_factor*even_rows
    slow_odd = slow_factor*odd_rows

    # subspace l1 shrinkage
    slow_amp = T.sqrt(slow_even**2 + slow_odd**2)
    l1_factor = 1. - (lam_sparse/L)/slow_amp
    l1_factor = T.switch(T.gt(l1_factor, 0.), l1_factor, 0.)

    result = T.zeros_like(a)
    result = T.set_subtensor(result[::2, :], l1_factor*slow_even)
    result = T.set_subtensor(result[1::2, :], l1_factor*slow_odd)
    return result
def compile_dream(self, X_train, dream_state, initializer):
    """Compile `self.dream_update`, a function that optimises a dream input.

    A shared input `self.X_dream` is created from `dream_state` followed
    by values from `initializer`.  Only the part after the first
    `len(dream_state)` entries of axis 1 is passed through softmax and
    later updated; the leading part is left untouched by the update.

    :param X_train: array whose shape gives the dream input shape (axis 0
        is forced to 1, axis 1 is reduced by `len(dream_state)`).
    :param dream_state: fixed leading values of the dream input.
    :param initializer: callable taking a shape tuple and returning the
        initial free values.

    Side effects: sets `self.dream_compiled`, `self.X_dream` and
    `self.dream_update`, and calls `self.optimizer.build`.
    """
    self.dream_compiled = True
    n_fixed = len(dream_state)

    X_dream_shape = list(X_train.shape)
    X_dream_shape[0] = 1
    X_dream_shape[1] -= n_fixed
    X_dream = initializer(tuple(X_dream_shape))
    self.X_dream = theano.shared(atleast_4d(np.append(dream_state, X_dream).astype('float32')))

    current_layer = self.X_dream
    # set_subtensor returns a new variable; the result has to be kept,
    # otherwise the softmax never enters the graph.
    current_layer = T.set_subtensor(
        current_layer[:, n_fixed:, :],
        Activations.softmax(current_layer[:, n_fixed:, :]))

    for layer, params in zip(self.layers, self.params_shared):
        current_layer = layer.get_output(current_layer, params, testing=True)
    y_hat_dream = current_layer.flatten(1)

    self.optimizer.build([[self.X_dream.get_value()]])
    dream_updates = list(self.optimizer.get_updates([self.X_dream], -y_hat_dream[0]))

    # Restrict the update at index 1 to the free part of the dream input.
    original_var = dream_updates[1][0][:, n_fixed:, :]
    new_var = dream_updates[1][1][:, n_fixed:, :]
    dream_updates[1] = (self.X_dream, T.set_subtensor(original_var, new_var))

    self.dream_update = theano.function(
        inputs=[],
        outputs=y_hat_dream,
        updates=dream_updates
    )
def update_hard_stack(stack_t, stack_pushed, stack_merged, push_value,
                      merge_value, mask):
    """Compute the new value of the given hard stack.

    Both outcomes are computed for every example (somewhat wastefully)
    and `mask` then selects one of them per example.

    Args:
        stack_t: Current stack value
        stack_pushed: Helper stack structure, of same size as `stack_t`
        stack_merged: Helper stack structure, of same size as `stack_t`
        push_value: Batch of values to be pushed
        merge_value: Batch of merge results
        mask: Batch of booleans: 1 if merge, 0 if push

    Returns:
        The next stack: the merged copy where `mask` is 1, the pushed
        copy where it is 0.
    """
    # Push: new value on top, previous contents moved down by one slot.
    pushed = T.set_subtensor(stack_pushed[:, 0], push_value)
    pushed = T.set_subtensor(pushed[:, 1:], stack_t[:, :-1])

    # Merge: result on top, everything below the two merged slots moved
    # up by one slot.
    merged = T.set_subtensor(stack_merged[:, 0], merge_value)
    merged = T.set_subtensor(merged[:, 1:-1], stack_t[:, 2:])

    # Broadcast the per-example flag over the remaining two axes.
    merge_flag = T.cast(mask.dimshuffle(0, "x", "x"),
                        dtype=theano.config.floatX)
    return merge_flag * merged + (1. - merge_flag) * pushed
def T_subspacel1_slow_shrinkage_conv(a, L, lam_sparse, lam_slow, imshp,kshp,featshp,stride=(1,1),small_value=.001):
    """Convolutional variant of the slow + subspace-L1 shrinkage.

    `a` is transposed and reshaped to
    `(imshp[0], kshp[0], featshp[2], featshp[3])`.  Feature maps are
    paired along axis 1 (even index, odd index) and the amplitude of a
    pair is `sqrt(even**2 + odd**2 + small_value)`.

    1. Slow shrinkage: a second-difference term of the amplitude along
       axis 0 (first differences at the two borders) scales both maps of
       each pair by `max(1 - (lam_slow / L) * div / amp, 0)`.
    2. Subspace-L1 shrinkage: the result is scaled by
       `max(1 - (lam_sparse / L) / amp', 0)`, where `amp'` is the pair
       amplitude after step 1 (without `small_value`).

    `stride` is accepted but not used by this function.

    :return: the shrunk coefficients, reshaped to 2-D and transposed
        back to the layout of `a`.
    """
    featshp = (imshp[0], kshp[0], featshp[2], featshp[3])
    features = T.reshape(T.transpose(a), featshp, ndim=4)
    even_maps = features[:, ::2, :, :]
    odd_maps = features[:, 1::2, :, :]
    amp = T.sqrt(even_maps**2 + odd_maps**2 + small_value)

    # slow shrinkage
    first_diff = amp[1:, :, :, :] - amp[:-1, :, :, :]
    second_diff = first_diff[1:, :, :, :] - first_diff[:-1, :, :, :]
    div = T.zeros_like(amp)
    div = T.set_subtensor(div[1:-1, :, :, :], -second_diff)
    div = T.set_subtensor(div[0, :, :, :], -first_diff[0, :, :, :])
    div = T.set_subtensor(div[-1, :, :, :], first_diff[-1, :, :, :])

    slow_factor = 1 - (lam_slow / L) * (div / amp)
    slow_factor = T.switch(T.gt(slow_factor, 0), slow_factor, 0)
    slow_even = slow_factor * even_maps
    slow_odd = slow_factor * odd_maps

    # subspace l1 shrinkage
    slow_amp = T.sqrt(slow_even ** 2 + slow_odd ** 2)
    l1_factor = 1. - (lam_sparse / L) / slow_amp
    l1_factor = T.switch(T.gt(l1_factor, 0.), l1_factor, 0.)

    result = T.zeros_like(features)
    result = T.set_subtensor(result[:, ::2, :, :], l1_factor * slow_even)
    result = T.set_subtensor(result[:, 1::2, :, :], l1_factor * slow_odd)

    flat_shape = (featshp[0], featshp[1]*featshp[2]*featshp[3])
    return T.transpose(T.reshape(result, flat_shape, ndim=2))
def __setitem__(self, ind, a):
    """Assign `a` to the entries of `self._data` selected by `ind`.

    `ind` is first translated by `self._data_index_`.  When `a` is a
    `psarray_base` it must live on the same grid object, and its `_data`
    is what gets assigned; any other value is assigned directly.
    `self._data` is rebound to the new symbolic tensor.
    """
    ind = self._data_index_(ind)
    if isinstance(a, psarray_base):
        assert a.grid is self.grid
        value = a._data
    else:
        value = a
    self._data = T.set_subtensor(self._data[ind], value)
def global_contrast_normalize(self, X, scale=1., subtract_mean=True, use_std=False, sqrt_bias=0., min_divisor=1e-8):
    """Global contrast normalisation along the last axis of `X`.

    :param X: 3-D or 4-D symbolic tensor.
    :param scale: the normaliser is divided by this value.
    :param subtract_mean: subtract the mean over the last axis first.
    :param use_std: normalise by the standard deviation of `X` over the
        last axis instead of the norm of the (mean-subtracted) data.
    :param sqrt_bias: added inside the square root.
    :param min_divisor: normalisers smaller than this are replaced by 1
        so that nothing is divided by a near-zero value.
    :raises NotImplementedError: if `X` is neither 3-D nor 4-D.
    :return: normalised tensor shaped like `X`.
    """
    ndim = X.ndim
    if ndim not in (3, 4):
        raise NotImplementedError("X.dim>4 or X.ndim<3")

    scale = float(scale)
    last_axis = ndim - 1

    def _expand(stat):
        # Re-append the reduced last axis so `stat` broadcasts against X.
        if ndim == 3:
            return stat[:, :, None]
        return stat[:, :, :, None]

    mean = X.mean(axis=last_axis)
    new_X = X.copy()
    if subtract_mean:
        new_X = X - _expand(mean)

    if use_std:
        normalizers = T.sqrt(sqrt_bias + X.var(axis=last_axis)) / scale
    else:
        normalizers = T.sqrt(sqrt_bias + (new_X ** 2).sum(axis=last_axis)) / scale

    # Don't normalize by anything too small.  set_subtensor returns a new
    # variable, so the result must be kept for the guard to take effect.
    too_small = (normalizers < min_divisor).nonzero()
    normalizers = T.set_subtensor(normalizers[too_small], 1.)

    new_X /= _expand(normalizers)
    return new_X
def get_odd_even_energy(X, P, H, W, V, U, b, b_0, b_L, d, Lambda, b_p,
                        marginalize_visible):
    """Average the energies of two partially mean-updated hidden states.

    Two copies of `H` are built: one with its odd columns replaced by
    `update_odd_mu(...)`, one with its even columns replaced by
    `update_even_mu(...)`.  `get_energy` is evaluated on both and the
    mean of the two energies is returned.

    When `marginalize_visible` is true, `get_energy` is called with
    `x_marginalized` / `p_marginalized` set to "even" for the odd-updated
    copy and "odd" for the even-updated copy; otherwise both are None.
    """
    h_odd_marginalized = T.set_subtensor(
        H[:, 1::2], update_odd_mu(X, P, H, W, V, U, b, b_L))
    h_even_marginalized = T.set_subtensor(
        H[:, ::2], update_even_mu(X, P, H, W, V, U, b, b_0, b_L))

    if marginalize_visible:
        odd_case, even_case = "even", "odd"
    else:
        odd_case = even_case = None

    def _energy(hidden, marginalized):
        return get_energy(X, P, hidden, W, V, U, b, b_0, b_L, d, Lambda, b_p,
                          x_marginalized=marginalized,
                          p_marginalized=marginalized)

    energy_h_odd_marginalized = _energy(h_odd_marginalized, odd_case)
    energy_h_even_marginalized = _energy(h_even_marginalized, even_case)
    return 0.5*(energy_h_odd_marginalized + energy_h_even_marginalized)
def bbox_transform_inv(boxes, deltas):
    """Apply box regression deltas to `boxes` and return the predicted boxes.

    :param boxes: matrix whose first four columns are (x1, y1, x2, y2);
        widths and heights are computed inclusively (`x2 - x1 + 1`).
    :param deltas: matrix whose columns repeat the pattern
        (dx, dy, dw, dh); centre offsets are relative to the box size and
        size changes are in log space.
    :return: tensor shaped like `deltas` with columns repeating
        (x1, y1, x2, y2) of the predicted boxes.
    """
    # NOTE(review): for a symbolic `boxes` this comparison is evaluated
    # by Python, not inside the graph -- confirm the guard can trigger.
    if boxes.shape[0] == 0:
        return T.zeros((0, deltas.shape[1]), dtype=deltas.dtype)

    boxes = boxes.astype(deltas.dtype)
    widths = boxes[:, 2] - boxes[:, 0] + 1.0
    heights = boxes[:, 3] - boxes[:, 1] + 1.0
    ctr_x = boxes[:, 0] + 0.5 * widths
    ctr_y = boxes[:, 1] + 0.5 * heights

    dx, dy, dw, dh = [deltas[:, k::4] for k in range(4)]

    pred_ctr_x = dx * widths.dimshuffle(0, 'x') + ctr_x.dimshuffle(0, 'x')
    pred_ctr_y = dy * heights.dimshuffle(0, 'x') + ctr_y.dimshuffle(0, 'x')
    pred_w = T.exp(dw) * widths.dimshuffle(0, 'x')
    pred_h = T.exp(dh) * heights.dimshuffle(0, 'x')

    # Corner values in output column order: x1, y1, x2, y2.
    corners = (
        pred_ctr_x - 0.5 * pred_w,
        pred_ctr_y - 0.5 * pred_h,
        pred_ctr_x + 0.5 * pred_w,
        pred_ctr_y + 0.5 * pred_h,
    )
    pred_boxes = T.zeros_like(deltas, dtype=deltas.dtype)
    for offset, corner in enumerate(corners):
        pred_boxes = T.set_subtensor(pred_boxes[:, offset::4], corner)
    return pred_boxes
def get_output(self, train=False):
    """Zero out at most one input block of the merged input.

    `self.input_shapes` lists the sizes of consecutive blocks along the
    last axis of the input.  One mask is built per block (zeros over
    that block, ones elsewhere), preceded by an all-ones mask.  During
    training one of these masks is drawn uniformly at random with
    `self.trng`; otherwise the all-ones mask is used, so the input passes
    through unchanged.

    The rank used to address the last axis is taken from the input
    itself (`X.ndim`).  The previous code derived it with
    `len(len(...))` inside a bare `except`, which always fell back to
    slicing the first axis and made the 2-D and 3-D branches unreachable.
    NOTE(review): confirm that blocks are meant to lie along the last axis.

    :raises NotImplementedError: if a block mask is needed and the input
        is not 1-D, 2-D or 3-D.
    """
    X = self.get_input(train)

    def _block_index(start, stop):
        # Full slices on every leading axis, [start:stop] on the last.
        if X.ndim not in (1, 2, 3):
            raise NotImplementedError()
        return (slice(None),) * (X.ndim - 1) + (slice(start, stop),)

    masks = [T.ones_like(X)]
    start = 0
    for width in self.input_shapes:
        stop = start + width
        block_mask = T.set_subtensor(
            T.ones_like(X)[_block_index(start, stop)], 0)
        masks.append(block_mask)
        start = stop

    masked = T.stack(masks)
    if train:
        index = self.trng.random_integers(size=(1,), low=0, high=len(masks)-1)[0]
    else:
        index = 0
    return X * masked[index]
def create_adam_updates(updates, params, gparams, gsums, xsums, lr, eps, beta1, beta2):
    """Fill `updates` with Adam update rules for `params`.

    :param updates: mapping that receives the update expressions
        (modified in place; nothing is returned).
    :param params: parameters; entries for which `is_subtensor_op` is
        true are updated only on the indexed part of their origin.
    :param gparams: gradients, aligned with `params`.
    :param gsums: first-moment accumulators, aligned with `params`.
    :param xsums: second-moment accumulators, aligned with `params`.
    :param lr: learning rate.
    :param eps: constant added to the denominator.
    :param beta1: decay rate of the first moment.
    :param beta2: decay rate of the second moment.

    A new shared step counter is created on every call and incremented
    by the returned updates.
    """
    i = theano.shared(np.float64(0.0).astype(theano.config.floatX))
    i_t = i + 1.0
    # Bias-correction terms folded into the step size.
    omb1_t = 1.0 - beta1**i_t
    omb2_t = 1.0 - beta2**i_t
    lr_t = lr * (T.sqrt(omb2_t) / omb1_t)

    def _moments(m_prev, v_prev, grad):
        # New first/second moments and the resulting update direction.
        m_new = beta1*m_prev + (1.0-beta1)*grad
        v_new = beta2*v_prev + (1.0-beta2)*T.sqr(grad)
        return m_new, v_new, m_new / (T.sqrt(v_new) + eps)

    for p, g, m, v in zip(params, gparams, gsums, xsums):
        if not is_subtensor_op(p):
            m_t, v_t, g_t = _moments(m, v, g)
            updates[m] = m_t
            updates[v] = v_t
            updates[p] = p - lr_t*g_t
            continue

        # Sparse case: touch only the indexed part of each tensor.
        origin, indexes = get_subtensor_op_inputs(p)
        m_sub = m[indexes]
        v_sub = v[indexes]
        m_t, v_t, g_t = _moments(m_sub, v_sub, g)
        updates[m] = T.set_subtensor(m_sub, m_t)
        updates[v] = T.set_subtensor(v_sub, v_t)
        updates[origin] = T.inc_subtensor(p, -lr_t*g_t)

    updates[i] = i_t
def update_stack(stack_t, shift_value, reduce_value, mask, model_dim):
    """
    Compute the new value of the given stack.

    Both outcomes are computed for every example (somewhat wastefully)
    and `mask` then selects one of them per example.

    Args:
        stack_t: Current stack value
        shift_value: Batch of values to be shifted
        reduce_value: Batch of reduce results
        mask: Batch of booleans: 1 if reduce, 0 if shift
        model_dim: The dimension of shift_value and reduce_value.

    Returns:
        The next stack: the reduced copy where `mask` is 1, the shifted
        copy where it is 0.
    """
    # Shift: new value in the first `model_dim` entries of the top slot,
    # previous contents moved down by one slot.
    shifted = T.set_subtensor(stack_t[:, 0, :model_dim], shift_value)
    shifted = T.set_subtensor(shifted[:, 1:], stack_t[:, :-1])

    # Reduce: result in the top slot, everything below the two reduced
    # slots moved up by one slot.
    reduced = T.set_subtensor(stack_t[:, 0, :model_dim], reduce_value)
    reduced = T.set_subtensor(reduced[:, 1:-1], stack_t[:, 2:])

    # Broadcast the per-example flag over the remaining two axes.
    reduce_flag = T.cast(mask.dimshuffle(0, "x", "x"),
                         dtype=theano.config.floatX)
    return reduce_flag * reduced + (1. - reduce_flag) * shifted
def sample_update(self, data):
    """Build the shared-variable updates for one importance-sampling step.

    The unnormalised log weight of each proposal is
    `log transition + log observation - log proposal`.  The weights are
    shifted by their maximum before exponentiation, multiplied by
    `self.current_weights` and normalised to sum to one.

    :param data: current observation (a broadcastable leading axis is
        added before it is passed to the observation model).
    :return: OrderedDict updating `self.weights`, `self.particles` and
        `self.time_counter`.
    """
    proposal_samples, log_proposal_probs = self.proposal_distrib

    # Flip to True to wrap every intermediate in a theano Print op.
    printing = False

    def _trace(label, expr):
        if printing:
            return theano.printing.Print(label)(expr)
        return expr

    log_transition_probs = _trace(
        '1 log transition probs update',
        self.true_log_transition_probs(self.current_state, proposal_samples))
    log_observation_probs = _trace(
        '2 log observation probs update',
        self.true_log_observation_probs(proposal_samples, data.dimshuffle('x', 0)))
    log_unnorm_weights = _trace(
        '3 log unnorm weights update',
        log_transition_probs + log_observation_probs - log_proposal_probs)
    log_unnorm_weights_center = _trace(
        '4 log unnorm weights center update',
        log_unnorm_weights - T.max(log_unnorm_weights))
    unnorm_weights = _trace(
        '5 unnorm weights update',
        T.exp(log_unnorm_weights_center) * self.current_weights)
    normalizer = _trace('6 normalizer update', T.sum(unnorm_weights))

    weights = unnorm_weights / normalizer

    updates = OrderedDict()
    updates[self.weights] = T.set_subtensor(self.next_weights, weights)
    updates[self.particles] = T.set_subtensor(self.next_state, proposal_samples)
    updates[self.time_counter] = self.time_counter + 1
    return updates
def create_valid_error(self):
    """Set `self.valid_error` from cumulatively summed targets.

    A prediction tensor shaped like `self.predictions` is filled column
    by column: column 0 is `self.pm25in[:, 1, 0]` plus the first of the
    last `self.steps` columns of `self.pm25target`; each later column is
    the previous column plus the next target column.  The error is the
    mean over axis 0 of the absolute difference between that tensor and
    `self.pm25in[:, -self.steps:, 0]`.
    """
    steps = self.steps
    targets = self.pm25target

    pred = T.zeros_like(self.predictions)
    pred = T.set_subtensor(pred[:, 0],
                           self.pm25in[:, 1, 0] + targets[:, -steps + 0])
    for i in xrange(1, steps):
        running = pred[:, i - 1] + targets[:, -steps + i]
        pred = T.set_subtensor(pred[:, i], running)

    reference = self.pm25in[:, -steps:, 0]
    self.valid_error = T.mean(T.abs_(pred - reference), axis=0)
def get_learn_func(self):
    """
    Returns a theano function that takes an action and a reward, and
    updates the agent based on this experience.

    The compiled function increments the observation count of the action
    and moves its estimated reward towards the observed reward by
    `(reward - estimate) / new_count` (an incremental mean).
    """
    a = T.iscalar()
    r = T.scalar()

    count = self.observation_counts[a] + 1.
    estimate = self.estimated_rewards[a]
    estimate_next = estimate + (r - estimate) / count

    updates = OrderedDict([
        (self.estimated_rewards,
         T.set_subtensor(self.estimated_rewards[a], estimate_next)),
        (self.observation_counts,
         T.set_subtensor(self.observation_counts[a], count)),
    ])
    return function([a, r], updates=updates)
def f_score(self,y,label):
    """Return `(f_score, precision, recall)` for the class `label`.

    An entry counts as positive when it equals `label`.  Predictions come
    from `self.y_pred`; `y` holds the true labels and is cast to int32.
    The indexing assumes 1-D label vectors -- TODO confirm.

    A small constant is added to both denominators to avoid division by
    zero in precision and recall.
    """
    y = T.cast(y, 'int32')

    # After subtracting `label`, positives are exactly the zero entries.
    pred_diff = T.sub(self.y_pred, label)
    true_diff = T.sub(y, label)
    pred_negatives = pred_diff.nonzero()[0]
    true_negatives = true_diff.nonzero()[0]

    # number of predicted positives
    pre_pos_num = pred_diff.shape[0] - pred_negatives.shape[0]
    # number of actual positives
    real_pos = true_diff.shape[0] - true_negatives.shape[0]

    # Mark negatives with different codes (1 vs 2) so that the two
    # vectors agree only where both are positive (both 0).
    pred_marked = T.set_subtensor(pred_diff[pred_negatives], 1)
    true_marked = T.set_subtensor(true_diff[true_negatives], 2)
    mismatches = T.neq(pred_marked, true_marked)
    true_pos = self.y_pred.shape[0] - mismatches.sum()

    precision = true_pos / (T.cast(pre_pos_num, 'float32') + 0.0000001)
    recall = true_pos / (T.cast(real_pos, 'float32') + 0.0000001)
    score = (2 * precision * recall) / (precision + recall)
    return score, precision, recall
def pass_edges(input_idx_t, edge_t, edge_mask_t, counter_t, h_tm1, c_tm1, x):
    """One scan step: node LSTM followed by an edge LSTM, written to column `counter_t`.

    Relies on `self` from the enclosing scope for all weights.

    :param input_idx_t: row of `x` used as the input vector for this step.
    :param edge_t: edge feature vector fed to the second (edge) LSTM node.
    :param edge_mask_t: mask over the columns of `h_tm1` / `c_tm1`; an
        all-zero mask means the input vector is used as is, otherwise the
        input vector is zeroed.
    :param counter_t: column of the state matrices that receives the result.
    :param h_tm1: previous hidden-state matrix.
    :param c_tm1: previous cell-state matrix.
    :param x: matrix of input vectors.
    :return: `(h_t, c_t)`, the state matrices with column `counter_t` replaced.
    """
    # Select the source input vector; it is zeroed unless the mask is empty.
    x_t = x[input_idx_t, :]
    x_t = T.switch(T.eq(T.sum(edge_mask_t), 0), x_t, x_t*0)

    masked_h = edge_mask_t * h_tm1

    def _node_gate(activation, W_x, W_h, bias):
        # Shared form of the four node-level pre-activations.
        recurrent = T.sum(T.dot(W_h.T, masked_h).T, axis=0)
        return activation(T.dot(x_t, W_x) + recurrent + bias)

    i_t = _node_gate(T.nnet.sigmoid, self.W_x_i, self.W_h_i, self.b_h_i)
    f_t = _node_gate(T.nnet.sigmoid, self.W_x_f, self.W_h_f, self.b_h_f)
    o_t = _node_gate(T.nnet.sigmoid, self.W_x_o, self.W_h_o, self.b_h_o)
    u_t = _node_gate(T.tanh, self.W_x_u, self.W_h_u, self.b_h_u)

    c_temp = i_t * u_t + f_t * T.sum((edge_mask_t * c_tm1).T, axis=0)
    h_temp = o_t * T.tanh(c_temp)

    def _edge_gate(activation, W_e, W_eh, bias):
        # The node output is passed through a second LSTM node together
        # with the edge vector.
        return activation(T.dot(edge_t, W_e) + T.dot(h_temp, W_eh) + bias)

    ie_t = _edge_gate(T.nnet.sigmoid, self.W_e_i, self.W_eh_i, self.b_e_i)
    fe_t = _edge_gate(T.nnet.sigmoid, self.W_e_f, self.W_eh_f, self.b_e_f)
    oe_t = _edge_gate(T.nnet.sigmoid, self.W_e_o, self.W_eh_o, self.b_e_o)
    ue_t = _edge_gate(T.tanh, self.W_e_u, self.W_eh_u, self.b_e_u)

    ce_temp = ie_t * ue_t + fe_t * c_temp
    he_temp = oe_t * T.tanh(ce_temp)

    h_t = T.set_subtensor(h_tm1[:, counter_t], he_temp)
    c_t = T.set_subtensor(c_tm1[:, counter_t], ce_temp)
    return h_t, c_t
def negative_log_likelihood(self, label_sym):
    """Return the regularised multiclass margin objective of the minibatch.

    Despite its name, this method does not compute a log-likelihood.  It
    builds a 0/1 loss matrix (0 at the correct label of every example, 1
    elsewhere), takes the loss-augmented best score per example,
    `max_k(loss[n, k] + compatibility[n, k])`, and averages its
    difference to the score of the correct label.

    :type label_sym: theano.tensor.TensorType
    :param label_sym: vector giving the correct label of each example.
    :return: `self.l2norm + self.C * margin`.

    Side effect: stores the per-example loss-augmented best score in
    `self.score`.

    Note: the mean is used instead of the sum so that the learning rate
    is less dependent on the batch size.
    """
    example_idx = T.arange(label_sym.shape[0])

    # loss, matrix \in R[#data,#classes]
    ones = theano.shared(value=numpy.ones((self.n_data, self.n_classes),
                                          dtype=theano.config.floatX),
                         name='cost', borrow=True)
    # set_subtensor returns a new variable; keeping the result is what
    # actually zeroes the loss of the correct labels.
    loss = T.set_subtensor(ones[example_idx, label_sym], 0)

    # score, matrix \in R[#data,1]
    self.score = T.max(loss + self.compatibility, axis=1)
    margin = T.mean(self.score - self.compatibility[example_idx, label_sym])
    return self.l2norm + self.C * margin
# Both approaches are implemented here: with and without GaussianRandomWalk.
# How to use subtensors:
# http://deeplearning.net/software/theano/library/tensor/basic.html

# Approach that does not use GaussianRandomWalk
with basic_model:
    # Priors
    s_mu = HalfNormal('s_mu', sd=100)    # error between states at adjacent time steps
    s_Y = HalfNormal('s_Y', sd=100)      # error between state and observation at each time step
    mu_0 = Normal('mu_0', mu=0, sd=100)  # initial state

    # Error terms
    e_mu = Normal('e_mu', mu=0, sd=s_mu, shape=n_times-1)

    # The state at each time step is the previous state plus its error term.
    mu = tt.zeros((n_times))
    mu = tt.set_subtensor(mu[0], mu_0)
    for i in range(n_times-1):
        mu = tt.set_subtensor(mu[i+1], mu[i]+e_mu[i])

    # Likelihood
    Y_obs = Normal('Y_obs', mu=mu, sd=s_Y, observed=Y)

    # Sampling
    trace = sample(1000)
    summary(trace)

# Approach that uses GaussianRandomWalk
# NOTE(review): only the priors of this variant are visible here.
with basic_model:
    # Priors
    s_mu = HalfNormal('s_mu', sd=100)    # error between states at adjacent time steps
    s_Y = HalfNormal('s_Y', sd=100)      # error between state and observation at each time step
def isoneutral_diffusion_pre(maskT, maskU, maskV, maskW, dxt, dxu, dyt, dyu, dzt, dzw, cost, cosu, salt, temp, zt,
                             K_iso, K_11, K_22, K_33, Ai_ez, Ai_nz, Ai_bx, Ai_by):
    """
    Isopycnal diffusion for tracer following functional formulation by
    Griffies et al. Code adopted from MOM2.1.

    Builds symbolic expressions for the diagonal mixing-tensor components
    (K_11, K_22, K_33) and the tapered slope arrays (Ai_ez, Ai_nz, Ai_bx,
    Ai_by) from the tracer fields ``salt`` and ``temp``.

    The inputs are not modified: every ``T.set_subtensor`` /
    ``T.inc_subtensor`` result is re-bound to a local name, and the final
    expressions are returned as a 7-tuple
    ``(K_11, K_22, K_33, Ai_ez, Ai_nz, Ai_bx, Ai_by)``.

    ``get_drhodT`` and ``get_drhodS`` are helpers defined outside this
    function.

    NOTE(review): grid metrics (dxt, dxu, dyt, dyu, dzt, dzw, cost, cosu)
    are sliced here as if they broadcast against the 3-D fields; their
    exact shapes / broadcast patterns are not visible in this function --
    confirm against the caller.
    """
    epsln = 1e-20       # small offset keeping the slope denominators non-zero
    iso_slopec = 1e-3   # centre of the tanh taper in dm_taper
    iso_dslope = 1e-3   # width of the tanh taper in dm_taper
    K_iso_steep = 50.   # lower bound applied to the tapered diffusivity (via T.maximum)
    tau = 0             # index used on the last axis of salt/temp

    dTdx = T.zeros_like(K_11)
    dSdx = T.zeros_like(K_11)
    dTdy = T.zeros_like(K_11)
    dSdy = T.zeros_like(K_11)
    dTdz = T.zeros_like(K_11)
    dSdz = T.zeros_like(K_11)

    """ drho_dt and drho_ds at centers of T cells """
    drdT = maskT * get_drhodT(salt[:, :, :, tau], temp[:, :, :, tau], abs(zt))
    drdS = maskT * get_drhodS(salt[:, :, :, tau], temp[:, :, :, tau], abs(zt))

    """ gradients at top face of T cells """
    # Forward differences along the third axis; the last level stays zero.
    dTdz = T.set_subtensor(dTdz[:, :, :-1], maskW[:, :, :-1] * (temp[:, :, 1:, tau] - temp[:, :, :-1, tau]) / \
        dzw[:, :, :-1]
    )
    dSdz = T.set_subtensor(dSdz[:, :, :-1], maskW[:, :, :-1] * (salt[:, :, 1:, tau] - salt[:, :, :-1, tau]) / \
        dzw[:, :, :-1]
    )

    """ gradients at eastern face of T cells """
    # Forward differences along the first axis; the last row stays zero.
    dTdx = T.set_subtensor(
        dTdx[:-1, :, :],
        maskU[:-1, :, :] * (temp[1:, :, :, tau] - temp[:-1, :, :, tau]) / (dxu[:-1, :, :] * cost[:, :, :]))
    dSdx = T.set_subtensor(
        dSdx[:-1, :, :],
        maskU[:-1, :, :] * (salt[1:, :, :, tau] - salt[:-1, :, :, tau]) / (dxu[:-1, :, :] * cost[:, :, :]))

    """ gradients at northern face of T cells """
    # Forward differences along the second axis; the last column stays zero.
    dTdy = T.set_subtensor(dTdy[:, :-1, :], maskV[:, :-1, :] * (temp[:, 1:, :, tau] - temp[:, :-1, :, tau]) \
        / dyu[:, :-1, :]
    )
    dSdy = T.set_subtensor(dSdy[:, :-1, :], maskV[:, :-1, :] * (salt[:, 1:, :, tau] - salt[:, :-1, :, tau]) \
        / dyu[:, :-1, :]
    )

    def dm_taper(sx):
        """ tapering function for isopycnal slopes """
        return 0.5 * (1. + T.tanh((-abs(sx) + iso_slopec) / iso_dslope))

    """ Compute Ai_ez and K11 on center of east face of T cell.
    """
    # diffloc: K_iso averaged over the four neighbouring points (two along
    # axis 0, two along axis 2); at vertical index 0 only the two horizontal
    # neighbours are averaged.
    diffloc = T.zeros_like(K_11)
    diffloc = T.set_subtensor(
        diffloc[1:-2, 2:-2, 1:],
        0.25 * (K_iso[1:-2, 2:-2, 1:] + K_iso[1:-2, 2:-2, :-1] + K_iso[2:-1, 2:-2, 1:] + K_iso[2:-1, 2:-2, :-1]))
    diffloc = T.set_subtensor(
        diffloc[1:-2, 2:-2, 0], 0.5 * (K_iso[1:-2, 2:-2, 0] + K_iso[2:-1, 2:-2, 0]))

    sumz = T.zeros_like(K_11)[1:-2, 2:-2]
    for kr in range(2):
        # kr == 0 -> ki = 1 and ":-1 + kr or None" is ":-1"
        # kr == 1 -> ki = 0 and ":-1 + kr or None" is ":None" (whole axis),
        # because -1 + 1 == 0 is falsy and falls through to None.
        ki = 0 if kr == 1 else 1
        for ip in range(2):
            # ip selects the slice 1:-2 (ip == 0) or 2:-1 (ip == 1) on axis 0.
            drodxe = drdT[1 + ip:-2 + ip, 2:-2, ki:] * dTdx[1:-2, 2:-2, ki:] \
                + drdS[1 + ip:-2 + ip, 2:-2, ki:] * dSdx[1:-2, 2:-2, ki:]
            drodze = drdT[1 + ip:-2 + ip, 2:-2, ki:] * dTdz[1 + ip:-2 + ip, 2:-2, :-1 + kr or None] \
                + drdS[1 + ip:-2 + ip, 2:-2, ki:] * \
                dSdz[1 + ip:-2 + ip, 2:-2, :-1 + kr or None]
            # Slope; the denominator is always <= -epsln, so never zero.
            sxe = -drodxe / (T.minimum(0., drodze) - epsln)
            taper = dm_taper(sxe)
            sumz = T.inc_subtensor(
                sumz[:, :, ki:],
                dzw[:, :, :-1 + kr or None] * maskU[1:-2, 2:-2, ki:] * T.maximum(K_iso_steep, diffloc[1:-2, 2:-2, ki:] * taper))
            Ai_ez = T.set_subtensor(Ai_ez[1:-2, 2:-2, ki:, ip, kr], taper * sxe * maskU[1:-2, 2:-2, ki:])
    K_11 = T.set_subtensor(K_11[1:-2, 2:-2, :], sumz / (4. * dzt[:, :, :]))

    """ Compute Ai_nz and K_22 on center of north face of T cell.
    """
    # Same procedure as for the east face, with the roles of axes 0 and 1 swapped.
    diffloc = T.set_subtensor(diffloc[...], 0)
    diffloc = T.set_subtensor(
        diffloc[2:-2, 1:-2, 1:],
        0.25 * (K_iso[2:-2, 1:-2, 1:] + K_iso[2:-2, 1:-2, :-1] + K_iso[2:-2, 2:-1, 1:] + K_iso[2:-2, 2:-1, :-1]))
    diffloc = T.set_subtensor(
        diffloc[2:-2, 1:-2, 0], 0.5 * (K_iso[2:-2, 1:-2, 0] + K_iso[2:-2, 2:-1, 0]))

    sumz = T.zeros_like(K_11)[2:-2, 1:-2]
    for kr in range(2):
        ki = 0 if kr == 1 else 1
        for jp in range(2):
            drodyn = drdT[2:-2, 1 + jp:-2 + jp, ki:] * dTdy[2:-2, 1:-2, ki:] + \
                drdS[2:-2, 1 + jp:-2 + jp, ki:] * dSdy[2:-2, 1:-2, ki:]
            drodzn = drdT[2:-2, 1 + jp:-2 + jp, ki:] * dTdz[2:-2, 1 + jp:-2 + jp, :-1 + kr or None] \
                + drdS[2:-2, 1 + jp:-2 + jp, ki:] * \
                dSdz[2:-2, 1 + jp:-2 + jp, :-1 + kr or None]
            syn = -drodyn / (T.minimum(0., drodzn) - epsln)
            taper = dm_taper(syn)
            sumz = T.inc_subtensor(
                sumz[:, :, ki:],
                dzw[:, :, :-1 + kr or None] * maskV[2:-2, 1:-2, ki:] * T.maximum(K_iso_steep, diffloc[2:-2, 1:-2, ki:] * taper))
            Ai_nz = T.set_subtensor(Ai_nz[2:-2, 1:-2, ki:, jp, kr], taper * syn * maskV[2:-2, 1:-2, ki:])
    K_22 = T.set_subtensor(K_22[2:-2, 1:-2, :], sumz / (4. * dzt[:, :, :]))

    """ compute Ai_bx, Ai_by and K33 on top face of T cell.
    """
    sumx = T.zeros_like(K_11)[2:-2, 2:-2, :-1]
    sumy = T.zeros_like(K_11)[2:-2, 2:-2, :-1]

    for kr in range(2):
        # "kr:-1 + kr or None" is "0:-1" for kr == 0 and "1:None" for kr == 1.
        drodzb = drdT[2:-2, 2:-2, kr:-1 + kr or None] * dTdz[2:-2, 2:-2, :-1] \
            + drdS[2:-2, 2:-2, kr:-1 + kr or None] * dSdz[2:-2, 2:-2, :-1]

        # eastward slopes at the top of T cells
        for ip in range(2):
            drodxb = drdT[2:-2, 2:-2, kr:-1 + kr or None] * dTdx[1 + ip:-3 + ip, 2:-2, kr:-1 + kr or None] \
                + drdS[2:-2, 2:-2, kr:-1 + kr or None] * \
                dSdx[1 + ip:-3 + ip, 2:-2, kr:-1 + kr or None]
            sxb = -drodxb / (T.minimum(0., drodzb) - epsln)
            taper = dm_taper(sxb)
            # "+=" on a symbolic variable re-binds sumx to a new expression.
            sumx += dxu[1 + ip:-3 + ip, :, :] * \
                K_iso[2:-2, 2:-2, :-1] * taper * \
                sxb**2 * maskW[2:-2, 2:-2, :-1]
            Ai_bx = T.set_subtensor(Ai_bx[2:-2, 2:-2, :-1, ip, kr], taper * sxb * maskW[2:-2, 2:-2, :-1])

        # northward slopes at the top of T cells
        for jp in range(2):
            # NOTE(review): cosu and dyu are indexed with two subscripts here
            # but dyu is indexed with three above -- confirm intended shapes.
            facty = cosu[:, 1 + jp:-3 + jp] * dyu[:, 1 + jp:-3 + jp]
            drodyb = drdT[2:-2, 2:-2, kr:-1 + kr or None] * dTdy[2:-2, 1 + jp:-3 + jp, kr:-1 + kr or None] \
                + drdS[2:-2, 2:-2, kr:-1 + kr or None] * \
                dSdy[2:-2, 1 + jp:-3 + jp, kr:-1 + kr or None]
            syb = -drodyb / (T.minimum(0., drodzb) - epsln)
            taper = dm_taper(syb)
            sumy += facty * K_iso[2:-2, 2:-2, :-1] \
                * taper * syb**2 * maskW[2:-2, 2:-2, :-1]
            Ai_by = T.set_subtensor(Ai_by[2:-2, 2:-2, :-1, jp, kr], taper * syb * maskW[2:-2, 2:-2, :-1])

    K_33 = T.set_subtensor(
        K_33[2:-2, 2:-2, :-1],
        sumx / (4 * dxt[2:-2, :, :]) + sumy / (4 * dyt[:, 2:-2, :] * cost[:, 2:-2, :]))
    # The last vertical level of K_33 is explicitly zeroed.
    K_33 = T.set_subtensor(K_33[2:-2, 2:-2, -1], 0.)

    return K_11, K_22, K_33, Ai_ez, Ai_nz, Ai_bx, Ai_by
def build_model(tparams, options):
    """Build the symbolic training graph of the encoder-decoder model.

    Parameters
    ----------
    tparams : mapping
        Shared parameters; this function reads ``tparams['Wemb']`` and
        ``tparams['Wemb_dec']`` directly and passes the whole mapping on to
        the layers obtained through ``get_layer``.
    options : mapping
        Reads ``decoder_type``, ``dim_word_src``, ``dim_word``,
        ``use_dropout`` and ``n_words``; also forwarded to every layer.

    Returns
    -------
    tuple
        ``(trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost)`` where
        ``x``/``y`` are int64 index matrices, ``x_mask``/``y_mask`` are
        float32 matrices, ``opt_ret`` is an ``OrderedDict`` of auxiliary
        decoder outputs and ``cost`` is the masked negative log-probability
        summed over axis 0 (one value per column of ``y``).

    ``get_layer``, ``concatenate`` and ``dropout_layer`` are defined outside
    this function.
    """
    opt_ret = OrderedDict()
    decoder_type = options['decoder_type']

    # Random stream seeded from a freshly drawn random seed.
    trng = RandomStreams(numpy.random.RandomState(numpy.random.randint(1024)).randint(numpy.iinfo(numpy.int32).max))
    use_noise = theano.shared(numpy.float32(0.))

    # description string: #words x #samples
    x = tensor.matrix('x', dtype='int64')
    x_mask = tensor.matrix('x_mask', dtype='float32')
    y = tensor.matrix('y', dtype='int64')
    y_mask = tensor.matrix('y_mask', dtype='float32')

    # Test values attached to the symbolic inputs.
    x.tag.test_value = numpy.zeros((5, 63), dtype='int64')
    x_mask.tag.test_value = numpy.ones((5, 63), dtype='float32')
    y.tag.test_value = numpy.zeros((7, 63), dtype='int64')
    y_mask.tag.test_value = numpy.ones((7, 63), dtype='float32')

    # Source sequence and mask reversed along axis 0, for the backward RNN.
    xr = x[::-1]
    xr_mask = x_mask[::-1]

    n_samples = x.shape[1]
    n_timesteps = x.shape[0]
    n_timesteps_trg = y.shape[0]

    # word embedding for forward RNN (source)
    emb = tparams['Wemb'][x.flatten()]
    emb = emb.reshape([n_timesteps, n_samples, options['dim_word_src']])

    # word embedding for backward RNN (source)
    embr = tparams['Wemb'][xr.flatten()]
    embr = embr.reshape([n_timesteps, n_samples, options['dim_word_src']])

    # pass through gru layer, recurrence here
    proj = get_layer('gru')[1](tparams, emb, options, prefix='encoder', mask=x_mask)
    projr = get_layer('gru')[1](tparams, embr, options, prefix='encoderr', mask=xr_mask)

    # context: forward states joined with the re-reversed backward states
    # along the last axis
    ctx = concatenate([proj, projr[::-1]], axis=proj.ndim-1)

    # context mean: mask-weighted average over axis 0
    ctx_mean = (ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None]

    # initial decoder state
    init_state_char = get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_init_state_char', activ='tanh')
    init_state_word = get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_init_state_word', activ='tanh')
    init_bound_char = tensor.zeros_like(init_state_char)
    init_bound_word = tensor.zeros_like(init_state_word)

    # word embedding and shifting for targets
    yemb = tparams['Wemb_dec'][y.flatten()]
    yemb = yemb.reshape([n_timesteps_trg, n_samples, options['dim_word']])
    # Shift by one step along axis 0: step 0 becomes all zeros and step t
    # receives the embedding of step t-1.
    yemb_shited = tensor.zeros_like(yemb)
    yemb_shited = tensor.set_subtensor(yemb_shited[1:], yemb[:-1])
    yemb = yemb_shited

    #For the planning
    [char_h, word_h, bound_c, bound_w, ctxs, alphas, probs, samples, commit_origin, probs_origin, action_plans, temp], updates = \
        get_layer(decoder_type)[1](tparams, yemb, options,
                                   prefix='decoder',
                                   mask=y_mask,
                                   context=ctx,
                                   context_mask=x_mask,
                                   one_step=False,
                                   init_state_char=init_state_char,
                                   init_state_word=init_state_word,
                                   init_bound_char=init_bound_char,
                                   init_bound_word=init_bound_word)

    opt_ret['bound_c'] = bound_c
    opt_ret['bound_w'] = bound_w
    opt_ret['dec_alphas'] = alphas

    #Francis
    #Our probabilities correspond to the non-shift version.
    # Note the naming: 'dec_samples' stores commit_origin while
    # 'dec_commits' stores samples.
    opt_ret['dec_probs'] = probs_origin
    opt_ret['dec_samples'] = commit_origin
    opt_ret['dec_commits'] = samples
    opt_ret['dec_updates'] = updates
    opt_ret['dec_action_plans'] = action_plans
    opt_ret['dec_temperature'] = temp.mean()

    # compute word probabilities
    logit_rnn = get_layer('fff')[1](tparams, char_h, word_h, options, prefix='ff_logit_rnn', activ='linear')
    logit_prev = get_layer('ff')[1](tparams, yemb, options, prefix='ff_logit_prev', activ='linear')
    logit_ctx = get_layer('ff')[1](tparams, ctxs, options, prefix='ff_logit_ctx', activ='linear')
    logit = tensor.tanh(logit_rnn + logit_prev + logit_ctx)

    if options['use_dropout']:
        print 'Using dropout'
        logit = dropout_layer(logit, use_noise, trng)

    logit = get_layer('ff')[1](tparams, logit, options, prefix='ff_logit', activ='linear')
    logit_shp = logit.shape
    # Collapse the first two axes so softmax is applied row-wise.
    # The decoder's own `probs` output is overwritten here.
    probs = tensor.nnet.softmax(logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]]))

    # cost
    y_flat = y.flatten()
    # Flat index of the target word within the flattened probability matrix
    # (row offset = position * n_words).
    y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words'] + y_flat
    cost = -tensor.log(probs.flatten()[y_flat_idx])
    cost = cost.reshape([y.shape[0], y.shape[1]])
    cost = (cost * y_mask).sum(0)

    return trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost
def from_onehot_sym(x_var):
    """Convert a symbolic batch of one-hot rows into a vector of indices.

    :param x_var: symbolic tensor whose first axis indexes examples and
                  whose second axis holds the one-hot encoding
    :return: symbolic vector of length ``x_var.shape[0]`` with the dtype of
             ``x_var``; entry ``n`` holds the column index of the non-zero
             element found in row ``n`` (rows without any non-zero element
             stay 0)
    """
    ret = TT.zeros((x_var.shape[0], ), x_var.dtype)
    # First two coordinate vectors of the non-zero entries: row and column.
    nonzero_n, nonzero_a = TT.nonzero(x_var)[:2]
    # Cast the column indices to the destination dtype.  A hard-coded
    # 'uint8' cast would silently wrap indices >= 256 even when the output
    # dtype could represent them.
    ret = TT.set_subtensor(ret[nonzero_n], nonzero_a.astype(x_var.dtype))
    return ret
pool_sizes = [] for filter_h in filter_hs: filter_shapes.append((feature_maps, 1, filter_h, filter_w)) pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1)) #define model architecture index = T.lscalar() x = T.matrix('x') y = T.ivector('y') Words = theano.shared(value=U, name="Words") zero_vec_tensor = T.vector() zero_vec = np.zeros(img_w) set_zero = theano.function([zero_vec_tensor], updates=[ (Words, T.set_subtensor(Words[0, :], zero_vec_tensor)) ], allow_input_downcast=True) layer0_input = Words[T.cast(x.flatten(), dtype="int32")].reshape( (x.shape[0], 1, x.shape[1], Words.shape[1])) conv_layers = [] layer1_inputs = [] for i in xrange(len(filter_hs)): filter_shape = filter_shapes[i] pool_size = pool_sizes[i] conv_layer = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, img_h, img_w), filter_shape=filter_shape, poolsize=pool_size,
def __init__(self,
             data,
             U,
             img_h=160,
             img_w=300,
             hidden_size=100,
             batch_size=50,
             lr=0.001,
             lr_decay=0.95,
             sqr_norm_lim=9,
             fine_tune_W=True,
             fine_tune_M=False,
             optimizer='adam',
             filter_sizes=[3, 4, 5],
             num_filters=100,
             conv_attn=False,
             encoder='rnn',
             elemwise_sum=True,
             corr_penalty=0.0,
             xcov_penalty=0.0,
             n_recurrent_layers=1,
             is_bidirectional=False):
    """Build the symbolic context/response scoring model.

    Two index matrices ``c`` and ``r`` are embedded, encoded, and scored
    as ``sigmoid(batched_dot(e_context, dot(e_response, M.T)))``; the cost
    is the mean binary cross-entropy against ``y`` plus optional penalties.

    Parameters (as used in this method)
    -----------------------------------
    data : stored on ``self.data`` only.
    U : initial value of the shared ``embeddings`` matrix.
    img_h, img_w : second and third dimensions of the Lasagne input layer
        ``(batch_size, img_h, img_w)``; ``img_w`` is also the length of
        ``self.zero_vec``.
    hidden_size : number of units of the recurrent / dense layers and size
        of ``self.M`` (doubled when CNN and RNN outputs are concatenated).
    batch_size : fixed batch dimension of all layers and shared buffers.
    lr, lr_decay, sqr_norm_lim, fine_tune_W, fine_tune_M, optimizer :
        stored on ``self`` only; not otherwise used here.
    filter_sizes : heights of the convolution filters.  The default list
        is only iterated, never mutated, in this method.
    num_filters : number of filters per convolution layer.
    conv_attn : if true, a convolution + max-pool + dense stack is applied
        on top of the recurrent outputs.
    encoder : string searched for the substrings ``'cnn'``, ``'rnn'`` and
        ``'lstm'`` to decide which encoders are built.
    elemwise_sum : when both CNN and recurrent encoders are active, sum
        their outputs (True) or concatenate them along axis 1 (False).
    corr_penalty, xcov_penalty : weights of the optional penalty terms
        (only applied when both CNN and recurrent encoders are active).
    n_recurrent_layers : number of stacked recurrent layers.
    is_bidirectional : build forward and backward recurrent stacks and
        concatenate them.

    Ends by calling ``self.update_params()``, which is defined elsewhere.
    """
    self.data = data
    self.img_h = img_h
    self.batch_size = batch_size
    self.fine_tune_W = fine_tune_W
    self.fine_tune_M = fine_tune_M
    self.lr = lr
    self.lr_decay = lr_decay
    self.optimizer = optimizer
    self.sqr_norm_lim = sqr_norm_lim
    self.conv_attn = conv_attn

    # Symbolic inputs.  `index` is not referenced again within this method.
    index = T.iscalar()
    c = T.imatrix('c')
    r = T.imatrix('r')
    y = T.ivector('y')
    c_mask = T.fmatrix('c_mask')
    r_mask = T.fmatrix('r_mask')
    c_seqlen = T.ivector('c_seqlen')
    r_seqlen = T.ivector('r_seqlen')
    embeddings = theano.shared(U, name='embeddings', borrow=True)
    zero_vec_tensor = T.fvector()
    self.zero_vec = np.zeros(img_w, dtype=theano.config.floatX)
    # Compiled function that overwrites row 0 of the embedding matrix with
    # the vector it is given.
    self.set_zero = theano.function([zero_vec_tensor],
                                    updates=[(embeddings, T.set_subtensor(
                                        embeddings[0, :], zero_vec_tensor))])
    # M starts as the identity; it is twice as large when CNN and recurrent
    # encodings are concatenated rather than summed.
    if encoder.find('cnn') > -1 and (
            encoder.find('rnn') > -1 or
            encoder.find('lstm') > -1) and not elemwise_sum:
        self.M = theano.shared(np.eye(2 * hidden_size).astype(
            theano.config.floatX),
                               borrow=True)
    else:
        self.M = theano.shared(np.eye(hidden_size).astype(
            theano.config.floatX),
                               borrow=True)

    # Embedding lookup: (rows, cols) indices -> (rows, cols, embedding dim).
    c_input = embeddings[c.flatten()].reshape(
        (c.shape[0], c.shape[1], embeddings.shape[1]))
    r_input = embeddings[r.flatten()].reshape(
        (r.shape[0], r.shape[1], embeddings.shape[1]))

    l_in = lasagne.layers.InputLayer(shape=(batch_size, img_h, img_w))

    if encoder.find('cnn') > -1:
        # One conv + max-pool branch per filter size; each filter spans the
        # full width img_w and the pool spans the full remaining height.
        l_conv_in = lasagne.layers.ReshapeLayer(l_in,
                                                shape=(batch_size, 1, img_h,
                                                       img_w))
        conv_layers = []
        for filter_size in filter_sizes:
            conv_layer = lasagne.layers.Conv2DLayer(
                l_conv_in,
                num_filters=num_filters,
                filter_size=(filter_size, img_w),
                stride=(1, 1),
                nonlinearity=lasagne.nonlinearities.rectify,
                border_mode='valid')
            pool_layer = lasagne.layers.MaxPool2DLayer(
                conv_layer, pool_size=(img_h - filter_size + 1, 1))
            conv_layers.append(pool_layer)

        l_conv = lasagne.layers.ConcatLayer(conv_layers)
        l_conv = lasagne.layers.DenseLayer(
            l_conv,
            num_units=hidden_size,
            nonlinearity=lasagne.nonlinearities.tanh)

    # Recurrent encoder.  It is built for every value of `encoder`; only the
    # layer type (LSTM vs. plain recurrent) depends on the string.
    if is_bidirectional:
        if encoder.find('lstm') > -1:
            prev_fwd, prev_bck = l_in, l_in
            for _ in xrange(n_recurrent_layers):
                l_fwd = lasagne.layers.LSTMLayer(prev_fwd,
                                                 hidden_size,
                                                 backwards=False,
                                                 learn_init=True,
                                                 peepholes=True)

                l_bck = lasagne.layers.LSTMLayer(prev_bck,
                                                 hidden_size,
                                                 backwards=True,
                                                 learn_init=True,
                                                 peepholes=True)

                prev_fwd, prev_bck = l_fwd, l_bck
        else:
            prev_fwd, prev_bck = l_in, l_in
            for _ in xrange(n_recurrent_layers):
                l_fwd = lasagne.layers.RecurrentLayer(
                    prev_fwd,
                    hidden_size,
                    nonlinearity=lasagne.nonlinearities.tanh,
                    W_hid_to_hid=lasagne.init.Orthogonal(),
                    W_in_to_hid=lasagne.init.Orthogonal(),
                    backwards=False,
                    learn_init=True)

                l_bck = lasagne.layers.RecurrentLayer(
                    prev_bck,
                    hidden_size,
                    nonlinearity=lasagne.nonlinearities.tanh,
                    W_hid_to_hid=lasagne.init.Orthogonal(),
                    W_in_to_hid=lasagne.init.Orthogonal(),
                    backwards=True,
                    learn_init=True)

                prev_fwd, prev_bck = l_fwd, l_bck

        l_recurrent = lasagne.layers.ConcatLayer([l_fwd, l_bck])
    else:
        prev_fwd = l_in
        if encoder.find('lstm') > -1:
            for _ in xrange(n_recurrent_layers):
                l_recurrent = lasagne.layers.LSTMLayer(prev_fwd,
                                                       hidden_size,
                                                       backwards=False,
                                                       learn_init=True,
                                                       peepholes=True)
                prev_fwd = l_recurrent
        else:
            for _ in xrange(n_recurrent_layers):
                l_recurrent = lasagne.layers.RecurrentLayer(
                    prev_fwd,
                    hidden_size,
                    nonlinearity=lasagne.nonlinearities.tanh,
                    W_hid_to_hid=lasagne.init.Orthogonal(),
                    W_in_to_hid=lasagne.init.Orthogonal(),
                    backwards=False,
                    learn_init=True)
                prev_fwd = l_recurrent

    recurrent_size = hidden_size * 2 if is_bidirectional else hidden_size
    if conv_attn:
        # Second conv stack with its own input layer sized for the recurrent
        # outputs; the filters span the full recurrent_size.
        l_rconv_in = lasagne.layers.InputLayer(shape=(batch_size, img_h,
                                                      recurrent_size))
        l_rconv_in = lasagne.layers.ReshapeLayer(l_rconv_in,
                                                 shape=(batch_size, 1, img_h,
                                                        recurrent_size))
        conv_layers = []
        for filter_size in filter_sizes:
            conv_layer = lasagne.layers.Conv2DLayer(
                l_rconv_in,
                num_filters=num_filters,
                filter_size=(filter_size, recurrent_size),
                stride=(1, 1),
                nonlinearity=lasagne.nonlinearities.rectify,
                border_mode='valid')
            pool_layer = lasagne.layers.MaxPool2DLayer(
                conv_layer, pool_size=(img_h - filter_size + 1, 1))
            conv_layers.append(pool_layer)

        l_hidden1 = lasagne.layers.ConcatLayer(conv_layers)
        l_hidden2 = lasagne.layers.DenseLayer(
            l_hidden1,
            num_units=hidden_size,
            nonlinearity=lasagne.nonlinearities.tanh)
        l_out = l_hidden2
    else:
        l_out = l_recurrent

    if conv_attn:
        e_context = l_recurrent.get_output(c_input,
                                           mask=c_mask,
                                           deterministic=False)
        e_response = l_recurrent.get_output(r_input,
                                            mask=r_mask,
                                            deterministic=False)

        def step_fn(row_t, mask_t):
            # Multiply one slice of the encoder output by its mask slice,
            # reshaped to a column so it broadcasts over the last axis.
            return row_t * mask_t.reshape((-1, 1))

        if is_bidirectional:
            # The mask is duplicated along axis 1 before being paired with
            # the concatenated forward/backward output.
            e_context, _ = theano.scan(step_fn,
                                       outputs_info=None,
                                       sequences=[
                                           e_context, T.concatenate(
                                               [c_mask, c_mask],
                                               axis=1)
                                       ])
            e_response, _ = theano.scan(step_fn,
                                        outputs_info=None,
                                        sequences=[
                                            e_response, T.concatenate(
                                                [r_mask, r_mask],
                                                axis=1)
                                        ])
        else:
            e_context, _ = theano.scan(step_fn,
                                       outputs_info=None,
                                       sequences=[e_context, c_mask])
            e_response, _ = theano.scan(step_fn,
                                        outputs_info=None,
                                        sequences=[e_response, r_mask])

        e_context = l_out.get_output(e_context,
                                     mask=c_mask,
                                     deterministic=False)
        e_response = l_out.get_output(e_response,
                                      mask=r_mask,
                                      deterministic=False)
    else:
        # Pick, for every example, the encoder output at the position given
        # by c_seqlen / r_seqlen.
        e_context = l_out.get_output(
            c_input, mask=c_mask,
            deterministic=False)[T.arange(batch_size), c_seqlen].reshape(
                (c.shape[0], hidden_size))
        e_response = l_out.get_output(
            r_input, mask=r_mask,
            deterministic=False)[T.arange(batch_size), r_seqlen].reshape(
                (r.shape[0], hidden_size))

    if encoder.find('cnn') > -1:
        e_conv_context = l_conv.get_output(c_input, deterministic=False)
        e_conv_response = l_conv.get_output(r_input, deterministic=False)
        if encoder.find('rnn') > -1 or encoder.find('lstm') > -1:
            # Combine recurrent and convolutional encodings.
            if elemwise_sum:
                e_context = e_context + e_conv_context
                e_response = e_response + e_conv_response
            else:
                e_context = T.concatenate([e_context, e_conv_context],
                                          axis=1)
                e_response = T.concatenate([e_response, e_conv_response],
                                           axis=1)

            # penalize correlation
            if abs(corr_penalty) > 0:
                # Per-dimension negative correlation coefficient between the
                # context and response encodings across the batch.
                cor = []
                for i in range(hidden_size if elemwise_sum else 2 *
                               hidden_size):
                    y1, y2 = e_context, e_response
                    x1 = y1[:, i] - (np.ones(batch_size) *
                                     (T.sum(y1[:, i]) / batch_size))
                    x2 = y2[:, i] - (np.ones(batch_size) *
                                     (T.sum(y2[:, i]) / batch_size))
                    nr = T.sum(x1 * x2) / (T.sqrt(T.sum(x1 * x1)) *
                                           T.sqrt(T.sum(x2 * x2)))
                    cor.append(-nr)
            if abs(xcov_penalty) > 0:
                # Sum of squared entries of the batch-mean outer product of
                # the centred encodings.
                e_context_mean = T.mean(e_context, axis=0, keepdims=True)
                e_response_mean = T.mean(e_response, axis=0, keepdims=True)
                e_context_centered = e_context - e_context_mean  # (n, i)
                e_response_centered = e_response - e_response_mean  # (n, j)

                outer_prod = (e_context_centered.dimshuffle(0, 1, 'x') *
                              e_response_centered.dimshuffle(0, 'x', 1)
                              )  # (n, i, j)
                xcov = T.sum(T.sqr(T.mean(outer_prod, axis=0)))
        else:
            # CNN-only encoder: the recurrent encodings are discarded.
            e_context = e_conv_context
            e_response = e_conv_response

    # Bilinear score per example, squashed to (0, 1) and clipped away from
    # the exact endpoints before it enters the cross-entropy.
    dp = T.batched_dot(e_context, T.dot(e_response, self.M.T))
    #dp = pp('dp')(dp)
    o = T.nnet.sigmoid(dp)
    o = T.clip(o, 1e-7, 1.0 - 1e-7)

    # Zero-initialised shared buffers, one per symbolic input.
    self.shared_data = {}
    for key in ['c', 'r']:
        self.shared_data[key] = theano.shared(
            np.zeros((batch_size, img_h),
                     dtype=np.int32))
    for key in ['c_mask', 'r_mask']:
        self.shared_data[key] = theano.shared(
            np.zeros((batch_size, img_h),
                     dtype=theano.config.floatX))
    for key in ['y', 'c_seqlen', 'r_seqlen']:
        self.shared_data[key] = theano.shared(
            np.zeros((batch_size, ),
                     dtype=np.int32))

    # Two-column matrix [1 - o, o].
    self.probas = T.concatenate([(1 - o).reshape(
        (-1, 1)), o.reshape((-1, 1))],
                                axis=1)
    self.pred = T.argmax(self.probas, axis=1)
    self.errors = T.sum(T.neq(self.pred, y))
    self.cost = T.nnet.binary_crossentropy(o, y).mean()
    # The penalties are added under the same conditions under which `cor`
    # and `xcov` were defined above.
    if encoder.find('cnn') > -1 and (encoder.find('rnn') > -1 or
                                     encoder.find('lstm') > -1):
        if abs(corr_penalty) > 0:
            self.cost += corr_penalty * T.sum(cor)
        if abs(xcov_penalty) > 0:
            self.cost += xcov_penalty * xcov
    self.l_out = l_out
    self.l_recurrent = l_recurrent
    self.embeddings = embeddings
    self.c = c
    self.r = r
    self.y = y
    self.c_seqlen = c_seqlen
    self.r_seqlen = r_seqlen
    self.c_mask = c_mask
    self.r_mask = r_mask

    self.update_params()
def conv3d(signals, filters, signals_shape=None, filters_shape=None, border_mode="valid"):
    """
    Convolve spatio-temporal filters with a movie.

    The filters are flipped.  The 3-D convolution is assembled from one 2-D
    convolution over all (frame, filter-frame) pairs followed by a sum along
    the diagonal of the two time axes.

    Parameters
    ----------
    signals
        Timeseries of images whose pixels have color channels.
        Shape: [Ns, Ts, C, Hs, Ws], i.e. (batch, time, in channel, row, column).
    filters
        Spatio-temporal filters.
        Shape: [Nf, Tf, C, Hf, Wf], i.e. (out channel, time, in channel, row, column).
    signals_shape
        None or a tuple/list with the shape of signals.
    filters_shape
        None or a tuple/list with the shape of filters.
    border_mode
        One of 'valid', 'full' or 'half'.  A single string is applied to the
        time, height and width axes alike; a sequence gives one mode per axis
        (height and width modes must match).

    Returns
    -------
    The symbolic 5-D convolution result, laid out as
    (batch, time, out channel, row, column).

    Raises
    ------
    NotImplementedError
        If height and width modes differ, or a mode is 'same'.
    ValueError
        If a mode is not recognised.

    Notes
    -----
    For the GPU, use nnet.conv3d.

    See Also
    --------
    Someone made a script that shows how to swap the axes between both 3d
    convolution implementations in Theano. See the last `attachment
    <https://groups.google.com/d/msg/theano-users/1S9_bZgHxVw/0cQR9a4riFUJ>`_
    """
    if isinstance(border_mode, str):
        border_mode = (border_mode,) * 3

    # Fall back to the symbolic shapes when no static shape was supplied.
    sig_shape_5d = signals.shape if signals_shape is None else signals_shape
    flt_shape_5d = filters.shape if filters_shape is None else filters_shape

    Ns, Ts, C, Hs, Ws = sig_shape_5d
    Nf, Tf, C, Hf, Wf = flt_shape_5d

    # Fold the time axis into the leading axis for the 2-D convolution.
    sig_shape_4d = (Ns * Ts, C, Hs, Ws)
    flt_shape_4d = (Nf * Tf, C, Hf, Wf)

    spatial_mode = border_mode[1]
    if spatial_mode != border_mode[2]:
        raise NotImplementedError("height and width bordermodes must match")

    # Static shape hints are only forwarded when the caller provided them.
    out_4d = tensor.nnet.conv2d(
        signals.reshape(sig_shape_4d),
        filters.reshape(flt_shape_4d),
        input_shape=None if signals_shape is None else sig_shape_4d,
        filter_shape=None if filters_shape is None else flt_shape_4d,
        border_mode=spatial_mode,
    )  # border_mode[2] is ignored (checked equal above)

    # Spatial size of the 2-D convolution output.
    if spatial_mode == "valid":
        Hout, Wout = Hs - Hf + 1, Ws - Wf + 1
    elif spatial_mode == "full":
        Hout, Wout = Hs + Hf - 1, Ws + Wf - 1
    elif spatial_mode == "half":
        Hout, Wout = Hs - (Hf % 2) + 1, Ws - (Wf % 2) + 1
    elif spatial_mode == "same":
        raise NotImplementedError()
    else:
        raise ValueError("invalid border mode", spatial_mode)

    # Unfold the two time axes again.
    out_tmp = out_4d.reshape((Ns, Ts, Nf, Tf, Hout, Wout))

    # With a single filter frame there is nothing to sum over Tf; the Ts axis
    # of the output is unchanged.
    if Tf == 1:
        return out_tmp.reshape((Ns, Ts, Nf, Hout, Wout))

    # Amount of zero padding needed along the time axis.
    temporal_mode = border_mode[0]
    if temporal_mode == "valid":
        Tpad = 0
    elif temporal_mode == "full":
        Tpad = Tf - 1
    elif temporal_mode == "half":
        Tpad = Tf // 2
    elif temporal_mode == "same":
        raise NotImplementedError()
    else:
        raise ValueError("invalid border mode", temporal_mode)

    # Sum along the diagonal through the (Ts, Tf) sub-matrix.
    if Tpad == 0:
        return diagonal_subtensor(out_tmp, 1, 3).sum(axis=3)

    # Pad with zeros on both ends of the time axis before the diagonal sum.
    padded = tensor.zeros(dtype=out_tmp.dtype,
                          shape=(Ns, Ts + 2 * Tpad, Nf, Tf, Hout, Wout))
    padded = tensor.set_subtensor(
        padded[:, Tpad:(Ts + Tpad), :, :, :, :], out_tmp)
    return diagonal_subtensor(padded, 1, 3).sum(axis=3)
def inner_fn(t, stm1, postm1, vtm1):
    """One simulation/inference step: act, move, observe, infer, score.

    Arguments (names suggest "value at the previous step"):
        t      -- step counter; only compared against 20 below
        stm1   -- previous hidden state, reshaped to (n_s, n_proc)
        postm1 -- previous position
        vtm1   -- previous velocity

    Returns the tuple
        (st, post, vt, oat, ot, oht, FEt, KL_st, stmu, stsig, force,
         p_ot, p_oht, p_oat)

    All weight matrices (Wa_*, Wq_*, Wl_*), biases (ba_*, bq_*, bl_*), the
    sizes n_s / n_o / n_oh / n_oa / n_proc, the sig_min_* floors,
    theano_rng, ifelse, GaussianNLL and KLGaussianGaussian come from the
    enclosing scope.

    NOTE(review): presumably used as the step function of a theano.scan --
    confirm at the call site.  The three theano_rng.normal draws happen in
    a fixed order (action, observation, state); keep that order when
    editing.
    """

    # Use hidden state to generate action state
    aht = T.dot(Wa_aht_st, T.reshape(stm1, (n_s, n_proc))) + ba_aht
    #aht2 = T.dot(Wa_aht2_aht, T.reshape(aht,(n_s,n_proc))) + ba_aht2
    #aht3 = T.dot(Wa_aht3_aht2, T.reshape(aht2,(n_s,n_proc))) + ba_aht3
    atm1_mu = T.dot(Wa_atmu_aht, T.reshape(aht, (n_s, n_proc))) + ba_atmu
    # softplus keeps the standard deviation positive; sig_min_action is
    # added on top as a floor.
    atm1_sig = T.nnet.softplus(
        T.dot(Wa_atsig_aht, T.reshape(aht, (n_s, n_proc))) +
        ba_atsig) + sig_min_action

    # Sample Action
    atm1 = atm1_mu + theano_rng.normal((n_oa, n_proc)) * atm1_sig

    # Update Environment
    action_force = T.tanh(atm1)
    # Position-dependent force (different expression for postm1 < 0 and
    # postm1 >= 0) minus a damping term proportional to the velocity.
    force = T.switch(
        T.lt(postm1, 0.0), -2 * postm1 - 1,
        -T.pow(1 + 5 * T.sqr(postm1), -0.5) - T.sqr(postm1) *
        T.pow(1 + 5 * T.sqr(postm1), -1.5) - T.pow(postm1, 4) / 16.0
    ) - 0.25 * vtm1
    vt = vtm1 + 0.05 * force + 0.03 * action_force
    post = postm1 + vt

    # Generate Sensory Inputs:

    # 1.) Observation of Last Action
    oat = atm1

    # 2.) Noisy Observation of Current Position
    ot = post + theano_rng.normal((n_o, n_proc)) * 0.01

    # 3.) Nonlinear Transformed Sensory Channel
    #     (Gaussian bump centred at position 1.0 with width 0.3)
    oht = T.exp(-T.sqr(post - 1.0) / 2.0 / 0.3 / 0.3)

    # Infer hidden state from last hidden state and current observations, using variational density
    hst = T.nnet.relu(
        T.dot(Wq_hst_stm1, T.reshape(stm1, (n_s, n_proc))) +
        T.dot(Wq_hst_ot, T.reshape(ot, (n_o, n_proc))) +
        T.dot(Wq_hst_oht, T.reshape(oht, (n_oh, n_proc))) +
        T.dot(Wq_hst_oat, T.reshape(oat, (n_oa, n_proc))) + bq_hst)
    hst2 = T.nnet.relu(
        T.dot(Wq_hst2_hst, T.reshape(hst, (n_s, n_proc))) + bq_hst2)

    stmu = T.tanh(
        T.dot(Wq_stmu_hst2, T.reshape(hst2, (n_s, n_proc))) + bq_stmu)
    stsig = T.nnet.softplus(
        T.dot(Wq_stsig_hst2, T.reshape(hst2, (n_s, n_proc))) +
        bq_stsig) + sig_min_states

    # Explicitly encode position as homeostatic state variable
    # Rescale representation to fit within linear response of the tanh-nonlinearity
    # (row 0 of the mean is overwritten with the scaled observation, and
    # row 0 of the standard deviation with the constant 0.005)
    stmu = T.set_subtensor(stmu[0, :], 0.1 * ot[0, :]).reshape((n_s, n_proc))
    stsig = T.set_subtensor(stsig[0, :], 0.005).reshape((n_s, n_proc))

    # Sample from variational density
    st = stmu + theano_rng.normal((n_s, n_proc)) * stsig

    # Calculate parameters of likelihood distributions from sampled state
    ost = T.nnet.relu(T.dot(Wl_ost_st, T.reshape(st, (n_s, n_proc))) + bl_ost)
    ost2 = T.nnet.relu(
        T.dot(Wl_ost2_ost, T.reshape(ost, (n_s, n_proc))) + bl_ost2)
    ost3 = T.nnet.relu(
        T.dot(Wl_ost3_ost2, T.reshape(ost2, (n_s, n_proc))) + bl_ost3)

    # All three observation heads read the last hidden layer ost3.
    otmu = T.dot(Wl_otmu_st, T.reshape(ost3, (n_s, n_proc))) + bl_otmu
    otsig = T.nnet.softplus(
        T.dot(Wl_otsig_st, T.reshape(ost3, (n_s, n_proc))) +
        bl_otsig) + sig_min_obs

    ohtmu = T.dot(Wl_ohtmu_st, T.reshape(ost3, (n_s, n_proc))) + bl_ohtmu
    ohtsig = T.nnet.softplus(
        T.dot(Wl_ohtsig_st, T.reshape(ost3, (n_s, n_proc))) +
        bl_ohtsig) + sig_min_obs

    oatmu = T.dot(Wl_oatmu_st, T.reshape(ost3, (n_s, n_proc))) + bl_oatmu
    oatsig = T.nnet.softplus(
        T.dot(Wl_oatsig_st, T.reshape(ost3, (n_s, n_proc))) +
        bl_oatsig) + sig_min_obs

    # Calculate negative log-likelihood of observations
    p_ot = GaussianNLL(ot, otmu, otsig)
    p_oht = GaussianNLL(oht, ohtmu, ohtsig)
    p_oat = GaussianNLL(oat, oatmu, oatsig)

    # Calculate prior expectation on hidden state from previous state
    prior_stmu = T.tanh(
        T.dot(Wl_stmu_stm1, T.reshape(stm1, (n_s, n_proc))) + bl_stmu)
    prior_stsig = T.nnet.softplus(
        T.dot(Wl_stsig_stm1, T.reshape(stm1, (n_s, n_proc))) +
        bl_stsig) + sig_min_states

    # Explicitly encode expectations on homeostatic state variable
    # (for t < 20 the learned prior is used unchanged; from t >= 20 on,
    # row 0 is pinned to mean 0.1 and standard deviation 0.005)
    prior_stmu = ifelse(T.lt(t, 20), prior_stmu,
                        T.set_subtensor(prior_stmu[0, :], 0.1))
    prior_stsig = ifelse(T.lt(t, 20), prior_stsig,
                         T.set_subtensor(prior_stsig[0, :], 0.005))

    # Calculate KL divergence between variational density and prior density
    # using explicit formula for diagonal gaussians
    KL_st = KLGaussianGaussian(stmu, stsig, prior_stmu, prior_stsig)

    # Put free energy functional together
    FEt = KL_st + p_ot + p_oht + p_oat

    return st, post, vt, oat, ot, oht, FEt, KL_st, stmu, stsig, force, p_ot, p_oht, p_oat
def interleave_blanks(Y):
    """Surround every entry of the symbolic vector ``Y`` with -1 fillers.

    The result has length ``2 * len(Y) + 1``: the entries of ``Y`` occupy
    the odd positions (1, 3, 5, ...) and every even position holds -1.
    """
    n_labels = Y.shape[0]
    # Start from a vector filled with the -1 filler value ...
    padded = T.alloc(-1, n_labels * 2 + 1)
    # ... then drop the labels into the odd slots.
    odd_slots = T.arange(n_labels) * 2 + 1
    return T.set_subtensor(padded[odd_slots], Y)
def compute_landmarks_helper(self, moms, init_landmarks):
    """Deform the first 9 of 68 2-D landmarks with two kernel-weighted steps.

    Both inputs are symbolic vectors; their first 136 entries are read as a
    (68, 2) array.  Only rows 0..8 are kept (all other rows are masked to
    zero).  Each step moves landmark ``k`` by ``self.tau`` times

        sum_j exp(-||p_k - p_j||^2 / self.sigmaV2) * moms[j]

    where ``p`` is the current shape: the masked initial landmarks in the
    first step, the result of the first step in the second.

    :param moms: symbolic vector with at least 136 entries (momenta)
    :param init_landmarks: symbolic vector with at least 136 entries
    :return: flattened (length 136) symbolic vector of deformed landmarks
    """
    n_points, n_dims, n_active = 68, 2, 9

    moms = T.reshape(moms[:n_points * n_dims], (n_points, n_dims))
    init_landmarks = T.reshape(init_landmarks[:n_points * n_dims],
                               (n_points, n_dims))

    # Ones on the first n_active rows, zeros elsewhere.
    mask = T.zeros((n_points, n_dims))
    mask = T.set_subtensor(mask[0:n_active, :], np.ones((n_active, n_dims)))
    init_masked = init_landmarks * mask
    moms_masked = moms * mask

    def kernel_displacement(shape, anchor_mask=None):
        """Return the (68, 2) displacement of the active landmarks of `shape`."""
        disp = T.zeros((n_points, n_dims))
        for k in range(n_active):
            # Landmark k repeated on every row, so that it can be compared
            # against all landmarks at once.
            anchor = T.alloc(shape[k, :], n_points, n_dims)
            if anchor_mask is not None:
                anchor = anchor * anchor_mask
            # Gaussian kernel weight between landmark k and every landmark.
            w_val = T.exp(-T.sum((anchor - shape) ** 2, axis=1) / self.sigmaV2)
            # Duplicate the weight on both coordinate columns.
            weight = T.zeros((n_points, n_dims))
            weight = T.set_subtensor(weight[:, 0], w_val)
            weight = T.set_subtensor(weight[:, 1], w_val)
            # Rows >= n_active of moms_masked are zero and so contribute
            # nothing to the sum, whatever their weight.
            disp = T.set_subtensor(disp[k, :],
                                   T.sum(weight * moms_masked, axis=0))
        return disp

    # First step: anchors are masked, as in the reference landmarks.
    dp = kernel_displacement(init_masked, anchor_mask=mask)
    deformed = init_masked + (dp * self.tau)

    # Second step: kernel evaluated on the deformed shape, anchors unmasked.
    dp1 = kernel_displacement(deformed)

    output = (deformed + dp1 * self.tau).flatten()
    return output
def test_setsubtensor2(self):
    """Check R-op/L-op of ``set_subtensor`` when a slice of ``self.x`` is
    written into the head of a shared vector of length 10."""
    shared_values = numpy.asarray(self.rng.uniform(size=(10,)),
                                  theano.config.floatX)
    shared_vec = theano.shared(shared_values)
    # The destination is the shared vector; the source is a slice of self.x.
    result = tensor.set_subtensor(shared_vec[:4], self.x[:4])
    self.check_rop_lop(result, (10,))
def clip_around_zero(x, threshold=0.2):
    """Return ``x`` with every entry strictly inside
    ``(-threshold, threshold)`` replaced by 0.

    Entries equal to ``threshold`` or ``-threshold`` are left untouched
    because both comparisons are strict.
    """
    below_upper = x < threshold
    above_lower = x > -threshold
    inside_band = T.bitwise_and(below_upper, above_lower)
    band_positions = inside_band.nonzero()
    return T.set_subtensor(x[band_positions], 0)
def scan(fn,
         sequences=None,
         outputs_info=None,
         non_sequences=None,
         n_steps=None,
         truncate_gradient=-1,
         go_backwards=False,
         mode=None,
         name=None,
         options=None,
         profile=False):
    """
    Construct and apply a Scan op to the provided arguments.

    :param fn:
        ``fn`` is a function that describes the operations involved in one
        step of ``scan``. ``fn`` should construct variables describing the
        output of one iteration step. It should expect as input theano
        variables representing all the slices of the input sequences
        and previous values of the outputs, as well as all other arguments
        given to scan as ``non_sequences``. The order in which scan passes
        these variables to ``fn`` is the following :

        * all time slices of the first sequence
        * all time slices of the second sequence
        * ...
        * all time slices of the last sequence
        * all past slices of the first output
        * all past slices of the second output
        * ...
        * all past slices of the last output
        * all other arguments (the list given as `non_sequences` to
          scan)

        The order of the sequences is the same as the one in the list
        `sequences` given to scan. The order of the outputs is the same
        as the order of ``outputs_info``. For any sequence or output the
        order of the time slices is the same as the one in which they have
        been given as taps. For example if one writes the following :

        .. code-block:: python

            scan(fn, sequences = [ dict(input= Sequence1, taps = [-3,2,-1])
                                 , Sequence2
                                 , dict(input = Sequence3, taps = 3) ]
                   , outputs_info = [ dict(initial = Output1, taps = [-3,-5])
                                    , dict(initial = Output2, taps = None)
                                    , Output3 ]
                   , non_sequences = [ Argument1, Argument2])

        ``fn`` should expect the following arguments in this given order:

        #. ``Sequence1[t-3]``
        #. ``Sequence1[t+2]``
        #. ``Sequence1[t-1]``
        #. ``Sequence2[t]``
        #. ``Sequence3[t+3]``
        #. ``Output1[t-3]``
        #. ``Output1[t-5]``
        #. ``Output3[t-1]``
        #. ``Argument1``
        #. ``Argument2``

        The list of ``non_sequences`` can also contain shared variables
        used in the function, though ``scan`` is able to figure those
        out on its own so they can be skipped. For the clarity of the
        code we recommend though to provide them to scan. To some extent
        ``scan`` can also figure out other ``non sequences`` (not shared)
        even if not passed to scan (but used by `fn`). A simple example of
        this would be :

        .. code-block:: python

            import theano.tensor as TT
            W   = TT.matrix()
            W_2 = W**2
            def f(x):
                return TT.dot(x,W_2)

        The function is expected to return two things. One is a list of
        outputs ordered in the same order as ``outputs_info``, with the
        difference that there should be only one output variable per
        output initial state (even if no tap value is used). Secondly
        `fn` should return an update dictionary (that tells how to
        update any shared variable after each iteration step). The
        dictionary can optionally be given as a list of tuples. There is
        no constraint on the order of these two lists, ``fn`` can return
        either ``(outputs_list, update_dictionary)`` or
        ``(update_dictionary, outputs_list)`` or just one of the two (in
        case the other is empty).

        To use ``scan`` as a while loop, the user needs to change the
        function ``fn`` such that also a stopping condition is returned.
        To do so, he/she needs to wrap the condition in an ``until`` class.
        The condition should be returned as a third element, for example:

        .. code-block:: python

            ...
            return [y1_t, y2_t], {x:x+1}, theano.scan_module.until(x < 50)

        Note that a number of steps (considered in here as the maximum
        number of steps) is still required even though a condition is
        passed (and it is used to allocate memory if needed).

    :param sequences:
        ``sequences`` is the list of Theano variables or dictionaries
        describing the sequences ``scan`` has to iterate over. If a
        sequence is given as wrapped in a dictionary, then a set of optional
        information can be provided about the sequence. The dictionary
        should have the following keys:

        * ``input`` (*mandatory*) -- Theano variable representing the
          sequence.

        * ``taps`` -- Temporal taps of the sequence required by ``fn``.
          They are provided as a list of integers, where a value ``k``
          implies that at iteration step ``t`` scan will pass to ``fn``
          the slice ``t+k``. Default value is ``[0]``

        Any Theano variable in the list ``sequences`` is automatically
        wrapped into a dictionary where ``taps`` is set to ``[0]``

    :param outputs_info:
        ``outputs_info`` is the list of Theano variables or dictionaries
        describing the initial state of the outputs computed
        recurrently. When these initial states are given as dictionaries,
        optional information can be provided about the output corresponding
        to these initial states. The dictionary should have the following
        keys:

        * ``initial`` -- Theano variable that represents the initial
          state of a given output. In case the output is not computed
          recursively (think of a map) and does not require an initial
          state this field can be skipped. Given that only the previous
          time step of the output is used by ``fn`` the initial state
          should have the same shape as the output. If multiple time
          taps are used, the initial state should have one extra
          dimension that should cover all the possible taps. For example
          if we use ``-5``, ``-2`` and ``-1`` as past taps, at step 0,
          ``fn`` will require (by an abuse of notation) ``output[-5]``,
          ``output[-2]`` and ``output[-1]``. This will be given by
          the initial state, which in this case should have the shape
          (5,)+output.shape. If this variable containing the initial
          state is called ``init_y`` then ``init_y[0]`` *corresponds to*
          ``output[-5]``. ``init_y[1]`` *corresponds to* ``output[-4]``,
          ``init_y[2]`` corresponds to ``output[-3]``, ``init_y[3]``
          corresponds to ``output[-2]``, ``init_y[4]`` corresponds to
          ``output[-1]``. While this order might seem strange, it comes
          natural from splitting an array at a given point. Assume that
          we have an array ``x``, and we choose ``k`` to be time step
          ``0``. Then our initial state would be ``x[:k]``, while the
          output will be ``x[k:]``. Looking at this split, elements in
          ``x[:k]`` are ordered exactly like those in ``init_y``.

        * ``taps`` -- Temporal taps of the output that will be passed to
          ``fn``. They are provided as a list of *negative* integers,
          where a value ``k`` implies that at iteration step ``t`` scan
          will pass to ``fn`` the slice ``t+k``.

        ``scan`` will follow this logic if partial information is given:

        * If an output is not wrapped in a dictionary, ``scan`` will wrap
          it in one assuming that you use only the last step of the output
          (i.e. it makes your tap value list equal to [-1]).
        * If you wrap an output in a dictionary and you do not provide any
          taps but you provide an initial state it will assume that you are
          using only a tap value of -1.
        * If you wrap an output in a dictionary but you do not provide any
          initial state, it assumes that you are not using any form of
          taps.
        * If you provide a ``None`` instead of a variable or an empty
          dictionary ``scan`` assumes that you will not use any taps for
          this output (like for example in case of a map)

        If ``outputs_info`` is an empty list or None, ``scan`` assumes
        that no tap is used for any of the outputs. If information is
        provided just for a subset of the outputs an exception is
        raised (because there is no convention on how scan should map
        the provided information to the outputs of ``fn``)

    :param non_sequences:
        ``non_sequences`` is the list of arguments that are passed to
        ``fn`` at each step. One can opt to exclude variables
        used in ``fn`` from this list as long as they are part of the
        computational graph, though for clarity we encourage not to do so.

    :param n_steps:
        ``n_steps`` is the number of steps to iterate given as an int
        or Theano scalar. If any of the input sequences do not have
        enough elements, scan will raise an error. If the *value is 0* the
        outputs will have *0 rows*. If the value is negative, ``scan``
        will run backwards in time. If the ``go_backwards`` flag is already
        set and also ``n_steps`` is negative, ``scan`` will run forward
        in time. If n_steps is not provided, ``scan`` will figure
        out the amount of steps it should run given its input sequences.

    :param truncate_gradient:
        ``truncate_gradient`` is the number of steps to use in truncated
        BPTT.  If you compute gradients through a scan op, they are
        computed using backpropagation through time. By providing a
        different value than -1, you choose to use truncated BPTT instead
        of classical BPTT, where you go for only ``truncate_gradient``
        number of steps back in time.

    :param go_backwards:
        ``go_backwards`` is a flag indicating if ``scan`` should go
        backwards through the sequences. If you think of each sequence
        as indexed by time, making this flag True would mean that
        ``scan`` goes back in time, namely that for any sequence it
        starts from the end and goes towards 0.

    :param name:
        When profiling ``scan``, it is crucial to provide a name for any
        instance of ``scan``. The profiler will produce an overall
        profile of your code as well as profiles for the computation of
        one step of each instance of ``scan``. The ``name`` of the instance
        appears in those profiles and can greatly help to disambiguate
        information.

    :param mode:
        It is recommended to leave this argument to None, especially
        when profiling ``scan`` (otherwise the results are not going to
        be accurate). If you prefer the computations of one step of
        ``scan`` to be done differently than the entire function, you
        can use this parameter to describe how the computations in this
        loop are done (see ``theano.function`` for details about
        possible values and their meaning).

    :param profile:
        Flag or string. If true, or different from the empty string, a
        profile object will be created and attached to the inner graph of
        scan. In case ``profile`` is True, the profile object will have the
        name of the scan instance, otherwise it will have the passed string.
        Profile object collect (and print) information only when running the
        inner graph with the new cvm linker ( with default modes,
        other linkers this argument is useless)

    :rtype: tuple
    :return: tuple of the form (outputs, updates); ``outputs`` is either a
             Theano variable or a list of Theano variables representing the
             outputs of ``scan`` (in the same order as in
             ``outputs_info``). ``updates`` is a subclass of dictionary
             specifying the
             update rules for all shared variables used in scan
             This dictionary should be passed to ``theano.function`` when
             you compile your function. The change compared to a normal
             dictionary is that we validate that keys are SharedVariable
             and addition of those dictionary are validated to be consistent.
    """
    # Note : see the internal documentation of the scan op for naming
    # conventions and all other details
    if options is None:
        options = {}
    rvals = scan_utils.canonical_arguments(sequences,
                                           outputs_info,
                                           non_sequences,
                                           go_backwards,
                                           n_steps)
    inputs, states_and_outputs_info, parameters, T = rvals
    # If we provided a known number of steps ( before compilation)
    # and if that number is 1 or -1, then we can skip the Scan Op,
    # and just apply the inner function once
    # To do that we check here to see the nature of n_steps
    T_value = None
    if isinstance(n_steps, (float, int)):
        T_value = int(n_steps)
    else:
        try:
            T_value = opt.get_constant_value(n_steps)
        except (TypeError, AttributeError):
            T_value = None

    if T_value in (1, -1):
        return one_step_scan(fn,
                             inputs,
                             states_and_outputs_info,
                             parameters,
                             truncate_gradient)

    # 1. Variable representing the current time step
    t = scalar_shared(numpy.int64(0), name='t')

    # 2. Allocate memory for the states of scan.
    mintaps = []
    lengths = []
    for pos, arg_info in enumerate(states_and_outputs_info):
        if arg_info.get('taps', None) == [-1]:
            mintaps.append(1)
            lengths.append(scalar_shared(numpy.int64(0),
                                         name='l%d' % pos))
            arg_info['initial'] = scan_utils.expand(tensor.unbroadcast(
                tensor.shape_padleft(arg_info['initial']), 0), T)
        elif arg_info.get('taps', None):
            if numpy.any(numpy.array(arg_info.get('taps', [])) > 0):
                # Make sure we do not have requests for future values of a
                # sequence we can not provide such values
                raise ValueError('Can not use future taps of outputs',
                                 arg_info)
            mintap = abs(numpy.min(arg_info['taps']))
            lengths.append(scalar_shared(numpy.int64(0),
                                         name='l%d' % pos))
            mintaps.append(mintap)
            arg_info['initial'] = scan_utils.expand(
                arg_info['initial'][:mintap], T)
        else:
            mintaps.append(0)
            lengths.append(scalar_shared(numpy.int64(0),
                                         name='l%d' % pos))

    # 3. Generate arguments for the function passed to scan. This
    # function will return the outputs that need to be computed at every
    # timestep
    inputs_slices = [input[t] for input in inputs]
    states_slices = []
    for n, state in enumerate(states_and_outputs_info):
        # Check if it is actually a state and not an output
        if mintaps[n] != 0:
            for k in state['taps']:
                states_slices.append(
                    state['initial'][(t + mintaps[n] + k) % lengths[n]])

    # 4. Construct outputs that are to be computed by the inner
    # function of scan
    args = inputs_slices + states_slices + parameters
    cond, states_and_outputs, updates = \
        scan_utils.get_updates_and_outputs(fn(*args))

    # User is allowed to provide no information if it only behaves like a
    # map
    # NOTE(review): in this case `lengths` and `states_and_outputs_info`
    # stay empty while step 5.2 indexes both by output position -- confirm
    # whether the map-without-outputs_info path is actually supported here.
    if (len(states_and_outputs) != len(states_and_outputs_info) and
            len(states_and_outputs_info) == 0):
        mintaps = [0] * len(states_and_outputs)

    # 5. Construct the scan op
    # 5.1 Construct list of shared variables with updates (those that
    # can be treated as states (i.e. of TensorType) and those that can not
    # (like Random States)

    if cond is not None:
        _cond = [cond]
    else:
        _cond = []
    rvals = rebuild_collect_shared(
        states_and_outputs + _cond,
        updates=updates,
        rebuild_strict=True,
        copy_inputs_over=True,
        no_default_updates=False)

    # extracting the arguments
    input_variables, cloned_outputs, other_rval = rvals
    clone_d, update_d, update_expr, shared_inputs = other_rval
    additional_input_states = []
    additional_output_states = []
    additional_lengths = []
    additional_mintaps = []
    original_numeric_shared_variables = []

    non_numeric_input_states = []
    non_numeric_output_states = []
    original_non_numeric_shared_variables = []
    pos = len(lengths)
    for sv in shared_inputs:
        if sv in update_d:
            # NOTE(review): `sv` is tested directly against TensorType here;
            # if shared inputs are variables rather than types this branch
            # may never be taken (perhaps `sv.type` was intended). The
            # expand/unbroadcast call below also nests its arguments
            # differently from the equivalent call in step 2. Left
            # unchanged -- confirm before relying on this path.
            if isinstance(sv, TensorType):
                # We can treat it as a sit sot
                nw_state = scan_utils.expand(
                    tensor.unbroadcast(tensor.shape_padleft(sv, 0), T))
                additional_lengths.append(scalar_shared(numpy.int64(0),
                                                        name='l%d' % pos))
                pos = pos + 1
                additional_mintaps.append(1)
                additional_input_states.append(nw_state)
                additional_output_states.append(
                    scan_utils.clone(tensor.set_subtensor(
                        nw_state[(t + 1) % additional_lengths[-1]],
                        update_d[sv])))
                original_numeric_shared_variables.append(sv)
            else:
                non_numeric_input_states.append(sv)
                non_numeric_output_states.append(update_d[sv])
                original_non_numeric_shared_variables.append(sv)

    # 5.2 Collect inputs/outputs of the inner function
    inputs = []
    outputs = []
    for n, mintap in enumerate(mintaps):
        if mintap != 0:
            input_state = states_and_outputs_info[n]['initial']
            inputs.append(input_state)
            outputs.append(
                tensor.set_subtensor(
                    input_state[(t + mintap) % lengths[n]],
                    states_and_outputs[n]))
        else:
            # Pure outputs have no initial state, so a buffer is allocated
            # for them and used as both the input and the write target.
            # (The original referenced an undefined name `output` here.)
            mem_buffer = scan_utils.allocate_memory(
                T, states_and_outputs_info[n], states_and_outputs[n])
            inputs.append(mem_buffer)
            outputs.append(
                tensor.set_subtensor(mem_buffer[t % lengths[n]],
                                     states_and_outputs[n]))
    inputs.extend(additional_input_states)
    outputs.extend(additional_output_states)
    lengths.extend(additional_lengths)
    mintaps.extend(additional_mintaps)
    inputs.extend(non_numeric_input_states)
    outputs.extend(non_numeric_output_states)
    all_other_inputs = gof.graph.inputs(outputs)
    parameters = [x for x in all_other_inputs
                  if (x not in inputs and x not in lengths and x is not t
                      and isinstance(x, gof.Variable) and
                      not isinstance(x, gof.Constant))]
    inputs.extend(parameters)
    # 5.3 Construct the options dictionary
    options['name'] = name
    options['profile'] = profile
    options['mode'] = mode
    options['inplace'] = False
    options['gpu'] = False
    options['truncate_gradient'] = truncate_gradient
    options['hash_inner_graph'] = 0
    # 5.4 Construct the ScanOp instance
    local_op = scan_op.ScanOp(inputs=inputs,
                              outputs=outputs,
                              lengths=lengths,
                              switches=[],
                              mintaps=mintaps,
                              index=t,
                              options=options,
                              as_repeatUntil=cond)
    # Note that we get here all the outputs followed by the update rules to
    # the shared variables we had in our scan
    # we know that we have (in this given order):
    #   * len(states_and_outputs) real outputs
    #   * len(additional_input_states) updates for numeric shared variable
    #   * len(non_numeric_input_states) updates for non numeric shared
    #   variables
    scan_inputs = [T] + inputs
    scan_outputs_update_rules = scan_utils.to_list(local_op(*scan_inputs))
    # 5.5 Collect outputs and add permutation object
    scan_outputs = []
    for pos in xrange(len(states_and_outputs)):
        out = scan_utils.ScanPermutation(mintaps[pos])(
            scan_outputs_update_rules[pos], t)
        # Drop the leading rows that belong to this output's own initial
        # state. (The original sliced with `mintap`, a variable left over
        # from the loop in step 5.2, i.e. the value for the last output.)
        scan_outputs.append(out[mintaps[pos]:])
    # 5.6 Construct updates dictionary
    update_rules = scan_outputs_update_rules[len(states_and_outputs):]
    updates = {}
    for v, u in izip(original_numeric_shared_variables,
                     update_rules[:len(additional_input_states)]):
        updates[v] = u[-1]
    for v, u in izip(original_non_numeric_shared_variables,
                     update_rules[len(additional_input_states):]):
        updates[v] = u
    # Step 5.7 We are done and can return everything back to the user
    return scan_outputs, updates
def encode(self, t, vecs):
    """Compose two rows of ``vecs`` and write the result into a third row.

    ``t`` is indexed at positions 0, 1 and 2: ``vecs[t[0]]`` and
    ``vecs[t[1]]`` are fed to ``self.compose`` and the first value it
    returns is stored at ``vecs[t[2]]``.

    Returns a pair: the updated ``vecs`` expression and the second value
    returned by ``self.compose``.
    """
    left_vec = vecs[t[0]]
    right_vec = vecs[t[1]]
    composed, reconstruction_loss = self.compose(left_vec, right_vec)
    updated_vecs = T.set_subtensor(vecs[t[2]], composed)
    return updated_vecs, reconstruction_loss
def inner(rot_param, base_relative):
    """Right-multiply ``base_relative`` by a 4x4 transform whose upper-left
    3x3 block is the rotation built from ``rot_param``.

    The rest of the 4x4 matrix is left as identity.
    """
    identity4 = T.eye(4, dtype=base_relative.dtype)
    rotation = euler_angles_to_rotation_matrix(rot_param)
    transform = T.set_subtensor(identity4[:3, :3], rotation)
    return T.dot(base_relative, transform)
def test_setsubtensor1(self):
    """Check R-op/L-op of ``set_subtensor`` when a shared vector of length 3
    is written into the head of ``self.x``."""
    shared_values = numpy.asarray(self.rng.uniform(size=(3,)),
                                  theano.config.floatX)
    shared_vec = theano.shared(shared_values)
    # The destination is a slice of self.x; the source is the shared vector.
    result = tensor.set_subtensor(self.x[:3], shared_vec)
    self.check_rop_lop(result, self.in_shape)
def _batchAlign(self, w_tb, mask_b):
    """Return ``mask_b`` with the entries selected by
    ``self.tokmap[w_tb]`` set to 0."""
    mapped_positions = self.tokmap[w_tb]
    cleared_mask = T.set_subtensor(mask_b[mapped_positions], 0)
    return cleared_mask
def compute_absolute(i, parent, relative, absolutes):
    """Store ``dot(absolutes[parent], relative)`` at ``absolutes[i]`` and
    return the updated ``absolutes`` expression.

    Original author's note: this is a hack -- a ``parent`` of -1 indexes the
    last element of ``absolutes``, which "we set to zero" (that setup is not
    visible in this function), because an ``ifelse`` could not be used here.
    """
    composed = T.dot(absolutes[parent], relative)
    return T.set_subtensor(absolutes[i], composed)
def __theano_trainx__(self, n_in, n_hidden):
    """
    Build the training-phase graph that runs once over the training sequences.

    Compiles ``self.seq_trainx``, a Theano function that takes an int vector
    of row indices, selects the matching rows of ``self.tra_buys_masks``,
    ``self.tra_buys_neg_masks`` and ``self.tra_masks``, applies one gradient
    step to ``self.paramsx``, ``self.lt`` and ``self.vyx``, and returns the
    negated sum of the masked log-sigmoid preference scores.

    ``n_in`` is not used in the body; ``n_hidden`` sizes the broadcast
    initial hidden state and biases.
    """
    # self.alpha_lambda = ['alpha', 'lambda', 'fea_random_zero']
    in_weights = self.uix
    hid_weights = self.whx

    # Original shape notes below (e.g. 157, 40, 20) are the author's example
    # sizes, not values fixed by this function.
    batch_mask = T.imatrix()                            # shape=(n, 157)
    actual_batch_size = batch_mask.shape[0]
    # Use the longest sequence in the mini-batch as seq_length.
    seq_length = T.max(T.sum(batch_mask, axis=1))
    mask_by_step = batch_mask.T                         # shape=(157, n)

    init_hidden = T.alloc(self.h0x, actual_batch_size, n_hidden)   # shape=(n, 40)
    # shape=(n, 3, 40), n_hidden is the last axis
    biases = T.alloc(self.bix, actual_batch_size, 3, n_hidden)
    biases = biases.dimshuffle(1, 2, 0)                 # shape=(3, 40, n)

    # Input side: only the purchased items are fed in.
    pos_idx = T.imatrix()                               # TensorType(int32, matrix)
    neg_idx = T.imatrix()
    pos_inputs = self.lt[pos_idx]           # shape((actual_batch_size, seq_length, n_in))
    pos_inputs = pos_inputs.dimshuffle(1, 0, 2)         # shape=(seq_length, batch_size, n_in)

    uniq_pos = Unique(False, False, False)(pos_idx)     # de-duplicate
    uniq_in_rows = self.lt[uniq_pos]

    # Output side: h*w gives the score.
    pos_out = self.vyx[pos_idx]
    neg_out = self.vyx[neg_idx]
    pos_out = pos_out.dimshuffle(1, 0, 2)
    neg_out = neg_out.dimshuffle(1, 0, 2)

    both_idx = T.concatenate((pos_idx, neg_idx))        # concatenate first
    uniq_both = Unique(False, False, False)(both_idx)   # then de-duplicate
    uniq_out_rows = self.vyx[uniq_both]

    # Given the positive/negative samples at time t and the hidden state at
    # t-1, compute the current hidden state and the current loss. The time
    # index t is omitted from the formulas below.
    #   Property used: T.dot((m, n), (n, )) has shape (m, ) -- each matrix
    #   row is multiplied with the (n, ) vector.
    #   GRU (author's notation):
    #     z = sigmoid(ux_z * xp + wh_z * h_pre1)
    #     r = sigmoid(ux_r * xp + wh_r * h_pre1)
    #     c = tanh(ux_c * xp + wh_c * (r elementwise* h_pre1))
    #     h = z * h_pre1 + (1.0 - z) * c
    #   Property used: T.dot((n, ), (n, )) gives a scalar.
    #     upq  = h_pre1 * (xp - xq)
    #     loss = log(1.0 + e^(-upq))
    # Note the code below mixes as (1 - z) * h_pre1 + z * c, i.e. with the
    # roles of z and (1 - z) swapped relative to the formula above.
    def step(ixp_t, yxp_t, yxq_t, mask_t, hx_t_pre1):
        # Features and hidden state are both handled as
        # shape=(batch_size, n_hidden)=(n, 20).
        gates = sigmoid(T.dot(in_weights[:2], ixp_t.T) +
                        T.dot(hid_weights[:2], hx_t_pre1.T) +
                        biases[:2])                      # shape=(2, 20, n)
        update_gate = gates[0].T                         # shape=(n, 20)
        reset_gate = gates[1].T
        candidate = tanh(T.dot(in_weights[2], ixp_t.T) +
                         T.dot(hid_weights[2], (reset_gate * hx_t_pre1).T) +
                         biases[2])                      # shape=(20, n)
        hx_t = ((T.ones_like(update_gate) - update_gate) * hx_t_pre1 +
                update_gate * candidate.T)               # shape=(n, 20)
        # Preference error.
        upq_t = T.sum(hx_t_pre1 * (yxp_t - yxq_t), axis=1)   # shape=(n, )
        raw_loss_t = T.log(sigmoid(upq_t))                    # shape=(n, )
        # Multiplying by the 0/1 mask at the loss is all that is needed.
        masked_loss_t = raw_loss_t * mask_t
        return [hx_t, masked_loss_t]                     # shape=(n, 20), (n, )

    # n_steps makes sure the loop only runs up to the longest valid position.
    [hidden_seq, loss], _ = theano.scan(
        fn=step,
        sequences=[pos_inputs, pos_out, neg_out, mask_by_step],
        outputs_info=[init_hidden, None],
        n_steps=seq_length)

    # ----------------------------------------------------------------------------
    # cost, gradients, learning rate, l2 regularization
    lr = self.alpha_lambda[0]
    l2 = self.alpha_lambda[1]
    weight_sq = T.sum([T.sum(par ** 2)
                       for par in [in_weights, hid_weights,
                                   pos_out, neg_out, pos_inputs]])
    bias_sq = T.sum([T.sum(par ** 2) for par in [biases]])
    seq_l2_sq = weight_sq + bias_sq / actual_batch_size
    upq = T.sum(loss)
    seq_costs = (
        - upq / actual_batch_size +
        0.5 * l2 * seq_l2_sq)
    seq_grads = T.grad(seq_costs, self.paramsx)
    seq_updates = []
    for par, gra in zip(self.paramsx, seq_grads):
        seq_updates.append((par, par - lr * gra))
    # Only the rows that were actually looked up get a gradient step.
    grad_in_rows = T.grad(seq_costs, self.lt)[uniq_pos]
    grad_out_rows = T.grad(seq_costs, self.vyx)[uniq_both]
    update_ix = T.set_subtensor(uniq_in_rows, uniq_in_rows - lr * grad_in_rows)
    update_yx = T.set_subtensor(uniq_out_rows, uniq_out_rows - lr * grad_out_rows)
    # These appends modify seq_updates directly.
    seq_updates.append((self.lt, update_ix))
    seq_updates.append((self.vyx, update_yx))

    # ----------------------------------------------------------------------------
    # Given the positive/negative sample sequences and the other parameters,
    # update the variables and return the loss. The data is supplied via
    # `givens`.
    start_end = T.ivector()
    self.seq_trainx = theano.function(
        inputs=[start_end],
        outputs=-upq,
        updates=seq_updates,
        givens={
            pos_idx: self.tra_buys_masks[start_end],       # type is TensorType(int32, matrix)
            neg_idx: self.tra_buys_neg_masks[start_end],   # T.ivector() has type TensorType(int32, vector)
            batch_mask: self.tra_masks[start_end]})
def euler_angles_to_rotation_matrix(xzy):
    """Return the 3x3 rotation matrix ``Rz . Ry . Rx`` for three angles.

    The angles are read from ``xzy`` in x, z, y order: ``xzy[0]`` rotates
    about x, ``xzy[1]`` about z and ``xzy[2]`` about y. All three matrices
    use the dtype of the x angle.
    """
    angle_x = xzy[0]
    angle_y = xzy[2]
    angle_z = xzy[1]
    dtype = angle_x.dtype

    # Rotation about x: touches rows/columns 1 and 2.
    cos_x, sin_x = T.cos(angle_x), T.sin(angle_x)
    rot_x = T.eye(3, dtype=dtype)
    rot_x = T.set_subtensor(rot_x[1, 1], cos_x)
    rot_x = T.set_subtensor(rot_x[2, 1], sin_x)
    rot_x = T.set_subtensor(rot_x[1, 2], -rot_x[2, 1])
    rot_x = T.set_subtensor(rot_x[2, 2], rot_x[1, 1])

    # Rotation about y: touches rows/columns 0 and 2.
    cos_y, sin_y = T.cos(angle_y), T.sin(angle_y)
    rot_y = T.eye(3, dtype=dtype)
    rot_y = T.set_subtensor(rot_y[0, 0], cos_y)
    rot_y = T.set_subtensor(rot_y[0, 2], sin_y)
    rot_y = T.set_subtensor(rot_y[2, 0], -rot_y[0, 2])
    rot_y = T.set_subtensor(rot_y[2, 2], rot_y[0, 0])

    # Rotation about z: touches rows/columns 0 and 1.
    cos_z, sin_z = T.cos(angle_z), T.sin(angle_z)
    rot_z = T.eye(3, dtype=dtype)
    rot_z = T.set_subtensor(rot_z[0, 0], cos_z)
    rot_z = T.set_subtensor(rot_z[1, 0], sin_z)
    rot_z = T.set_subtensor(rot_z[0, 1], -rot_z[1, 0])
    rot_z = T.set_subtensor(rot_z[1, 1], rot_z[0, 0])

    rot_zy = T.dot(rot_z, rot_y)
    return T.dot(rot_zy, rot_x)
def run_experiment(self, dataset, word_embedding, exp_name): # load parameters num_maps_word = self.options["num_maps_word"] drop_rate_word = self.options["drop_rate_word"] drop_rate_sentence = self.options["drop_rate_sentence"] word_window = self.options["word_window"] word_dim = self.options["word_dim"] k_max_word = self.options["k_max_word"] k_max_sentence = self.options["k_max_sentence"] batch_size = self.options["batch_size"] rho = self.options["rho"] epsilon = self.options["epsilon"] norm_lim = self.options["norm_lim"] max_iteration = self.options["max_iteration"] k_portion = self.options["k_portion"] num_maps_sentence = self.options["num_maps_sentence"] sentence_window = self.options["sentence_window"] sentence_len = len(dataset[0][0][0][0]) sentence_num = len(dataset[0][0][0]) # compute the sentence flags train_flags, test_flags = construct_sentence_flag(dataset) train_k_value = construct_dynamic_k(train_flags, k_portion) test_k_value = construct_dynamic_k(test_flags, k_portion) train_flags = theano.shared(value=np.asarray( train_flags, dtype=theano.config.floatX), borrow=True) test_flags = theano.shared(value=np.asarray( test_flags, dtype=theano.config.floatX), borrow=True) train_k = theano.shared(value=np.asarray(train_k_value, dtype=theano.config.floatX), borrow=True) test_k = theano.shared(value=np.asarray(test_k_value, dtype=theano.config.floatX), borrow=True) # define the parameters x = T.tensor3("x") y = T.ivector("y") sen_flags = T.matrix("flag") sen_k = T.matrix("sen_k") rng = np.random.RandomState(1234) words = theano.shared(value=np.asarray(word_embedding, dtype=theano.config.floatX), name="embedding", borrow=True) zero_vector_tensor = T.vector() zero_vec = np.zeros(word_dim, dtype=theano.config.floatX) set_zero = theano.function( [zero_vector_tensor], updates=[(words, T.set_subtensor(words[0, :], zero_vector_tensor))]) x_emb = words[T.cast(x.flatten(), dtype="int32")].reshape( (x.shape[0] * x.shape[1], 1, x.shape[2], words.shape[1])) 
dropout_x_emb = nn.dropout_from_layer(rng, x_emb, drop_rate_word) # compute convolution on words layer word_filter_shape = (num_maps_word, 1, word_window, word_dim) word_pool_size = (sentence_len - word_window + 1, 1) dropout_word_conv = nn.ConvPoolLayer(rng, input=dropout_x_emb, input_shape=None, filter_shape=word_filter_shape, pool_size=word_pool_size, activation=Tanh, k=k_max_word) sent_vec_dim = num_maps_word * k_max_word dropout_sent_vec = dropout_word_conv.output.reshape( (x.shape[0], 1, x.shape[1], sent_vec_dim)) dropout_sent_vec = nn.dropout_from_layer(rng, dropout_sent_vec, drop_rate_sentence) word_conv = nn.ConvPoolLayer(rng, input=dropout_x_emb * (1 - drop_rate_word), input_shape=None, filter_shape=word_filter_shape, pool_size=word_pool_size, activation=Tanh, k=k_max_word, W=dropout_word_conv.W, b=dropout_word_conv.b) sent_vec = word_conv.output.reshape( (x.shape[0], 1, x.shape[1], sent_vec_dim)) sent_vec = sent_vec * (1 - drop_rate_sentence) # construct doc level context information sent_filter_shape = (num_maps_sentence, 1, sentence_window, sent_vec_dim) sent_pool_size = (sentence_num - sentence_window + 1, 1) dropout_sent_conv = nn.ConvPoolLayer(rng, input=dropout_sent_vec, input_shape=None, filter_shape=sent_filter_shape, pool_size=sent_pool_size, activation=Tanh, k=k_max_sentence) sent_conv = nn.ConvPoolLayer(rng, input=sent_vec, input_shape=None, filter_shape=sent_filter_shape, pool_size=sent_pool_size, activation=Tanh, k=k_max_sentence, W=dropout_sent_conv.W, b=dropout_sent_conv.b) # reshape the sentence vec dropout_sent_vec = dropout_sent_vec.reshape( (x.shape[0], x.shape[1], sent_vec_dim)) sent_vec = sent_vec.reshape((x.shape[0], x.shape[1], sent_vec_dim)) dropout_doc_vec = dropout_sent_conv.output.flatten(2) doc_vec = sent_conv.output.flatten(2) doc_vec_dim = num_maps_sentence * k_max_sentence # concatenate the doc vec along with the sentence vector con_dropout_sent_vec = T.concatenate([ dropout_sent_vec, T.tile(dropout_doc_vec, [1, 
x.shape[1]]).reshape( (x.shape[0], x.shape[1], doc_vec_dim)) ], axis=2).reshape( (x.shape[0] * x.shape[1], sent_vec_dim + doc_vec_dim)) con_sent_vec = T.concatenate([ sent_vec, T.tile(doc_vec, [1, x.shape[1]]).reshape( (x.shape[0], x.shape[1], doc_vec_dim)) ], axis=2).reshape( (x.shape[0] * x.shape[1], sent_vec_dim + doc_vec_dim)) # construct sentence level classifier n_in = sent_vec_dim + doc_vec_dim n_out = 1 sen_W_values = np.zeros((n_in, n_out), dtype=theano.config.floatX) sen_W = theano.shared(value=sen_W_values, borrow=True, name="logis_W") sen_b_value = nn.as_floatX(0.0) sen_b = theano.shared(value=sen_b_value, borrow=True, name="logis_b") drop_sent_prob = T.nnet.sigmoid( T.dot(con_dropout_sent_vec, sen_W) + sen_b) sent_prob = T.nnet.sigmoid(T.dot(con_sent_vec, sen_W) + sen_b) # reform the sent vec to doc level drop_sent_prob = drop_sent_prob.reshape((x.shape[0], x.shape[1])) sent_prob = sent_prob.reshape((x.shape[0], x.shape[1])) # using the dynamic top k max probability as bag level probability # compute the dynamic K for each documents drop_doc_prob = T.sum(T.sort(drop_sent_prob, axis=1) * sen_k, axis=1) / T.sum(sen_k, axis=1) doc_prob = T.sum(T.sort(sent_prob, axis=1) * sen_k, axis=1) / T.sum( sen_k, axis=1) drop_doc_prob = T.clip(drop_doc_prob, nn.as_floatX(1e-7), nn.as_floatX(1 - 1e-7)) doc_prob = T.clip(doc_prob, nn.as_floatX(1e-7), nn.as_floatX(1 - 1e-7)) doc_preds = doc_prob > 0.5 # instance level cost drop_sent_cost = T.sum( T.maximum( 0.0, nn.as_floatX(.5) - T.sgn( drop_sent_prob.reshape((x.shape[0] * x.shape[1], n_out)) - nn.as_floatX(0.6)) * T.dot(con_dropout_sent_vec, sen_W)) * sen_flags.reshape( (x.shape[0] * x.shape[1], n_out))) / T.sum(sen_flags) # we need that the most positive instance at least 0.7 in pos bags # and at most 0.1 in neg bags # we want the number of positive instance should at least ... 
# and non of the positive instances in the negative bags # compute the number of positive instance positive_count = T.sum((drop_sent_prob * sen_flags) > 0.5, axis=1) pos_cost = T.maximum(nn.as_floatX(0.0), positive_count - T.sum(sen_k, axis=1)) neg_cost = T.maximum(nn.as_floatX(0.0), positive_count) penal_cost = T.mean(pos_cost * y + neg_cost * (nn.as_floatX(1.0) - y)) # add the sentence similarity constrains sen_sen = T.dot(con_dropout_sent_vec, con_dropout_sent_vec.T) sen_sqr = T.sum(con_dropout_sent_vec**2, axis=1) sen_sqr_left = sen_sqr.dimshuffle(0, 'x') sen_sqr_right = sen_sqr.dimshuffle('x', 0) sen_sim_matrix = sen_sqr_left - 2 * sen_sen + sen_sqr_right sen_sim_matrix = T.exp(-1 * sen_sim_matrix) sen_sim_prob = drop_sent_prob.reshape( (x.shape[0] * x.shape[1], 1)) - drop_sent_prob.flatten() sen_sim_prob = sen_sim_prob**2 pos_sen_flags = sen_flags * y.dimshuffle(0, 'x') sen_sim_flag = T.dot( pos_sen_flags.reshape((x.shape[0] * x.shape[1], 1)), pos_sen_flags.reshape((1, x.shape[0] * x.shape[1]))) sen_sim_cost = T.sum( sen_sim_matrix * sen_sim_prob * sen_sim_flag) / T.sum(sen_sim_flag) # bag level cost drop_bag_cost = T.mean(-y * T.log(drop_doc_prob) * nn.as_floatX(0.6) - (1 - y) * T.log(1 - drop_doc_prob) * nn.as_floatX(0.4)) drop_cost = drop_bag_cost * nn.as_floatX(0.6) + \ drop_sent_cost * nn.as_floatX(0.1) + \ penal_cost * nn.as_floatX(0.5) + \ sen_sim_cost * nn.as_floatX(0.0001) # collect parameters self.params.append(words) self.params += dropout_word_conv.params self.params += dropout_sent_conv.params self.params.append(sen_W) self.params.append(sen_b) grad_updates = nn.sgd_updates_adadelta(self.params, drop_cost, rho, epsilon, norm_lim) # construct the dataset # random the train_x, train_y = nn.shared_dataset(dataset[0]) test_x, test_y = nn.shared_dataset(dataset[1]) test_cpu_y = dataset[1][1] n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size)) n_test_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size)) # construt the model 
index = T.iscalar() train_func = theano.function( [index], [ drop_cost, drop_bag_cost, drop_sent_cost, penal_cost, sen_sim_cost ], updates=grad_updates, givens={ x: train_x[index * batch_size:(index + 1) * batch_size], y: train_y[index * batch_size:(index + 1) * batch_size], sen_flags: train_flags[index * batch_size:(index + 1) * batch_size], sen_k: train_k[index * batch_size:(index + 1) * batch_size] }) test_func = theano.function( [index], doc_preds, givens={ x: test_x[index * batch_size:(index + 1) * batch_size], sen_k: test_k[index * batch_size:(index + 1) * batch_size] }) get_train_sent_prob = theano.function( [index], sent_prob, givens={x: train_x[index * batch_size:(index + 1) * batch_size]}) get_test_sent_prob = theano.function( [index], sent_prob, givens={x: test_x[index * batch_size:(index + 1) * batch_size]}) epoch = 0 best_score = 0 log_file = open("./log/%s.log" % exp_name, 'w') while epoch <= max_iteration: start_time = timeit.default_timer() epoch += 1 costs = [] for mini_index in np.random.permutation(range(n_train_batches)): cost_epoch = train_func(mini_index) costs.append(cost_epoch) set_zero(zero_vec) total_train_cost, train_bag_cost, train_sent_cost, train_penal_cost, train_sim_cost = zip( *costs) print "Iteration %d, total_cost %f bag_cost %f sent_cost %f penal_cost %f sim cost %f\n" % ( epoch, np.mean(total_train_cost), np.mean(train_bag_cost), np.mean(train_sent_cost), np.mean(train_penal_cost), np.mean(train_sim_cost)) if epoch % 1 == 0: test_preds = [] for i in xrange(n_test_batches): test_y_pred = test_func(i) test_preds.append(test_y_pred) test_preds = np.concatenate(test_preds) test_score = 1 - np.mean(np.not_equal(test_cpu_y, test_preds)) precision, recall, beta, support = precision_recall_fscore_support( test_cpu_y, test_preds, pos_label=1) if beta[1] > best_score or epoch % 5 == 0: best_score = beta[1] # save the sentence vectors train_sens = [ get_train_sent_prob(i) for i in range(n_train_batches) ] test_sens = [ 
get_test_sent_prob(i) for i in range(n_test_batches) ] train_sens = np.concatenate(train_sens, axis=0) test_sens = np.concatenate(test_sens, axis=0) out_train_sent_file = "./results/%s_train_sent_%d.vec" % ( exp_name, epoch) out_test_sent_file = "./results/%s_test_sent_%d.vec" % ( exp_name, epoch) with open(out_test_sent_file, 'w') as test_f, open(out_train_sent_file, 'w') as train_f: cPickle.dump(train_sens, train_f) cPickle.dump(test_sens, test_f) print "Get best performace at %d iteration %f" % ( epoch, test_score) log_file.write( "Get best performance at %d iteration %f \n" % (epoch, test_score)) end_time = timeit.default_timer() print "Iteration %d , precision, recall, f1" % epoch, precision, recall, beta log_file.write( "Iteration %d, neg precision %f, pos precision %f, neg recall %f pos recall %f , neg f1 %f, pos f1 %f, total_cost %f bag_cost %f sent_cost %f penal_cost %f\n" % (epoch, precision[0], precision[1], recall[0], recall[1], beta[0], beta[1], np.mean(total_train_cost), np.mean(train_bag_cost), np.mean(train_sent_cost), np.mean(train_penal_cost))) print "Using time %f m" % ((end_time - start_time) / 60.) log_file.write("Uing time %f m\n" % ((end_time - start_time) / 60.)) end_time = timeit.default_timer() print "Iteration %d Using time %f m" % (epoch, (end_time - start_time) / 60.) log_file.write("Uing time %f m\n" % ((end_time - start_time) / 60.)) log_file.flush() log_file.close()
def _form1hot(self, hot_x, idx_x, cutoff_x):
    """Build the symbolic result of setting selected entries of ``hot_x`` to 1.0.

    The entries addressed are ``hot_x[idx_x[:cutoff_x]]``, i.e. only the
    first ``cutoff_x`` indices in ``idx_x`` are used.  ``hot_x`` itself is
    not assigned to; the value produced by ``T.set_subtensor`` is returned.
    """
    active_idx = idx_x[:cutoff_x]
    return T.set_subtensor(hot_x[active_idx], 1.0)
def set_subtensor(subtensor, newval):
    """Delegate to ``T.set_subtensor`` and return its result unchanged.

    ``subtensor`` is the indexed expression to overwrite (e.g. ``x[1:3]``)
    and ``newval`` the value to place there.
    """
    updated = T.set_subtensor(subtensor, newval)
    return updated
def __init__(self, data, hp):
    """Build the VAE graph: parameters, encoder, decoder, cost and sampler.

    data -- forwarded to the base class; ``self.data['n_x']`` is read here
            as the input dimensionality.
    hp   -- hyper-parameter object; ``hp.init_scale`` and ``hp.load_model``
            are read here.
    """
    super(Vae1, self).__init__(self.__class__.__name__, data, hp)

    # Fixed architecture settings.
    self.n_h = 800  # hidden layer width
    self.n_z = 20  # latent dimensionality
    self.n_t = 1  # leading dimension of the a_pxz output tensor
    self.gaussian = False  # selects the tanh / squared-error decoder branch

    self.params = Parameters()
    n_x = self.data['n_x']
    n_h = self.n_h
    n_z = self.n_z
    n_t = self.n_t
    scale = hp.init_scale

    if hp.load_model and os.path.isfile(self.filename):
        # Reuse previously saved parameters when requested and available.
        self.params.load(self.filename)
    else:
        # NOTE(review): the variables created inside this block are later
        # read as attributes of `p` (p.W1, p.b1, ...), so `Parameters`
        # presumably registers them by their local names -- do not rename
        # them without checking `Parameters`.
        with self.params:
            # Encoder weights: x -> h -> h -> h -> (W2: mean, W3: log-sigma).
            W1 = shared_normal((n_x, n_h), scale=scale)
            W11 = shared_normal((n_h, n_h), scale=scale)
            W111 = shared_normal((n_h, n_h), scale=scale)
            W2 = shared_normal((n_h, n_z), scale=scale)
            W3 = shared_normal((n_h, n_z), scale=scale)
            # Decoder weights: z -(W444)-> h -(W44)-> h -(W4)-> h -(W5)-> x.
            W4 = shared_normal((n_h, n_h), scale=scale)
            W44 = shared_normal((n_h, n_h), scale=scale)
            W444 = shared_normal((n_z, n_h), scale=scale)
            W5 = shared_normal((n_h, n_x), scale=scale)
            # Biases matching the weights above.
            b1 = shared_zeros((n_h, ))
            b11 = shared_zeros((n_h, ))
            b111 = shared_zeros((n_h, ))
            b2 = shared_zeros((n_z, ))
            b3 = shared_zeros((n_z, ))
            b4 = shared_zeros((n_h, ))
            b44 = shared_zeros((n_h, ))
            b444 = shared_zeros((n_h, ))
            b5 = shared_zeros((n_x, ))

    def encoder(x, p):
        # Three tanh layers followed by linear heads for mean and log-sigma.
        h_encoder = T.tanh(T.dot(x, p.W1) + p.b1)
        h_encoder2 = T.tanh(T.dot(h_encoder, p.W11) + p.b11)
        h_encoder3 = T.tanh(T.dot(h_encoder2, p.W111) + p.b111)

        mu_encoder = T.dot(h_encoder3, p.W2) + p.b2
        # Half of the linear output is used as log(sigma).
        log_sigma_encoder = 0.5 * (T.dot(h_encoder3, p.W3) + p.b3)

        # Closed-form KL(N(mu, sigma^2) || N(0, 1)), summed over all elements.
        log_qpz = -0.5 * T.sum(1 + 2 * log_sigma_encoder - mu_encoder**2 -
                               T.exp(2 * log_sigma_encoder))

        # Reparameterised sample: z = mu + eps * sigma.
        eps = srnd.normal(mu_encoder.shape, dtype=theano.config.floatX)
        z = mu_encoder + eps * T.exp(log_sigma_encoder)
        return z, log_qpz

    def decoder(z, p, x=None):
        # Returns pxz alone when x is None, otherwise (pxz, log_pxz).
        h_decoder3 = T.tanh(T.dot(z, p.W444) + p.b444)
        h_decoder2 = T.tanh(T.dot(h_decoder3, p.W44) + p.b44)
        h_decoder = T.tanh(T.dot(h_decoder2, p.W4) + p.b4)

        if self.gaussian:
            pxz = T.tanh(T.dot(h_decoder, p.W5) + p.b5)
        else:
            pxz = T.nnet.sigmoid(T.dot(h_decoder, p.W5) + p.b5)

        if not x is None:
            if self.gaussian:
                # Squared-error reconstruction term with log-sigma fixed at 0.
                log_sigma_decoder = 0
                log_pxz = 0.5 * np.log(
                    2 * np.pi) + log_sigma_decoder + 0.5 * T.sum(
                        T.sqr(x - pxz))
            else:
                # Summed binary cross-entropy between reconstruction and x.
                log_pxz = T.nnet.binary_crossentropy(pxz, x).sum()
            return pxz, log_pxz
        else:
            return pxz

    # Training graph: pass self.X through `binomial`, encode, decode, then
    # add the reconstruction term and the KL term to form the cost.
    x = binomial(self.X)
    z, log_qpz = encoder(x, self.params)
    pxz, log_pxz = decoder(z, self.params, x)
    cost = log_pxz + log_qpz

    # Sampling graph: decode self.Z and store the result as slice 0 of an
    # (n_t, rows, cols) tensor.
    s_pxz = decoder(self.Z, self.params)
    a_pxz = T.zeros((self.n_t, s_pxz.shape[0], s_pxz.shape[1]))
    a_pxz = T.set_subtensor(a_pxz[0, :, :], s_pxz)

    self.compile(log_pxz, log_qpz, cost, a_pxz)
# Columns 0 and 1 of the data frame as float32 arrays; column 1 is the
# observed series fed to the likelihood below.
# NOTE(review): T is not used in the visible code -- presumably the time
# axis; confirm before removing.
T = df.values[:, 0].astype(np.float32)
Y = df.values[:, 1].astype(np.float32)
# Number of distinct values in column "X"; used as the length of the state
# vector `mu`.
n_times = len(df["X"].unique())

basic_model = Model()

# How to use subtensor (see the link below):
# http://deeplearning.net/software/theano/library/tensor/basic.html
with basic_model:
    # Priors
    s_mu = HalfNormal('s_mu', sd=100)  # error between states at adjacent time steps
    s_Y = HalfNormal('s_Y', sd=100)  # error between state and observation at each time step
    mu_0 = Normal('mu_0', mu=0, sd=100)  # initial state at t=0
    mu_1 = Normal('mu_1', mu=0, sd=100)  # initial state at t=1

    # Error terms
    e_mu = Normal('e_mu', mu=0, sd=s_mu, shape=n_times - 2)

    # Build the state vector element by element with the second-order
    # recurrence mu[t] = 2 * mu[t-1] - mu[t-2] + e_mu[t-2].
    mu = tt.zeros((n_times))
    mu = tt.set_subtensor(mu[0], mu_0)
    mu = tt.set_subtensor(mu[1], mu_1)
    for i in range(n_times - 2):
        mu = tt.set_subtensor(mu[i + 2], 2 * mu[i + 1] - mu[i] + e_mu[i])

    # likelihood
    Y_obs = Normal('Y_obs', mu=mu, sd=s_Y, observed=Y)

    # Sampling
    trace = sample(1000)
    summary(trace)
def get_elementwise_objective(Qvalues, actions, rewards,
                              is_alive="always",
                              gamma_or_gammas=0.95,
                              crop_last=True,
                              force_qvalues_after_end=True,
                              qvalues_after_end="zeros",
                              consider_reference_constant=True, ):
    """
    Returns squared error between predicted and reference Qvalues according to Q-learning algorithm

        Qreference(state,action) = reward(state,action) + gamma* Q(next_state,next_action)
        loss = mean over (Qvalues - Qreference)**2

    parameters:

        Qvalues [batch,tick,action_id] - predicted qvalues
        actions [batch,tick] - commited actions
        rewards [batch,tick] - immediate rewards for taking actions at given time ticks

        is_alive [batch,tick] - whether given session is still active at given tick. Defaults to always active.
                            Default value of is_alive implies a simplified computation algorithm for Qlearning loss

        gamma_or_gammas - a single value or array[batch,tick](can broadcast dimensions) of delayed reward discounts

        crop_last - if True, zeros-out loss at final tick, if False - computes loss VS Qvalues_after_end

        force_qvalues_after_end - if true, sets reference Qvalues at session end to rewards[end] + qvalues_after_end

        qvalues_after_end [batch,1,n_actions] - symbolic expression for "next state q-values" for last tick
                            used for reference only.
                            Defaults at  T.zeros_like(Qvalues[:,0,None,:])
                            If you wish to simply ignore the last tick, use defaults and crop output's last tick
                            ( qref[:,:-1] )

        consider_reference_constant - whether or not zero-out gradient flow through reference_Qvalues
            (True is highly recommended)

    Returns:

        tensor [batch, tick] of squared errors over Qvalues (using formula above for loss)

    """
    # get reference Qvalues via Q-learning algorithm
    reference_Qvalues = get_reference_Qvalues(Qvalues, actions, rewards,
                                              gamma_or_gammas=gamma_or_gammas,
                                              qvalues_after_end=qvalues_after_end,
                                              )

    if consider_reference_constant:
        # do not pass gradient through reference Q-values (since they DO depend on Q-values by default)
        reference_Qvalues = consider_constant(reference_Qvalues)

    # get predicted qvalues for committed actions (to compare with reference Q-values)
    action_Qvalues = get_action_Qvalues(Qvalues, actions)

    # if agent is always alive, return the simplified loss
    if is_alive == "always":

        # tensor of element-wise squared errors
        elwise_squared_error = squared_error(reference_Qvalues, action_Qvalues)

    else:
        # we are given an is_alive matrix : uint8[batch,tick]

        # if asked to force reference_Q[end_tick+1,a] = 0, do it
        # note: if agent is always alive, this is meaningless
        if force_qvalues_after_end:
            # set future rewards at session end to rewards + qvalues_after_end
            end_ids = get_end_indicator(is_alive, force_end_at_t_max=True).nonzero()

            if qvalues_after_end == "zeros":
                # "set reference Q-values at end action ids to just the immediate rewards"
                reference_Qvalues = T.set_subtensor(reference_Qvalues[end_ids], rewards[end_ids])
            else:
                # Best attainable Q-value after the session end: max over the
                # action axis of qvalues_after_end -> [batch, 1].
                # (Previously this was a 1-D zeros vector that was then indexed
                # with two indices, so this branch could never build a graph
                # and qvalues_after_end was ignored.)
                last_optimal_rewards = T.max(qvalues_after_end, axis=-1)

                # "set reference Q-values at end action ids to the immediate rewards + qvalues after end"
                # end_ids[0] holds the batch index of every session end.
                # NOTE(review): a non-scalar gamma_or_gammas is multiplied
                # as-is here, not indexed by end_ids -- confirm if arrays of
                # gammas are used together with this branch.
                reference_Qvalues = T.set_subtensor(
                    reference_Qvalues[end_ids],
                    rewards[end_ids] +
                    gamma_or_gammas * last_optimal_rewards[end_ids[0], 0]
                )

        # tensor of element-wise squared errors
        elwise_squared_error = squared_error(reference_Qvalues, action_Qvalues)

        # zero-out loss after session ended
        elwise_squared_error = elwise_squared_error * is_alive

    if crop_last:
        # the final tick has no real "next state", so its loss is dropped
        elwise_squared_error = T.set_subtensor(elwise_squared_error[:, -1], 0)

    return elwise_squared_error
def __init__(self, modality_names, modality_sizes, locallayer_sizes,
             fusionlayer_sizes, numpy_rng, batchsize, theano_rng=None):
    """Build a multimodal classifier: per-modality sub-networks + fusion net.

    modality_names    -- one name per modality (used in layer names).
    modality_sizes    -- number of input columns belonging to each modality;
                         the modalities are laid out side by side in `inputs`.
    locallayer_sizes  -- per modality, a sequence of affine layer sizes,
                         e.g. ((100,), (100, 200)).
    fusionlayer_sizes -- sizes of the affine layers applied after the
                         modality outputs are concatenated.
    numpy_rng         -- RNG handed to every AffineLayer.
    batchsize         -- batch size used for the first layer's input shape.
    theano_rng        -- RNG for dropout; defaults to RandomStreams(1).

    The cost is the mean categorical cross-entropy plus 3e-5 times the sum
    of an L2 term over all affine weights and the L11 / L21 terms computed
    on a matrix holding the last local-layer weights of every modality.
    """
    assert (len(modality_names) == len(modality_sizes)
            and len(modality_names) == len(locallayer_sizes))
    self.modality_names = modality_names
    self.modality_sizes = modality_sizes
    self.locallayer_sizes = locallayer_sizes
    self.fusionlayer_sizes = fusionlayer_sizes
    self.numpy_rng = numpy_rng
    self.batchsize = batchsize
    if theano_rng is None:
        theano_rng = RandomStreams(1)
    self.theano_rng = theano_rng

    # shared flag handed to the Dropout layers as `mode_var`
    self.mode = theano.shared(np.int8(0), name='mode')

    # start with empty params list
    self.params = []
    self.l1params = []
    self.l2params = []
    self.l21params = []

    # inputs are the concatenated modalities
    self.inputs = T.fmatrix('inputs')
    # targets vector
    self.targets = T.ivector('targets')

    self.modality_inputs = OrderedDict()
    self.modality_models = OrderedDict()
    self.modality_preconcat_layer_sizes = []
    self.modality_concat_layer_sizes = []

    offset = 0
    # local modality networks
    for modality_name, modality_size, locallayer_size in zip(
            modality_names, modality_sizes, locallayer_sizes):
        # get inputs of modality
        self.modality_inputs[modality_name] = self.inputs[
            :, offset:offset + modality_size]
        offset += modality_size

        # determine size of input to the last layer in the modalities subnetwork
        if len(locallayer_size) == 1:
            self.modality_preconcat_layer_sizes.append(modality_size)
        else:
            self.modality_preconcat_layer_sizes.append(locallayer_size[-2])

        # construct modality model
        layers = []
        # locallayer_sizes = ((100,), (100,200))
        # locallayer_size = (100,)
        for i, size in enumerate(locallayer_size):
            if i == 0:
                layer_input = self.modality_inputs[modality_name]
                layer_input_size = (self.batchsize, modality_size)
            else:
                layer_input = layers[-1]
                layer_input_size = layer_input.outputs_shape
            layers.append(
                AffineLayer(rng=self.numpy_rng,
                            inputs=layer_input,
                            nouts=size,
                            name='{0}_affine_{1}'.format(modality_name, i),
                            inputs_shape=layer_input_size))
            # append params to global list
            self.params.extend(layers[-1].params)
            self.l2params.append(layers[-1].W)
            if i == len(locallayer_size) - 1:
                # only the last local layer takes part in the L1 / L21 terms
                self.l1params.append(layers[-1].W)
                self.l21params.append(layers[-1].W)
                # update total size of concat layer
                self.modality_concat_layer_sizes.append(size)
            layers.append(
                RectifiedTanh(inputs=layers[-1],
                              name='{0}_rectifiedtanh_{1}'.format(
                                  modality_name, i)))

        # create the modality model object
        self.modality_models[modality_name] = Composite(
            layers=layers, name='{0}_composite'.format(modality_name))

    # concatenate modality model outputs
    self.concat_modalities = Concat(list(self.modality_models.values()),
                                    name='concat_layer',
                                    axis=1)

    self.fusion_layers = []
    for i, fusionlayer_size in enumerate(fusionlayer_sizes):
        if i == 0:
            layer_input = self.concat_modalities
        else:
            layer_input = self.fusion_layers[-1]
        self.fusion_layers.append(
            AffineLayer(rng=self.numpy_rng,
                        inputs=layer_input,
                        nouts=fusionlayer_size,
                        name='fusion_affine_{0}'.format(i)))
        # append params to global list
        self.params.extend(self.fusion_layers[-1].params)
        self.l2params.append(self.fusion_layers[-1].W)
        self.fusion_layers.append(
            RectifiedTanh(inputs=self.fusion_layers[-1],
                          name='fusion_rectifiedtanh_{0}'.format(i)))
        self.fusion_layers.append(
            Dropout(inputs=self.fusion_layers[-1],
                    dropout_rate=.3,
                    name='fusion_dropout_{0}'.format(i),
                    theano_rng=self.theano_rng,
                    mode_var=self.mode))

    # classification layer
    self.logits = AffineLayer(rng=self.numpy_rng,
                              inputs=self.fusion_layers[-1],
                              nouts=7,
                              name='logit_affine')
    # append params to global list
    self.params.extend(self.logits.params)
    self.l2params.append(self.logits.W)

    self.softmax = Softmax(inputs=self.logits, name='softmax')
    self.probabilities = self.softmax.outputs
    # keep probabilities strictly inside (0, 1) before the cross-entropy
    self.probabilities = T.clip(self.probabilities, 1e-6, 1 - 1e-6)

    self.l2cost = L2_sqr(
        T.concatenate([x.flatten() for x in self.l2params], axis=0))

    self.concat_matrix = T.zeros(
        (np.sum(self.modality_preconcat_layer_sizes),
         np.sum(self.modality_concat_layer_sizes)))
    row_offset = 0
    col_offset = 0
    for inp_size, outp_size, p in zip(self.modality_preconcat_layer_sizes,
                                      self.modality_concat_layer_sizes,
                                      self.l1params):
        # embed weight matrices in large concatenated matrix
        self.concat_matrix = T.set_subtensor(
            self.concat_matrix[row_offset:row_offset + inp_size,
                               col_offset:col_offset + outp_size], p)
        # Advance to the next diagonal block.  Without this every modality
        # was written to the same top-left corner, overwriting the previous
        # one and leaving the rest of the matrix at zero.
        row_offset += inp_size
        col_offset += outp_size

    self.l1cost = L11(self.concat_matrix)
    self.l21cost = L21(self.concat_matrix)

    self._cost = (T.nnet.categorical_crossentropy(
        self.probabilities, self.targets).mean() + 3e-5 *
                  (self.l2cost + self.l1cost + self.l21cost))

    self.classification = T.argmax(self.probabilities, axis=1)

    self._grads = T.grad(self._cost, self.params)

    self._classify = theano.function([self.inputs], self.classification)
    self._get_probabilities = theano.function([self.inputs],
                                              self.probabilities)
def change_race_prob_div(_i, _change, _rep, _times, _item):
    """Overwrite rows ``_rep[_i]:_rep[_i + 1]`` of ``_change`` with ``_item[_i]``.

    ``_item[_i]`` is repeated ``_times[_i]`` times and shaped into a
    ``(_times[_i], 1)`` column before being written into that row range.
    Returns the updated symbolic tensor.
    """
    # presumably called as a theano.scan step with `_i` as the loop index --
    # verify against the caller
    row_start = _rep[_i]
    row_stop = _rep[_i + 1]
    repeat_count = _times[_i]
    column = T.reshape(T.alloc(_item[_i], repeat_count), (repeat_count, 1))
    return T.set_subtensor(_change[row_start:row_stop], column)
def feedForward(self, miniBatchSize):
    '''
    Convolve the output of 'fromLayer' and store the result in self.output.

    The output of any layer is always flattened, so it is first reshaped
    to 'fromLayer.shape_with_minibatch'.
    CURRENTLY SUPPORTS ONLY 1-D Convolution and 2-D Convolution

    Side effects: self.input, self.input_shape and self.output are
    overwritten; self.input_shape gains the mini-batch size as its first
    entry and, when zero padding is applied, the padded spatial sizes.

    :param miniBatchSize: size prepended to self.input_shape
    :return: None
    '''
    # Un-flatten the incoming activations.
    self.input = self.fromLayer.output.reshape(
        self.fromLayer.shape_with_minibatch)

    # The complete shape of the incoming data also carries the mini-batch
    # size in front.
    full_shape = [miniBatchSize] + list(self.input_shape)
    self.input_shape = full_shape

    # Zero-pad every spatial axis (axes 2.. of a 3-D or 4-D input) on both
    # sides by writing the input into the centre of a zero tensor.
    pad = self.zero_padding
    rank = len(full_shape)
    if pad != 0 and rank in (3, 4):
        spatial_axes = range(2, rank)
        padded_shape = list(full_shape)
        for axis in spatial_axes:
            padded_shape[axis] = full_shape[axis] + 2 * pad
        centre = (slice(None), slice(None)) + tuple(
            slice(pad, full_shape[axis] + pad) for axis in spatial_axes)
        canvas = T.zeros(tuple(padded_shape), dtype=theano.config.floatX)
        self.input = T.set_subtensor(canvas[centre], self.input)
        self.input_shape = tuple(padded_shape)

    conv_out = conv.conv2d(input=self.input,
                           filters=self.w,
                           filter_shape=self.filter_shape,
                           image_shape=self.input_shape,
                           border_mode="valid",
                           subsample=self.stride_length)

    # Broadcast the per-filter bias over the batch and spatial axes.
    if len(self.input_shape) == 4:
        bias_pattern = ('x', 0, 'x', 'x')
    else:
        bias_pattern = ('x', 0, 'x')
    self.output = None
    self.output = conv_out + self.b.dimshuffle(*bias_pattern)

    # Flatten again for the next layer.
    self.output = self.output.reshape(
        self.toLayer.shape_minibatch_flattened)
def train_conv_net(datasets,
                   U,
                   img_w=300,
                   filter_hs=[3, 4, 5],
                   hidden_units=[100, 2],
                   dropout_rate=[0.5],
                   shuffle_batch=True,
                   n_epochs=25,
                   batch_size=10,
                   lr_decay=0.95,
                   conv_non_linear="relu",
                   activations=[Iden],
                   sqr_norm_lim=9,
                   non_static=True):
    """
    Train a simple conv net and return the test performance (1 - error)
    measured at the epoch with the best validation performance.

    datasets = (train, test) 2-D arrays; the last column of each row is the
               label and the first img_h columns are word indices
    U = initial word-vector matrix (row 0 is reset to zero after each update)
    img_h = sentence length (padded where necessary)
    img_w = word vector length (300 for word2vec)
    filter_hs = filter window sizes
    hidden_units = [x,y] x is the number of feature maps (per filter window), and y is the penultimate layer
    sqr_norm_lim = s^2 in the paper
    lr_decay = adadelta decay parameter

    The mutable default arguments are kept for interface compatibility but
    are never modified: hidden_units is copied before its first entry is
    rewritten.
    """
    # Work on a private copy: the first entry is overwritten below and must
    # not leak into the caller's list or into the shared default argument
    # (which would change feature_maps on every subsequent call).
    hidden_units = list(hidden_units)

    rng = np.random.RandomState(3435)
    img_h = len(datasets[0][0]) - 1
    filter_w = img_w
    feature_maps = hidden_units[0]
    filter_shapes = []
    pool_sizes = []
    for filter_h in filter_hs:
        filter_shapes.append((feature_maps, 1, filter_h, filter_w))
        # pool over the whole feature map produced by this filter height
        pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1))
    parameters = [("image shape", img_h, img_w),
                  ("filter shape", filter_shapes),
                  ("hidden_units", hidden_units), ("dropout", dropout_rate),
                  ("batch_size", batch_size), ("non_static", non_static),
                  ("learn_decay", lr_decay),
                  ("conv_non_linear", conv_non_linear),
                  ("non_static", non_static), ("sqr_norm_lim", sqr_norm_lim),
                  ("shuffle_batch", shuffle_batch)]
    print(parameters)

    # define model architecture
    index = T.lscalar()
    x = T.matrix('x')
    y = T.ivector('y')
    Words = theano.shared(value=U, name="Words")
    # set_zero overwrites row 0 of Words with the given vector; it is called
    # with zeros after every update to keep that row at zero.
    zero_vec_tensor = T.vector()
    zero_vec = np.zeros(img_w)
    set_zero = theano.function([zero_vec_tensor],
                               updates=[
                                   (Words,
                                    T.set_subtensor(Words[0, :],
                                                    zero_vec_tensor))
                               ],
                               allow_input_downcast=True)
    # look up word vectors and shape them as (batch, 1, sentence, vector)
    layer0_input = Words[T.cast(x.flatten(), dtype="int32")].reshape(
        (x.shape[0], 1, x.shape[1], Words.shape[1]))
    conv_layers = []
    layer1_inputs = []
    for filter_shape, pool_size in zip(filter_shapes, pool_sizes):
        conv_layer = LeNetConvPoolLayer(rng,
                                        input=layer0_input,
                                        image_shape=(batch_size, 1, img_h,
                                                     img_w),
                                        filter_shape=filter_shape,
                                        poolsize=pool_size,
                                        non_linear=conv_non_linear)
        layer1_input = conv_layer.output.flatten(2)
        conv_layers.append(conv_layer)
        layer1_inputs.append(layer1_input)
    layer1_input = T.concatenate(layer1_inputs, 1)
    # the classifier input is the concatenation of all filter outputs
    hidden_units[0] = feature_maps * len(filter_hs)
    classifier = MLPDropout(rng,
                            input=layer1_input,
                            layer_sizes=hidden_units,
                            activations=activations,
                            dropout_rates=dropout_rate)

    # define parameters of the model and update functions using adadelta
    # (copy first so the `+=` below does not extend classifier.params in place)
    params = list(classifier.params)
    for conv_layer in conv_layers:
        params += conv_layer.params
    if non_static:
        # if word vectors are allowed to change, add them as model parameters
        params += [Words]
    cost = classifier.negative_log_likelihood(y)
    dropout_cost = classifier.dropout_negative_log_likelihood(y)
    grad_updates = sgd_updates_adadelta(params, dropout_cost, lr_decay, 1e-6,
                                        sqr_norm_lim)

    # shuffle dataset and assign to mini batches. if dataset size is not a
    # multiple of mini batches, replicate extra data (at random)
    np.random.seed(3435)
    if datasets[0].shape[0] % batch_size > 0:
        extra_data_num = batch_size - datasets[0].shape[0] % batch_size
        train_set = np.random.permutation(datasets[0])
        extra_data = train_set[:extra_data_num]
        new_data = np.append(datasets[0], extra_data, axis=0)
    else:
        new_data = datasets[0]
    new_data = np.random.permutation(new_data)
    # new_data is a whole number of batches at this point; floor division
    # keeps the count an int under true division as well.
    n_batches = new_data.shape[0] // batch_size
    n_train_batches = int(np.round(n_batches * 0.9))
    # divide train set into train/val sets
    test_set_x = datasets[1][:, :img_h]
    test_set_y = np.asarray(datasets[1][:, -1], "int32")
    train_set = new_data[:n_train_batches * batch_size, :]
    val_set = new_data[n_train_batches * batch_size:, :]
    train_set_x, train_set_y = shared_dataset(
        (train_set[:, :img_h], train_set[:, -1]))
    val_set_x, val_set_y = shared_dataset(
        (val_set[:, :img_h], val_set[:, -1]))
    n_val_batches = n_batches - n_train_batches
    val_model = theano.function(
        [index],
        classifier.errors(y),
        givens={
            x: val_set_x[index * batch_size:(index + 1) * batch_size],
            y: val_set_y[index * batch_size:(index + 1) * batch_size]
        },
        allow_input_downcast=True)

    # compile theano functions to get train/val/test errors
    # (despite its name, test_model measures the error on training batches)
    test_model = theano.function(
        [index],
        classifier.errors(y),
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        },
        allow_input_downcast=True)
    train_model = theano.function(
        [index],
        cost,
        updates=grad_updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        },
        allow_input_downcast=False)

    # separate graph that evaluates the whole test set in a single call
    test_pred_layers = []
    test_size = test_set_x.shape[0]
    test_layer0_input = Words[T.cast(x.flatten(), dtype="int32")].reshape(
        (test_size, 1, img_h, Words.shape[1]))
    for conv_layer in conv_layers:
        test_layer0_output = conv_layer.predict(test_layer0_input, test_size)
        test_pred_layers.append(test_layer0_output.flatten(2))
    test_layer1_input = T.concatenate(test_pred_layers, 1)
    test_y_pred = classifier.predict(test_layer1_input)
    test_error = T.mean(T.neq(test_y_pred, y))
    test_model_all = theano.function([x, y],
                                     test_error,
                                     allow_input_downcast=True)

    # start training over mini-batches
    print('... training')
    epoch = 0
    best_val_perf = 0
    val_perf = 0
    test_perf = 0
    cost_epoch = 0
    while (epoch < n_epochs):
        start_time = time.time()
        epoch = epoch + 1
        if shuffle_batch:
            for minibatch_index in np.random.permutation(
                    range(n_train_batches)):
                cost_epoch = train_model(minibatch_index)
                set_zero(zero_vec)
        else:
            for minibatch_index in xrange(n_train_batches):
                cost_epoch = train_model(minibatch_index)
                set_zero(zero_vec)
        train_losses = [test_model(i) for i in xrange(n_train_batches)]
        train_perf = 1 - np.mean(train_losses)
        val_losses = [val_model(i) for i in xrange(n_val_batches)]
        val_perf = 1 - np.mean(val_losses)
        print(
            'epoch: %i, training time: %.2f secs, train perf: %.2f %%, val perf: %.2f %%'
            % (epoch, time.time() - start_time, train_perf * 100.,
               val_perf * 100.))
        # only re-evaluate on the test set when validation does not get worse
        if val_perf >= best_val_perf:
            best_val_perf = val_perf
            test_loss = test_model_all(test_set_x, test_set_y)
            test_perf = 1 - test_loss
    return test_perf