def forward_prop_step(x_t, y_t, s_t_prev):
    # Word embedding layer
    x_e = E_x[:, x_t]
    y_e = E_y[:, y_t]

    def GRU(i, U, W, b, x_0, s_prev):
        pb = printing.Print('b')
        b1 = T.specify_shape((self.conversion_ones * b[i * 3, :]).T, T.shape(x_0))
        b2 = T.specify_shape((self.conversion_ones * b[i * 3 + 1, :]).T, T.shape(x_0))
        b3 = T.specify_shape((self.conversion_ones * b[i * 3 + 2, :]).T, T.shape(x_0))
        z = T.nnet.hard_sigmoid(U[i * 3 + 0].dot(x_0) + W[i * 3 + 0].dot(s_prev) + b1)
        r = T.nnet.hard_sigmoid(U[i * 3 + 1].dot(x_0) + W[i * 3 + 1].dot(s_prev) + b2)
        c = T.tanh(U[i * 3 + 2].dot(x_0) + W[i * 3 + 2].dot(s_prev * r) + b3)
        return (T.ones_like(z) - z) * c + z * s_prev

    p_o = printing.Print('juju')
    # GRU Layer 1
    s[0] = GRU(0, U, W, b, x_e, s_t_prev[0])
    # GRU Layer 2
    s[1] = GRU(1, U, W, b, y_e, s_t_prev[1])

    c_matrix = (self.conversion_ones * c).T
    juju = V.dot(s) + c_matrix
    ot = printing.Print("o_t")
    o_t = T.nnet.softmax(juju.T).T
    return [o_t, s]
def forward_prop_step(x_t, s_t1_prev, s_t2_prev, s_t3_prev):
    p_o = printing.Print('juju')
    # Word embedding layer
    x_e = x_t.dot(E).T + v

    def GRU(i, U, W, b, x_0, s_prev):
        b1 = b[i * 3, :]
        b2 = b[i * 3 + 1, :]
        b3 = b[i * 3 + 2, :]
        z = T.nnet.hard_sigmoid(U[i * 3 + 0].dot(x_0) + W[i * 3 + 0].dot(s_prev) + b1)
        r = T.nnet.hard_sigmoid(U[i * 3 + 1].dot(x_0) + W[i * 3 + 1].dot(s_prev) + b2)
        c = T.tanh(U[i * 3 + 2].dot(x_0) + W[i * 3 + 2].dot(s_prev * r) + b3)
        return ((T.ones_like(z) - z) * c + z * s_prev).astype(theano.config.floatX)

    p_o = printing.Print('juju')
    s = [[], [], []]
    # GRU Layer 1
    s[0] = GRU(0, U, W, b, x_e, s_t1_prev)
    # GRU Layer 2
    s[1] = GRU(1, U, W, b, s[0], s_t2_prev)
    # GRU Layer 3
    s[2] = GRU(2, U, W, b, s[1], s_t3_prev)

    # Final output calculation
    o_t = (V.dot(s[2]) + c)[0]
    return [o_t, s[0], s[1], s[2]]
def _ot_matching(q1_x, q1_mu, xt_x, xt_mu, radius):
    """
    Given two measures q1 and xt represented by locations/weights arrays,
    outputs an optimal transport fidelity term and the transport plan.
    """
    # The Sinkhorn algorithm takes as input three Theano variables :
    c = _squared_distances(q1_x, xt_x)  # Wasserstein cost function
    mu = q1_mu
    nu = xt_mu

    # Parameters of the Sinkhorn algorithm.
    epsilon = (.02)**2           # regularization parameter
    rho = (.5)**2                # unbalanced transport (See PhD Th. of Lenaic Chizat)
    niter = 10000                # max niter in the sinkhorn loop
    tau = -.8                    # nesterov-like acceleration
    lam = rho / (rho + epsilon)  # Update exponent

    # Elementary operations .....................................................
    def ave(u, u1):
        "Barycenter subroutine, used by kinetic acceleration through extrapolation."
        return tau * u + (1 - tau) * u1

    def M(u, v):
        "$M_{ij} = (-c_{ij} + u_i + v_j) / \epsilon$"
        return (-c + u.dimshuffle(0, 'x') + v.dimshuffle('x', 0)) / epsilon

    lse = lambda A: T.log(T.sum(T.exp(A), axis=1) + 1e-6)  # slight modif to prevent NaN

    # Actual Sinkhorn loop ......................................................
    # Iteration step :
    def sinkhorn_step(u, v, foo):
        u1 = u  # useful to check the update
        u = ave(u, lam * (epsilon * (T.log(mu) - lse(M(u, v))) + u))
        v = ave(v, lam * (epsilon * (T.log(nu) - lse(M(u, v).T)) + v))
        err = T.sum(abs(u - u1))
        return (u, v, err), theano.scan_module.until(err < 1e-4)  # "break" the loop if error < tol

    # Scan = "For loop" :
    err0 = np.arange(1, dtype=config.floatX)[0]
    result, updates = theano.scan(
        fn=sinkhorn_step,                            # Iterated routine
        outputs_info=[(0. * mu), (0. * nu), err0],   # Starting estimates
        n_steps=niter)                               # Number of iterations

    U, V = result[0][-1], result[1][-1]  # We only keep the final dual variables
    Gamma = T.exp(M(U, V))               # Eventual transport plan g = diag(a)*K*diag(b)
    cost = T.sum(Gamma * c)              # Simplistic cost, chosen for readability in this tutorial

    if False:
        print_err_shape = printing.Print('error : ', attrs=['shape'])
        errors = print_err_shape(result[2])
        print_err = printing.Print('error : ')
        err_fin = print_err(errors[-1])
        # hack to prevent the pruning of the error-printing node...
        cost += .00000001 * err_fin

    return [cost, Gamma]
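A minimal compilation sketch (not part of the original source) showing how a cost like the one above is typically wrapped into a callable; `_squared_distances` and the unused `radius` argument are assumed to be defined by the surrounding module.

q1_x, xt_x = T.matrix('q1_x'), T.matrix('xt_x')
q1_mu, xt_mu = T.vector('q1_mu'), T.vector('xt_mu')
cost, transport_plan = _ot_matching(q1_x, q1_mu, xt_x, xt_mu, radius=None)
sinkhorn_fidelity = theano.function([q1_x, q1_mu, xt_x, xt_mu],
                                    [cost, transport_plan])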
def localConv(doc, dsn, swnv, dww, sww):
    # t = T.arange(docSentenceSize)
    # ccc = docs[t.nonzero()]
    t = T.arange(dsn).nonzero()
    t = (T.arange(10000) < dsn).nonzero()
    # print t
    # t = T.arange(dsn)
    docSub = doc[t]
    p = printing.Print('docSub')
    docSub = p(docSub)
    swnvSub = swnv[t]

    def sentenceConv(sen, wn, sww):
        t = (T.arange(10000) < wn).nonzero()
        senSub = sen[t]
        convRes = theano.tensor.signal.conv.conv2d(senSub, sww)
        sentence_pool = theano.tensor.signal.downsample.max_pool_2d(
            convRes, (100000, 1)).flatten(1)
        return sentence_pool

    sentenceLayer, _ = theano.scan(
        fn=lambda sen, wn, sww: sentenceConv(sen, wn, sww),
        non_sequences=[sww],
        sequences=[docSub, swnvSub])

    convRes = theano.tensor.signal.conv.conv2d(sentenceLayer, dww)
    sentence_pool = theano.tensor.signal.downsample.max_pool_2d(
        convRes, (100000, 1)).flatten(1)
    return sentence_pool
def get_cost_updates(self, corruption_level, learning_rate):
    """ This function computes the cost and the updates for one training
    step of the dA """

    tilde_x = self.get_corrupted_input(self.x, corruption_level)
    y = self.get_hidden_values2(tilde_x)
    z = self.get_reconstructed_input(y)
    # note : we sum over the size of a datapoint; if we are using
    #        minibatches, L will be a vector, with one entry per
    #        example in minibatch
    diff = (self.x - z) * (self.x - z)
    # print diff.eval()
    L = T.sum(diff, axis=1)
    # printing.Print is an Op: it has to be applied to a variable and the
    # returned variable used downstream for anything to be printed.
    L = printing.Print('L')(L)
    # L = - T.sum(self.x * T.log(z) + (1 - self.x) * T.log(1 - z), axis=1)
    # note : L is now a vector, where each element is the
    #        cross-entropy cost of the reconstruction of the
    #        corresponding example of the minibatch. We need to
    #        compute the average of all these to get the cost of
    #        the minibatch
    cost = T.mean(L)

    # compute the gradients of the cost of the `dA` with respect
    # to its parameters
    gparams = T.grad(cost, self.params)
    # generate the list of updates
    updates = [(param, param - learning_rate * gparam)
               for param, gparam in zip(self.params, gparams)]

    return (cost, updates)
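A self-contained sketch (assumed names, not part of the dA class above) of the side effect produced by the Print op attached to `L`: the per-example loss is dumped to stdout every time the compiled function runs.

import numpy as np
import theano
import theano.tensor as T
from theano import printing

x = T.matrix('x')
z = T.nnet.sigmoid(x)                       # stand-in for a reconstruction
L = T.sum((x - z) ** 2, axis=1)             # per-example squared error
L = printing.Print('per-example loss')(L)   # printed as a side effect
f = theano.function([x], T.mean(L))
f(np.random.rand(4, 3).astype(theano.config.floatX))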
def forward_prop_step(x_t, s_t1_prev, s_t2_prev, s_t3_prev):
    p_o = printing.Print('juju')
    # Word embedding layer
    x_t = T.reshape(x_t, (T.shape(x_t)[0], 1))
    x_e = x_t.dot(E).T + coversion_ones.v

    def GRU(i, U, W, b, x_0, s_prev):
        coversion_ones = T.ones((1, self.mini_batch_dim))
        b1 = T.reshape(b[i * 3, :], (T.shape(b)[1], 1)).dot(coversion_ones)
        b2 = T.reshape(b[i * 3 + 1, :], (T.shape(b)[1], 1)).dot(coversion_ones)
        b3 = T.reshape(b[i * 3 + 2, :], (T.shape(b)[1], 1)).dot(coversion_ones)
        z = T.nnet.hard_sigmoid(U[i * 3 + 0].dot(x_0) + W[i * 3 + 0].dot(s_prev) + b1)
        r = T.nnet.hard_sigmoid(U[i * 3 + 1].dot(x_0) + W[i * 3 + 1].dot(s_prev) + b2)
        c = T.tanh(U[i * 3 + 2].dot(x_0) + W[i * 3 + 2].dot(s_prev * r) + b3)
        return ((T.ones_like(z) - z) * c + z * s_prev).astype(theano.config.floatX)

    p_o = printing.Print('juju')
    s = [[], [], []]
    # GRU Layer 1
    s[0] = GRU(0, U, W, b, x_e, s_t1_prev)
    # GRU Layer 2
    s[1] = GRU(1, U, W, b, s[0], s_t2_prev)
    # GRU Layer 3
    s[2] = GRU(2, U, W, b, s[1], s_t3_prev)

    # Final output calculation
    c_matrix = (coversion_ones.dot(c)).T
    o_t = ((V).dot(s[2]) + c_matrix)[0]
    return [o_t, s[0], s[1], s[2]]
def GRU(i, U, W, b, x_0, s_prev):
    pb = printing.Print('b')
    b1 = T.specify_shape((self.conversion_ones * b[i * 3, :]).T, T.shape(x_0))
    b2 = T.specify_shape((self.conversion_ones * b[i * 3 + 1, :]).T, T.shape(x_0))
    b3 = T.specify_shape((self.conversion_ones * b[i * 3 + 2, :]).T, T.shape(x_0))
    z = T.nnet.hard_sigmoid(U[i * 3 + 0].dot(x_0) + W[i * 3 + 0].dot(s_prev) + b1)
    r = T.nnet.hard_sigmoid(U[i * 3 + 1].dot(x_0) + W[i * 3 + 1].dot(s_prev) + b2)
    c = T.tanh(U[i * 3 + 2].dot(x_0) + W[i * 3 + 2].dot(s_prev * r) + b3)
    return (T.ones_like(z) - z) * c + z * s_prev
def masked_categorical_crossentropy(output, target, mask, from_logits=False):
    if from_logits:
        output = T.nnet.softmax(output)
    else:
        # scale preds so that the class probas of each sample sum to 1
        output /= output.sum(axis=-1, keepdims=True)
    # avoid numerical instability with _EPSILON clipping
    output = T.clip(output, _EPSILON, 1.0 - _EPSILON)
    objective = -T.sum(target * T.log(output), axis=output.ndim - 1)
    objective = T.set_subtensor(
        objective[T.or_(T.eq(target[:, :, mask], 1),
                        T.eq(target[:, :, 0], 1)).nonzero()], 0.0)
    return printing.Print('Objective', global_fn=_debug_fn)(objective)
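The snippet above passes a `_debug_fn` callback defined elsewhere; a minimal sketch of such a callback (hypothetical, names are illustrative) is given below. Theano calls `global_fn` with the Print op instance and the numeric value of the wrapped variable.

def _debug_fn(op, value):
    # op.message is 'Objective'; value is the evaluated objective array
    print('%s: shape=%s mean=%.6f' % (op.message, value.shape, value.mean()))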
def put_hook(variable, hook_fn):
    """Put a hook on a Theano variable.

    Ensures that the hook function is executed every time the value of the
    Theano variable is available.

    Parameters
    ----------
    variable : :class:`~tensor.TensorVariable`
        The variable to put a hook on.
    hook_fn : function
        The hook function. Should take a single argument: the variable's
        value.

    """
    return printing.Print(global_fn=lambda _, x: hook_fn(x))(variable)
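A usage sketch (assumption, not from the original module): hook an intermediate variable so its L2 norm is reported whenever the compiled function evaluates it.

import numpy as np
import theano
import theano.tensor as T

def report_norm(value):
    print('norm of h: %f' % np.linalg.norm(value))

x = T.matrix('x')
h = T.tanh(x)
h = put_hook(h, report_norm)  # the hook fires as a side effect of evaluation
f = theano.function([x], h.sum())
f(np.random.randn(3, 4).astype(theano.config.floatX))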
def print_tensor(message, variable):
    """A small helper function that makes printing Theano variables a little
    bit easier.

    :type message: str
    :param message: message, typically the variable name

    :type variable: TensorVariable
    :param variable: any tensor variable to be printed

    :rtype: TensorVariable
    :returns: a tensor variable to be used further down the graph in place
              of ``variable``
    """

    print_op = printing.Print(message)
    return print_op(variable)
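A usage sketch (assumed variable names): wrap an intermediate tensor so its value is printed each time the compiled function is called, while the rest of the graph is unchanged.

import numpy as np
import theano
import theano.tensor as T

v = T.vector('v')
squared = print_tensor('squared values', v ** 2)
f = theano.function([v], squared.sum())
f(np.asarray([1.0, 2.0, 3.0], dtype=theano.config.floatX))  # prints [1. 4. 9.], returns 14.0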
def get_train_fn(self):
    X = T.dmatrix("X")
    Y = self.get_output_values(self.get_hidden_values(X))
    self.f1 = theano.function([X], Y)

    advantage = T.dmatrix("advantage")
    loss = -T.sum(advantage * Y)
    # updates = self.get_cost_updates(X, loss)
    grad_params = T.grad(loss, self.params)
    param_printing_op = printing.Print("Param")
    param_printing = param_printing_op(grad_params[0])
    updates = [(param, param - learning_rate * grad_param)
               for param, grad_param in zip(self.params, grad_params)]
    # updates = self.RMSprop(loss, self.params)
    self.f2 = theano.function([X, advantage], [param_printing], updates=updates)
def forward_prop_step(x_t, s_t1_prev, s_t2_prev, s_t3_prev):
    # Word embedding layer
    x_e = E[:, x_t]

    # def GRU(i, U, W, b, x_0, s_prev):
    #     z = T.nnet.hard_sigmoid(x_0.dot(U[i * 3 + 0].T) + s_prev.dot(W[i * 3 + 0].T) + b[i * 3 + 0])
    #     r = T.nnet.hard_sigmoid(x_0.dot(U[i * 3 + 1].T) + s_prev.dot(W[i * 3 + 1].T) + b[i * 3 + 1])
    #     c = T.tanh(x_0.dot(U[i * 3 + 2].T) + (s_prev * r).dot(W[i * 3 + 2].T) + b[i * 3 + 2])
    #     return (T.ones_like(z) - z) * c + z * s_prev

    def GRU(i, U, W, b, x_0, s_prev):
        b1 = T.specify_shape((coversion_ones * b[i * 3, :]).T, T.shape(x_0))
        b2 = T.specify_shape((coversion_ones * b[i * 3 + 1, :]).T, T.shape(x_0))
        b3 = T.specify_shape((coversion_ones * b[i * 3 + 2, :]).T, T.shape(x_0))
        z = T.nnet.hard_sigmoid(U[i * 3 + 0].dot(x_0) + W[i * 3 + 0].dot(s_prev) + b1)
        r = T.nnet.hard_sigmoid(U[i * 3 + 1].dot(x_0) + W[i * 3 + 1].dot(s_prev) + b2)
        c = T.tanh(U[i * 3 + 2].dot(x_0) + W[i * 3 + 2].dot(s_prev * r) + b3)
        return (T.ones_like(z) - z) * c + z * s_prev

    p_o = printing.Print('juju')
    s = [s_t1_prev, s_t2_prev, s_t3_prev]
    # GRU Layer 1
    s[0] = GRU(0, U, W, b, x_e, s_t1_prev)
    # GRU Layer 2
    s[1] = GRU(1, U, W, b, s[0], s_t2_prev)
    # GRU Layer 3
    s[2] = GRU(2, U, W, b, s[1], s_t3_prev)

    # Final output calculation
    # Theano's softmax returns a matrix with one row, we only need the row
    c_matrix = (coversion_ones * c).T
    juju = V.dot(s[2]) + c_matrix
    o_t = T.nnet.softmax(juju.T).T
    return [o_t, s[0], s[1], s[2]]
def model_layers(self, x, *args):
    """
    Defines the model for all layers.

    :param x: input
    :param args: list of inputs (from the previous time step) for the LSTMs
    """
    args = list(args)
    num_arg = 0
    outputs = []
    num_passage = args[-1]
    for k in range(len(self.layers)):
        debug_printing = pr.Print('Progress', global_fn=self.print_callback)(num_passage + 1)
        layer = self.layers[k]
        if layer['type'] == 'simple':
            # for a simple layer, use the simple model
            x = self.model_simple_layer(x, k)
        elif layer['type'] == 'lstm':
            # otherwise use the LSTM model
            h, c = self.model_lstm_layer(x, args[num_arg], args[num_arg + 1], k)
            x = h  # the output is h
            num_arg += 2
            outputs.append(h)  # new h
            outputs.append(c)  # new c
    # the outputs are the final output x plus the intermediate values to feed
    # back into the network at the next time step
    outputs = [x] + outputs
    outputs.append(debug_printing)
    return tuple(outputs)  # x (output), vals_t, ...
def build(self, dropout, char_dim, char_lstm_dim, char_bidirect, word_dim, word_lstm_dim, word_bidirect, lr_method, pre_emb, crf, cap_dim, training=True, **kwargs ): """ Build the network. """ # Training parameters n_words = len(self.id_to_word) n_chars = len(self.id_to_char) n_tags = len(self.id_to_tag) # Number of capitalization features if cap_dim: n_cap = 4 # Network variables is_train = T.iscalar('is_train') word_ids = T.ivector(name='word_ids') char_for_ids = T.imatrix(name='char_for_ids') char_rev_ids = T.imatrix(name='char_rev_ids') char_pos_ids = T.ivector(name='char_pos_ids') tag_ids = T.ivector(name='tag_ids') if cap_dim: cap_ids = T.ivector(name='cap_ids') # Sentence length s_len = (word_ids if word_dim else char_pos_ids).shape[0] # Final input (all word features) input_dim = 0 inputs = [] # # Word inputs # if word_dim: input_dim += word_dim word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer') word_input = word_layer.link(word_ids) inputs.append(word_input) # Initialize with pretrained embeddings if pre_emb and training: # Randomly generates new weights new_weights = word_layer.embeddings.get_value() print 'Loading pretrained embeddings from %s...' % pre_emb # Here is where we will substitute pyemblib read function. # Syntax: get_embedding_dict(emb_path, emb_format, first_n, vocab) emb_format = pyemblib2.Format.Word2Vec pretrained = get_embedding_dict(pre_emb, emb_format, 0, None) ''' pretrained = {} emb_invalid = 0 for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')): line = line.rstrip().split() if len(line) == word_dim + 1: pretrained[line[0]] = np.array( [float(x) for x in line[1:]] ).astype(np.float32) else: emb_invalid += 1 if emb_invalid > 0: print 'WARNING: %i invalid lines' % emb_invalid ''' c_found = 0 c_lower = 0 c_zeros = 0 # Lookup table initialization for i in xrange(n_words): word = self.id_to_word[i] if word in pretrained: new_weights[i] = pretrained[word] c_found += 1 elif word.lower() in pretrained: new_weights[i] = pretrained[word.lower()] c_lower += 1 elif re.sub('\d', '0', word.lower()) in pretrained: new_weights[i] = pretrained[ re.sub('\d', '0', word.lower()) ] c_zeros += 1 # This is it, this is what needs to be printed. # "word_layer.embeddings" is a "theano.shared" object word_layer.embeddings.set_value(new_weights) print 'Loaded %i pretrained embeddings.' % len(pretrained) print ('%i / %i (%.4f%%) words have been initialized with ' 'pretrained embeddings.') % ( c_found + c_lower + c_zeros, n_words, 100. 
* (c_found + c_lower + c_zeros) / n_words ) print ('%i found directly, %i after lowercasing, ' '%i after lowercasing + zero.') % ( c_found, c_lower, c_zeros ) # # Chars inputs # if char_dim: input_dim += char_lstm_dim char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer') char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=True, name='char_lstm_for') char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=True, name='char_lstm_rev') char_lstm_for.link(char_layer.link(char_for_ids)) char_lstm_rev.link(char_layer.link(char_rev_ids)) char_for_output = char_lstm_for.h.dimshuffle((1, 0, 2))[ T.arange(s_len), char_pos_ids ] char_rev_output = char_lstm_rev.h.dimshuffle((1, 0, 2))[ T.arange(s_len), char_pos_ids ] inputs.append(char_for_output) if char_bidirect: inputs.append(char_rev_output) input_dim += char_lstm_dim # # Capitalization feature # if cap_dim: input_dim += cap_dim cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer') inputs.append(cap_layer.link(cap_ids)) # Prepare final input inputs = T.concatenate(inputs, axis=1) if len(inputs) != 1 else inputs[0] # # Dropout on final input # if dropout: dropout_layer = DropoutLayer(p=dropout) input_train = dropout_layer.link(inputs) input_test = (1 - dropout) * inputs inputs = T.switch(T.neq(is_train, 0), input_train, input_test) # LSTM for words word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_for') word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_rev') word_lstm_for.link(inputs) word_lstm_rev.link(inputs[::-1, :]) word_for_output = word_lstm_for.h word_rev_output = word_lstm_rev.h[::-1, :] if word_bidirect: final_output = T.concatenate( [word_for_output, word_rev_output], axis=1 ) tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim, name='tanh_layer', activation='tanh') final_output = tanh_layer.link(final_output) else: final_output = word_for_output # Sentence to Named Entity tags - Score final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer', activation=(None if crf else 'softmax')) tags_scores = final_layer.link(final_output) # No CRF if not crf: cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean() # CRF else: transitions = shared((n_tags + 2, n_tags + 2), 'transitions') small = -1000 b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32) e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32) observations = T.concatenate( [tags_scores, small * T.ones((s_len, 2))], axis=1 ) observations = T.concatenate( [b_s, observations, e_s], axis=0 ) # Score from tags real_path_score = tags_scores[T.arange(s_len), tag_ids].sum() # Score from transitions b_id = theano.shared(value=np.array([n_tags], dtype=np.int32)) e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32)) padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0) real_path_score += transitions[ padded_tags_ids[T.arange(s_len + 1)], padded_tags_ids[T.arange(s_len + 1) + 1] ].sum() all_paths_scores = forward(observations, transitions) cost = - (real_path_score - all_paths_scores) # Network parameters params = [] if word_dim: self.add_component(word_layer) # Supposedly the commented-out line below will stop # the model from updating the pretrained emeddings. 
# params.extend(word_layer.params) if char_dim: self.add_component(char_layer) self.add_component(char_lstm_for) params.extend(char_layer.params) params.extend(char_lstm_for.params) if char_bidirect: self.add_component(char_lstm_rev) params.extend(char_lstm_rev.params) self.add_component(word_lstm_for) params.extend(word_lstm_for.params) if word_bidirect: self.add_component(word_lstm_rev) params.extend(word_lstm_rev.params) if cap_dim: self.add_component(cap_layer) params.extend(cap_layer.params) self.add_component(final_layer) params.extend(final_layer.params) if crf: self.add_component(transitions) params.append(transitions) if word_bidirect: self.add_component(tanh_layer) params.extend(tanh_layer.params) # Prepare train and eval inputs eval_inputs = [] if word_dim: eval_inputs.append(word_ids) if char_dim: eval_inputs.append(char_for_ids) if char_bidirect: eval_inputs.append(char_rev_ids) eval_inputs.append(char_pos_ids) if cap_dim: eval_inputs.append(cap_ids) train_inputs = eval_inputs + [tag_ids] # Parse optimization method parameters if "-" in lr_method: lr_method_name = lr_method[:lr_method.find('-')] lr_method_parameters = {} for x in lr_method[lr_method.find('-') + 1:].split('-'): split = x.split('_') assert len(split) == 2 lr_method_parameters[split[0]] = float(split[1]) else: lr_method_name = lr_method lr_method_parameters = {} # Compile training function print 'Compiling...' if training: # "params" supposedly contains the pretrained embedding matrix that we are updating. # Find the "get_updates" function and figure out what it does. updates = Optimization(clip=5.0).get_updates(lr_method_name, cost, params, **lr_method_parameters) f_train = theano.function( inputs=train_inputs, outputs=cost, updates=updates, givens=({is_train: np.cast['int32'](1)} if dropout else {}) ) #======================================== # FUNCTION TO PRINT PRETRAINED EMBEDDINGS # The function below takes one argument, which it prints # along with the specified print message. print_matrix = T.dmatrix() print_op = printing.Print('print message') printed_x = print_op(print_matrix) f_print = function([print_matrix], printed_x) #======================================== else: f_train = None f_print = None # We return a tuple of things used to print the embedding so that it looks nicer. print_tuple = [f_print, word_layer.embeddings] # Compile evaluation function if not crf: f_eval = theano.function( inputs=eval_inputs, outputs=tags_scores, givens=({is_train: np.cast['int32'](0)} if dropout else {}) ) else: f_eval = theano.function( inputs=eval_inputs, outputs=forward(observations, transitions, viterbi=True, return_alpha=False, return_best_sequence=True), givens=({is_train: np.cast['int32'](0)} if dropout else {}) ) return f_train, f_eval, print_tuple
def p(self, j, name):
    return printing.Print(name)(j)
def printme(self, name, mat):
    return printing.Print('vector')(mat)
def printdim(self, name, mat):
    return printing.Print(name, attrs=['shape'])(mat)
def print_me(self, name, mat):
    mat = printing.Print('vector')(mat)
    return mat
def print_dim(self, name, mat):
    mat = printing.Print(name, attrs=['shape'])(mat)
    return mat
def print_shape(A):
    print_op = printf.Print('vector', attrs=['shape'])
    printed = print_op(A)
    f = function([A], printed)
    return f(A)
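A usage sketch (assumed names) contrasting the two flavours used throughout these helpers: printing a tensor's values versus printing only its shape.

import numpy as np
import theano
import theano.tensor as T
from theano import printing

A = T.matrix('A')
A_vals = printing.Print('A values')(A)                    # prints the full array
A_shape = printing.Print('A shape', attrs=['shape'])(A)   # prints only the shape tuple
f = theano.function([A], [A_vals.sum(), A_shape.sum()])
f(np.ones((2, 3), dtype=theano.config.floatX))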
def __theano_build__(self): E_x, E_y, V, U, W, b, c = self.E_x, self.E_y, self.V, self.U, self.W, self.b, self.c x = T.ivector('x') y = T.ivector('y') x_label = T.ivector('x') y_label = T.ivector('y') def forward_prop_step(x_t, y_t, s_t_prev): # Word embedding layer x_e = E_x[:, x_t] y_e = E_y[:, y_t] def GRU(i, U, W, b, x_0, s_prev): pb = printing.Print('b') b1 = T.specify_shape((self.conversion_ones * b[i * 3, :]).T, T.shape(x_0)) b2 = T.specify_shape((self.conversion_ones * b[i * 3 + 1, :]).T, T.shape(x_0)) b3 = T.specify_shape((self.conversion_ones * b[i * 3 + 2, :]).T, T.shape(x_0)) z = T.nnet.hard_sigmoid(U[i * 3 + 0].dot(x_0) + W[i * 3 + 0].dot(s_prev) + b1) r = T.nnet.hard_sigmoid(U[i * 3 + 1].dot(x_0) + W[i * 3 + 1].dot(s_prev) + b2) c = T.tanh(U[i * 3 + 2].dot(x_0) + W[i * 3 + 2].dot(s_prev * r) + b3) return (T.ones_like(z) - z) * c + z * s_prev p_o = printing.Print('juju') # GRU Layer 1 s[0] = GRU(0, U, W, b, x_e, s_t_prev[0]) # GRU Layer 2 s[1] = GRU(1, U, W, b, y_e, s_t_prev[1]) c_matrix = (self.conversion_ones * c).T juju = V.dot(s) + c_matrix ot = printing.Print("o_t") o_t = T.nnet.softmax(juju.T).T return [o_t, s] [o, s], updates = theano.scan( forward_prop_step, sequences=x, truncate_gradient=self.bptt_truncate, outputs_info=[None, dict(initial=T.zeros(self.hidden_dim)), dict(initial=T.zeros(self.hidden_dim)), dict(initial=T.zeros(self.hidden_dim))]) prediction = T.argmax(o, axis=1) o_error = T.sum(T.nnet.categorical_crossentropy(o, y)) p_o = printing.Print('o_error') # Total cost (could add regularization here) cost = p_o(o_error) # Gradients dE_x = T.grad(cost, E_x) dE_y = T.grad(cost, E_y) dU = T.grad(cost, U) dW = T.grad(cost, W) db = T.grad(cost, b) dV = T.grad(cost, V) dc = T.grad(cost, c) # Assign functions self.predict = theano.function([x], [o], allow_input_downcast=True) self.predict_class = theano.function([x], prediction, allow_input_downcast=True) self.ce_error = theano.function([x, y], cost, allow_input_downcast=True) self.bptt = theano.function([x, y], [dE_x, dE_y, dU, dW, db, dV, dc], allow_input_downcast=True) # SGD parameters learning_rate = T.scalar('learning_rate') decay = T.scalar('decay') # rmsprop cache updates mE_x = decay * self.mE_x + (1 - decay) * dE_x ** 2 mE_y = decay * self.mE_y + (1 - decay) * dE_y ** 2 mU = decay * self.mU + (1 - decay) * dU ** 2 mW = decay * self.mW + (1 - decay) * dW ** 2 mV = decay * self.mV + (1 - decay) * dV ** 2 mb = decay * self.mb + (1 - decay) * db ** 2 mc = decay * self.mc + (1 - decay) * dc ** 2 self.sgd_step = theano.function( [x, y, learning_rate, theano.In(decay, value=0.9)], [], updates=[(E_x, E_x - learning_rate * dE_x / T.sqrt(dE_x + 1e-6)), (E_y, E_y - learning_rate * dE_y / T.sqrt(dE_y + 1e-6)), (U, U - learning_rate * dU / T.sqrt(mU + 1e-6)), (W, W - learning_rate * dW / T.sqrt(mW + 1e-6)), (V, V - learning_rate * dV / T.sqrt(mV + 1e-6)), (b, b - learning_rate * db / T.sqrt(mb + 1e-6)), (c, c - learning_rate * dc / T.sqrt(mc + 1e-6)), (self.mE_x, mE_x), (self.mE_y, mE_y), (self.mU, mU), (self.mW, mW), (self.mV, mV), (self.mb, mb), (self.mc, mc) ], allow_input_downcast=True)
def create_gradientfunctions(self, train_data, train_labels, val_data, val_labels): """This function takes as input the whole dataset and creates the entire model""" def encodingstep(x_t, h_t): z_t = T.nnet.sigmoid( T.dot(x_t, self.params['U_z']) + T.dot(h_t, self.params['W_z']).squeeze() + self.params['b_z'].squeeze()) r_t = T.nnet.sigmoid( T.dot(x_t, self.params['U_r']) + T.dot(h_t, self.params['W_r']).squeeze() + self.params['b_r'].squeeze()) h = T.tanh( T.dot(x_t, self.params['U_h']) + T.dot(h_t * r_t, self.params['W_h']) + self.params['b_h'].squeeze()) new_h_t = (1 - z_t) * h + z_t * h_t return new_h_t x = T.tensor3("x") h0_enc = T.matrix("h0_enc") result, _ = theano.scan(encodingstep, sequences=x, outputs_info=h0_enc) h_encoder = result[-1] #log sigma encoder is squared mu_encoder = T.dot( h_encoder, self.params["W_hmu"]) + self.params["b_hmu"].squeeze() log_sigma_encoder = T.dot( h_encoder, self.params["W_hsigma"]) + self.params["b_hsigma"].squeeze() #Use a very wide prior to make it possible to learn something with Z #logpz = 0.005 * T.sum(1 + log_sigma_encoder - mu_encoder**2 - T.exp(log_sigma_encoder), axis = 1) logpz = 0.5 * T.sum( 1 + log_sigma_encoder - mu_encoder**2 - T.exp(log_sigma_encoder), axis=1) if "gpu" in theano.config.device: srng = theano.sandbox.cuda.rng_curand.CURAND_RandomStreams() else: srng = T.shared_randomstreams.RandomStreams() #Reparametrize Z eps = srng.normal((x.shape[1], self.latent_variables), avg=0.0, std=1.0, dtype=theano.config.floatX) z = mu_encoder + T.exp(0.5 * log_sigma_encoder) * eps h0_dec = T.tanh( T.dot(z, self.params["W_zh"]) + self.params["b_zh"].squeeze()) def decodingstep(x_t, h_t): z_dec_t = T.nnet.sigmoid( T.dot(x_t, self.params['U_dec_z']) + T.dot(h_t, self.params['W_dec_z']) + self.params['b_dec_z'].squeeze()) r_dec_t = T.nnet.sigmoid( T.dot(x_t, self.params['U_dec_r']) + T.dot(h_t, self.params['W_dec_r']) + self.params['b_dec_r'].squeeze()) h = T.tanh( T.dot(x_t, self.params['U_dec_h']) + T.dot(h_t * r_dec_t, self.params['W_dec_h']) + self.params['b_dec_h'].squeeze()) new_h_t = (1 - z_dec_t) * h + z_dec_t * h_t new_x_t = T.tanh( h.dot(self.params["W_hx"]) + self.params["b_hx"].squeeze()) return new_x_t, new_h_t x0 = T.matrix("x0") [y, _], _ = theano.scan(decodingstep, n_steps=x.shape[0], outputs_info=[x0, h0_dec]) # Clip y to avoid NaNs, necessary when lowerbound goes to 0 # 128 x 8 x 35 y = T.clip(y, -1 + 1e-6, 1 - 1e-6) logpxz = -T.sum(T.pow(y - x, 2), axis=0) logpxz = T.mean(logpxz, axis=1) #Average over batch dimension logpx = T.mean(logpxz + logpz) #Driver output batch_start = T.iscalar('batch_start') batch_end = T.iscalar('batch_end') labels = T.ivector('labels') train_labels = theano.shared(train_labels.astype('int32')) val_labels = theano.shared(val_labels.astype('int32')) keep_prob = T.scalar(dtype=theano.config.floatX) mask = self.srng.binomial(p=keep_prob, size=(self.hidden_units_encoder, )).astype( theano.config.floatX) / keep_prob printer = printing.Print('') driver_output = T.nnet.softmax( T.dot(h_encoder * mask, self.params['W_driver']) + self.params['b_driver'].squeeze()) max_minus_min = (driver_output.max(axis=0) - driver_output.min(axis=0)).sum() var = (driver_output.var(axis=0)).sum() mean = (driver_output.mean(axis=0)).sum() cross_entropy = T.nnet.categorical_crossentropy(driver_output, labels) driver_loss = (-T.mean(cross_entropy)) l1_loss = (-T.sum([T.sum(abs(v)) for v in self.params.values()])) l2_loss = (-T.sum([T.sum(v**2) for v in self.params.values()])) #Compute all the gradients total_loss = ((1 - 
self.lamda1) * logpx + self.lamda1 * driver_loss + self.lamda_l2 * l2_loss + self.lamda_l1 * l1_loss) gradients = T.grad(total_loss, self.params.values(), disconnected_inputs='ignore') #Let Theano handle the updates on parameters for speed updates = OrderedDict() epoch = T.iscalar("epoch") gamma = (T.sqrt(1 - (1 - self.b2)**epoch) / (1 - (1 - self.b1)**epoch)).astype(theano.config.floatX) #Adam for parameter, gradient, m, v in zip(self.params.values(), gradients, self.m.values(), self.v.values()): new_m = self.b1 * gradient + (1 - self.b1) * m new_v = self.b2 * (gradient**2) + (1 - self.b2) * v updates[ parameter] = parameter + self.learning_rate * gamma * new_m / ( T.sqrt(new_v) + 1e-8) updates[m] = new_m updates[v] = new_v train_data = theano.shared(train_data.transpose(1, 0, 2)).astype( theano.config.floatX) givens = { h0_enc: T.zeros((batch_end - batch_start, self.hidden_units_encoder)).astype(theano.config.floatX), x0: T.zeros((batch_end - batch_start, self.features)).astype(theano.config.floatX), x: train_data[:, batch_start:batch_end, :], labels: train_labels[batch_start:batch_end], keep_prob: self.keep_prob } self.updatefunction = theano.function([epoch, batch_start, batch_end], [logpxz.mean(), driver_loss], updates=updates, givens=givens, allow_input_downcast=True) x_val = theano.shared(val_data.transpose(1, 0, 2)).astype( theano.config.floatX) givens[x] = x_val[:, batch_start:batch_end, :] givens[labels] = val_labels[batch_start:batch_end] givens[keep_prob] = np.array(1.0).astype(theano.config.floatX) self.likelihood = theano.function( [batch_start, batch_end], [logpxz.mean(), driver_loss, max_minus_min, var, mean], givens=givens) x_test = T.tensor3("x_test") test_givens = { x: x_test, h0_enc: T.zeros((x_test.shape[1], self.hidden_units_encoder)).astype(theano.config.floatX), } self.encoder = theano.function([x_test], h_encoder, givens=test_givens) h_e = T.matrix('h_e') self.driver_predict = theano.function([h_e], driver_output, givens={ h_encoder: h_e, keep_prob: np.array(1.0).astype( theano.config.floatX) }) return True
test_data = [[[1, 2], [2, 3], [3, 4]], [[2, 1], [2, 2], [3, 3]]]
shared_x = theano.shared(np.asarray(test_data, dtype=theano.config.floatX),
                         borrow=True)

inpt = T.matrix("inpt")
cost = inpt * inpt
i = T.lscalar("i")
fn = theano.function([i], cost, givens={inpt: shared_x[i]})
z = fn(0)
# print z

test_q = np.array([[1, 2], [2, 3]])

# the symbolic variable has to exist before it is wrapped by the Print op
x = T.lmatrix("x")
hello_world_op = printing.Print('hello world')
printed_x = hello_world_op(x)
f = theano.function([x], printed_x)

shared_xx = shared_x.reshape((6, 2))
shared_q = theano.shared(np.asarray(test_q, dtype=theano.config.floatX),
                         borrow=True)

print x.shape.eval({x: test_q})
print theano.function(inputs=[x], outputs=x.shape)(test_q)
print test_q.shape

fx = T.dmatrix("f")
print theano.function(inputs=[fx], outputs=fx.shape)(shared_xx.eval())
def p(j, name):
    return printing.Print(name)(j)
def __init__(self, data_dir, word2vec, word_vector_size, truncate_gradient, learning_rate, dim, cnn_dim, cnn_dim_fc, story_len, patches, mode, answer_module, memory_hops, batch_size, l2, normalize_attention, batch_norm, dropout, **kwargs): print "==> not used params in DMN class:", kwargs.keys() self.data_dir = data_dir self.learning_rate = learning_rate self.truncate_gradient = truncate_gradient self.word2vec = word2vec self.word_vector_size = word_vector_size self.dim = dim self.cnn_dim = cnn_dim self.cnn_dim_fc = cnn_dim_fc self.story_len = story_len self.mode = mode self.patches = patches self.answer_module = answer_module self.memory_hops = memory_hops self.batch_size = batch_size self.l2 = l2 self.normalize_attention = normalize_attention self.batch_norm = batch_norm self.dropout = dropout self.vocab, self.ivocab = self._load_vocab(self.data_dir) self.train_story = None self.test_story = None self.train_dict_story, self.train_lmdb_env_fc, self.train_lmdb_env_conv = self._process_input_sind( self.data_dir, 'train') self.test_dict_story, self.test_lmdb_env_fc, self.test_lmdb_env_conv = self._process_input_sind( self.data_dir, 'val') self.train_story = self.train_dict_story.keys() self.test_story = self.test_dict_story.keys() self.vocab_size = len(self.vocab) # Since this is pretty expensive, we will pass a story each time. # We assume that the input has been processed such that the sequences of patches # are snake like path. self.input_var = T.tensor4( 'input_var') # (batch_size, seq_len, patches, cnn_dim) self.q_var = T.matrix('q_var') # Now, it's a batch * image_sieze. self.answer_var = T.imatrix( 'answer_var') # answer of example in minibatch self.answer_mask = T.matrix('answer_mask') self.answer_inp_var = T.tensor3( 'answer_inp_var') # answer of example in minibatch print "==> building input module" self.W_inp_emb_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.cnn_dim)) #self.b_inp_emb_in = nn_utils.constant_param(value=0.0, shape=(self.dim,)) # First, we embed the visual features before sending it to the bi-GRUs. inp_rhp = T.reshape( self.input_var, (self.batch_size * self.story_len * self.patches, self.cnn_dim)) inp_rhp_dimshuffled = inp_rhp.dimshuffle(1, 0) inp_rhp_emb = T.dot(self.W_inp_emb_in, inp_rhp_dimshuffled) inp_rhp_emb_dimshuffled = inp_rhp_emb.dimshuffle(1, 0) inp_emb_raw = T.reshape( inp_rhp_emb_dimshuffled, (self.batch_size, self.story_len, self.patches, self.cnn_dim)) inp_emb = T.tanh( inp_emb_raw ) # Just follow the paper DMN for visual and textual QA. # Now, we use a bi-directional GRU to produce the input. # Forward GRU. self.inp_dim = self.dim / 2 # since we have forward and backward self.W_inpf_res_in = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.cnn_dim)) self.W_inpf_res_hid = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.inp_dim)) self.b_inpf_res = nn_utils.constant_param(value=0.0, shape=(self.inp_dim, )) self.W_inpf_upd_in = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.cnn_dim)) self.W_inpf_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.inp_dim)) self.b_inpf_upd = nn_utils.constant_param(value=0.0, shape=(self.inp_dim, )) self.W_inpf_hid_in = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.cnn_dim)) self.W_inpf_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.inp_dim)) self.b_inpf_hid = nn_utils.constant_param(value=0.0, shape=(self.inp_dim, )) # Backward GRU. 
self.W_inpb_res_in = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.cnn_dim)) self.W_inpb_res_hid = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.inp_dim)) self.b_inpb_res = nn_utils.constant_param(value=0.0, shape=(self.inp_dim, )) self.W_inpb_upd_in = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.cnn_dim)) self.W_inpb_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.inp_dim)) self.b_inpb_upd = nn_utils.constant_param(value=0.0, shape=(self.inp_dim, )) self.W_inpb_hid_in = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.cnn_dim)) self.W_inpb_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.inp_dim)) self.b_inpb_hid = nn_utils.constant_param(value=0.0, shape=(self.inp_dim, )) # Now, we use the GRU to build the inputs. # Two-level of nested scan is unnecessary. It will become too complicated. Just use this one. inp_dummy = theano.shared( np.zeros((self.inp_dim, self.story_len), dtype=floatX)) for i in range(self.batch_size): if i == 0: inp_1st_f, _ = theano.scan( fn=self.input_gru_step_forward, sequences=inp_emb[i, :].dimshuffle(1, 2, 0), outputs_info=T.zeros_like(inp_dummy), truncate_gradient=self.truncate_gradient) inp_1st_b, _ = theano.scan( fn=self.input_gru_step_backward, sequences=inp_emb[i, :, ::-1, :].dimshuffle(1, 2, 0), outputs_info=T.zeros_like(inp_dummy), truncate_gradient=self.truncate_gradient) # Now, combine them. inp_1st = T.concatenate([ inp_1st_f.dimshuffle(2, 0, 1), inp_1st_b.dimshuffle(2, 0, 1) ], axis=-1) self.inp_c = inp_1st.dimshuffle('x', 0, 1, 2) else: inp_f, _ = theano.scan( fn=self.input_gru_step_forward, sequences=inp_emb[i, :].dimshuffle(1, 2, 0), outputs_info=T.zeros_like(inp_dummy), truncate_gradient=self.truncate_gradient) inp_b, _ = theano.scan( fn=self.input_gru_step_backward, sequences=inp_emb[i, :, ::-1, :].dimshuffle(1, 2, 0), outputs_info=T.zeros_like(inp_dummy), truncate_gradient=self.truncate_gradient) # Now, combine them. inp_fb = T.concatenate( [inp_f.dimshuffle(2, 0, 1), inp_b.dimshuffle(2, 0, 1)], axis=-1) self.inp_c = T.concatenate( [self.inp_c, inp_fb.dimshuffle('x', 0, 1, 2)], axis=0) # Done, now self.inp_c should be batch_size x story_len x patches x cnn_dim # Eventually, we can flattern them. # Now, the input dimension is 1024 because we have forward and backward. inp_c_t = T.reshape( self.inp_c, (self.batch_size, self.story_len * self.patches, self.dim)) inp_c_t_dimshuffled = inp_c_t.dimshuffle(0, 'x', 1, 2) inp_batch = T.repeat(inp_c_t_dimshuffled, self.story_len, axis=1) # Now, its ready for all the 5 images in the same story. # 50 * 980 * 512 self.inp_batch = T.reshape(inp_batch, (inp_batch.shape[0] * inp_batch.shape[1], inp_batch.shape[2], inp_batch.shape[3])) self.inp_batch_dimshuffled = self.inp_batch.dimshuffle( 1, 2, 0) # 980 x 512 x 50 # It's very simple now, the input module just need to map from cnn_dim to dim. logging.info('self.cnn_dim = %d', self.cnn_dim) print "==> building question module" # First is for the global glimpse. 
q_var_3 = T.reshape(self.q_var, (self.batch_size, self.story_len, self.cnn_dim_fc)) q_var_shuffled = q_var_3.dimshuffle( 1, 2, 0) # now: story_len * image_size * batch_size # This is the RNN used to produce the Global Glimpse self.W_qf_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.cnn_dim_fc)) self.W_qf_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_qf_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_qf_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.cnn_dim_fc)) self.W_qf_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_qf_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_qf_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.cnn_dim_fc)) self.W_qf_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_qf_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) inp_dummy = theano.shared( np.zeros((self.dim, self.batch_size), dtype=floatX)) q_glb, _ = theano.scan(fn=self.q_gru_step_forward, sequences=q_var_shuffled, outputs_info=[T.zeros_like(inp_dummy)], truncate_gradient=self.truncate_gradient) q_glb_shuffled = q_glb.dimshuffle(2, 0, 1) # batch_size * seq_len * dim q_glb_last = q_glb_shuffled[:, -1, :] # batch_size * dim # Now, we also need to add the global glimpse, thus we need to use the rnn to build the attention glimpose. # Now, share the parameter with the input module. self.W_inp_emb_q = nn_utils.normal_param(std=0.1, shape=(self.dim, self.cnn_dim_fc)) self.b_inp_emb_q = nn_utils.normal_param(std=0.1, shape=(self.dim, )) q_var_shuffled = self.q_var.dimshuffle(1, 0) inp_q = T.dot( self.W_inp_emb_q, q_var_shuffled) + self.b_inp_emb_q.dimshuffle( 0, 'x') # 512 x 50 self.q_q = T.tanh( inp_q ) # Since this is used to initialize the memory, we need to make it tanh. 
print "==> creating parameters for memory module" self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) #self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 0)) self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim)) self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, )) print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops memory = [self.q_q.copy()] for iter in range(1, self.memory_hops + 1): #m = printing.Print('mem')(memory[iter-1]) current_episode = self.new_episode(memory[iter - 1]) #current_episode = self.new_episode(m) #current_episode = printing.Print('current_episode')(current_episode) memory.append( self.GRU_update(memory[iter - 1], current_episode, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid)) last_mem_raw = memory[-1].dimshuffle((1, 0)) net = layers.InputLayer(shape=(self.batch_size * self.story_len, self.dim), input_var=last_mem_raw) if self.batch_norm: net = layers.BatchNormLayer(incoming=net) if self.dropout > 0 and self.mode == 'train': net = layers.DropoutLayer(net, p=self.dropout) last_mem = layers.get_output(net).dimshuffle((1, 0)) logging.info('last_mem size') print last_mem.shape.eval({ self.input_var: np.random.rand(10, 5, 196, 512).astype('float32'), self.q_var: np.random.rand(50, 4096).astype('float32') }) print "==> building answer module" answer_inp_var_shuffled = self.answer_inp_var.dimshuffle(1, 2, 0) # Sounds good. Now, we need to map last_mem to a new space. self.W_mem_emb = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim * 3)) self.W_inp_emb = nn_utils.normal_param(std=0.1, shape=(self.dim, self.vocab_size + 1)) def _dot2(x, W): return T.dot(W, x) answer_inp_var_shuffled_emb, _ = theano.scan( fn=_dot2, sequences=answer_inp_var_shuffled, non_sequences=self.W_inp_emb, truncate_gradient=self.truncate_gradient) # seq x dim x batch # Now, we also need to embed the image and use it to do the memory. #q_q_shuffled = self.q_q.dimshuffle(1,0) # dim * batch. q_glb_dim = q_glb_last.dimshuffle(0, 'x', 1) # batch_size * 1 * dim q_glb_repmat = T.repeat(q_glb_dim, self.story_len, 1) # batch_size * len * dim q_glb_rhp = T.reshape(q_glb_repmat, (q_glb_repmat.shape[0] * q_glb_repmat.shape[1], q_glb_repmat.shape[2])) init_ans = T.concatenate( [self.q_q, last_mem, q_glb_rhp.dimshuffle(1, 0)], axis=0) mem_ans = T.dot(self.W_mem_emb, init_ans) # dim x batchsize. mem_ans = printing.Print('prob_sm')(mem_ans) mem_ans_dim = mem_ans.dimshuffle('x', 0, 1) answer_inp = T.concatenate([mem_ans_dim, answer_inp_var_shuffled_emb], axis=0) # Now, we have both embedding. We can let them go to the rnn. 
# We also need to map the input layer as well. dummy = theano.shared( np.zeros((self.dim, self.batch_size * self.story_len), dtype=floatX)) self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size + 1, self.dim)) self.W_ans_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) logging.info('answer_inp size') #print answer_inp.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'), # self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32'), # self.q_var: np.random.rand(10, 4096).astype('float32')}) #last_mem = printing.Print('prob_sm')(last_mem) results, _ = theano.scan(fn=self.answer_gru_step, sequences=answer_inp, outputs_info=[dummy], truncate_gradient=self.truncate_gradient) # Assume there is a start token #print results.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'), # self.q_var: np.random.rand(10, 4096).astype('float32'), # self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')}, on_unused_input='ignore') results = results[ 1: -1, :, :] # get rid of the last token as well as the first one (image) #print results.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'), # self.q_var: np.random.rand(10, 4096).astype('float32'), # self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')}, on_unused_input='ignore') # Now, we need to transform it to the probabilities. 
prob, _ = theano.scan(fn=lambda x, w: T.dot(w, x), sequences=results, non_sequences=self.W_a, truncate_gradient=self.truncate_gradient) prob_shuffled = prob.dimshuffle(2, 0, 1) # b * len * vocab logging.info("prob shape.") #print prob.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'), # self.q_var: np.random.rand(10, 4096).astype('float32'), # self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')}) n = prob_shuffled.shape[0] * prob_shuffled.shape[1] prob_rhp = T.reshape(prob_shuffled, (n, prob_shuffled.shape[2])) prob_sm = nn_utils.softmax_(prob_rhp) self.prediction = prob_sm mask = T.reshape(self.answer_mask, (n, )) lbl = T.reshape(self.answer_var, (n, )) self.params = [ self.W_inp_emb_in, #self.b_inp_emb_in, self.W_inpf_res_in, self.W_inpf_res_hid, self.b_inpf_res, self.W_inpf_upd_in, self.W_inpf_upd_hid, self.b_inpf_upd, self.W_inpf_hid_in, self.W_inpf_hid_hid, self.b_inpf_hid, self.W_inpb_res_in, self.W_inpb_res_hid, self.b_inpb_res, self.W_inpb_upd_in, self.W_inpb_upd_hid, self.b_inpb_upd, self.W_inpb_hid_in, self.W_inpb_hid_hid, self.b_inpb_hid, self.W_qf_res_in, self.W_qf_res_hid, self.b_qf_res, self.W_qf_upd_in, self.W_qf_upd_hid, self.b_qf_upd, self.W_qf_hid_in, self.W_qf_hid_hid, self.b_qf_hid, self.W_inp_emb_q, self.b_inp_emb_q, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, #self.W_b self.W_1, self.W_2, self.b_1, self.b_2, self.W_a, self.W_mem_emb, self.W_inp_emb, self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid, ] print "==> building loss layer and computing updates" loss_vec = T.nnet.categorical_crossentropy(prob_sm, lbl) self.loss_ce = (mask * loss_vec).sum() / mask.sum() #self.loss_ce = T.nnet.categorical_crossentropy(results_rhp, lbl) if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 updates = lasagne.updates.adadelta(self.loss, self.params, learning_rate=self.learning_rate) #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.001) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function( inputs=[ self.input_var, self.q_var, self.answer_var, self.answer_mask, self.answer_inp_var ], outputs=[self.prediction, self.loss], updates=updates) print "==> compiling test_fn" self.test_fn = theano.function(inputs=[ self.input_var, self.q_var, self.answer_var, self.answer_mask, self.answer_inp_var ], outputs=[self.prediction, self.loss])
def watch(x):
    def func(_, x):
        import ipdb
        ipdb.set_trace()
    return TP.Print(global_fn=func)(x)
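A usage sketch (assumption: `import theano.printing as TP` and ipdb are available): evaluating any function that involves the watched variable drops into an ipdb session with its numeric value in scope.

import theano
import theano.tensor as T

x = T.vector('x')
y = watch(x ** 2)                  # breakpoint fires whenever x ** 2 is computed
f = theano.function([x], y.sum())  # calling f(...) triggers the ipdb session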
def __theano_build__(self): E, V, U, W, b, c, v = self.E, self.V, self.U, self.W, self.b, self.c, self.v x = T.dvector('x') y = T.dvector('y') def forward_prop_step(x_t, s_t1_prev, s_t2_prev, s_t3_prev): p_o = printing.Print('juju') # Word embedding layer x_e = x_t.dot(E).T + v def GRU(i, U, W, b, x_0, s_prev): b1 = b[i * 3, :] b2 = b[i * 3 + 1, :] b3 = b[i * 3 + 2, :] z = T.nnet.hard_sigmoid(U[i * 3 + 0].dot(x_0) + W[i * 3 + 0].dot(s_prev) + b1) r = T.nnet.hard_sigmoid(U[i * 3 + 1].dot(x_0) + W[i * 3 + 1].dot(s_prev) + b2) c = T.tanh(U[i * 3 + 2].dot(x_0) + W[i * 3 + 2].dot(s_prev * r) + b3) return ((T.ones_like(z) - z) * c + z * s_prev).astype(theano.config.floatX) p_o = printing.Print('juju') s = [[], [], []] # GRU Layer 1 s[0] = GRU(0, U, W, b, x_e, s_t1_prev) # GRU Layer 2 s[1] = GRU(1, U, W, b, s[0], s_t2_prev) # GRU Layer 3 s[2] = GRU(2, U, W, b, s[1], s_t3_prev) # Final output calculation o_t = (V.dot(s[2]) + c)[0] return [o_t, s[0], s[1], s[2]] # p_o = printing.Print('prediction') [o, s, s2, s3], updates = theano.scan( forward_prop_step, sequences=x, truncate_gradient=self.bptt_truncate, outputs_info=[None, dict(initial=T.zeros(self.hidden_dim)), dict(initial=T.zeros(self.hidden_dim)), dict(initial=T.zeros(self.hidden_dim))]) p_o = printing.Print('o') # p_y = printing.Print('y') prediction = o e = prediction - y o_last = o[-1] o_error = T.sum(T.pow(prediction.T - y, 2)) / (2 * T.shape(y)[1]) # Total cost (could add regularization here) cost = o_error # Gradients dE = T.grad(cost, E) dU = T.grad(cost, U) dW = T.grad(cost, W) db = T.grad(cost, b) dV = T.grad(cost, V) dc = T.grad(cost, c) dv = T.grad(cost, v) # Assign functions self.predict = theano.function([x], [o]) self.predict_last = theano.function([x], [o_last]) self.predict_class = theano.function([x, y], [prediction, e], allow_input_downcast=True) self.error = theano.function([x, y], e) self.ce_error = theano.function([x, y], cost, allow_input_downcast=True) self.bptt = theano.function([x, y], [dE, dU, dW, db, dV, dc], allow_input_downcast=True) # SGD parameters learning_rate = T.scalar('learning_rate') decay = T.scalar('decay') # rmsprop cache updates mE = (decay * self.mE + (1 - decay) * dE ** 2).astype(theano.config.floatX) mU = (decay * self.mU + (1 - decay) * dU ** 2).astype(theano.config.floatX) mW = (decay * self.mW + (1 - decay) * dW ** 2).astype(theano.config.floatX) mV = (decay * self.mV + (1 - decay) * dV ** 2).astype(theano.config.floatX) mb = (decay * self.mb + (1 - decay) * db ** 2).astype(theano.config.floatX) mc = (decay * self.mc + (1 - decay) * dc ** 2).astype(theano.config.floatX) mv = (decay * self.mv + (1 - decay) * dv ** 2).astype(theano.config.floatX) self.sgd_step = theano.function( [x, y, learning_rate, theano.In(decay, value=0.9)], [], updates=[(E, E - (learning_rate * dE / T.sqrt(mE + 1e-6)).astype(theano.config.floatX)), (U, U - (learning_rate * dU / T.sqrt(mU + 1e-6)).astype(theano.config.floatX)), (W, W - (learning_rate * dW / T.sqrt(mW + 1e-6)).astype(theano.config.floatX)), (V, V - (learning_rate * dV / T.sqrt(mV + 1e-6)).astype(theano.config.floatX)), (b, b - (learning_rate * db / T.sqrt(mb + 1e-6)).astype(theano.config.floatX)), (c, c - (learning_rate * dc / T.sqrt(mc + 1e-6)).astype(theano.config.floatX)), (self.mE, mE), (self.mU, mU), (self.mW, mW), (self.mV, mV), (self.mb, mb), (self.mc, mc), (self.mv, mv) ], allow_input_downcast=True)
from keras import backend as K
from keras.models import Sequential
from keras.engine.topology import Layer
from keras.engine import InputSpec
from keras import initializations, activations
import theano
import theano.tensor as T
import theano.printing as printing
from keras.backend.common import _EPSILON
from theano.tensor.shared_randomstreams import RandomStreams
import numpy as np

epsilon = 0.1
p = printing.Print('x')

# USE BELOW FLAGS FOR DEBUGGING
# theano.config.optimizer = 'None'
# theano.config.exception_verbosity = 'high'
# theano.optimizer = 'fast_compile'


def get_vector(curr_word, new_sense, W_g, W_s):
    cond = T.eq(new_sense, -1)
    return T.switch(cond, W_g[curr_word], W_s[curr_word, new_sense])


# update the sense of a word in the context vector
def change_context_vec(vect, new_sense, prev_sense, curr_word, W_g, W_s):
    return vect - get_vector(curr_word, prev_sense, W_g, W_s) + get_vector(
        curr_word, new_sense, W_g, W_s)
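A minimal usage sketch for the helpers above, relying on the imports already made (the shapes are assumptions: `W_g` holds one global vector per word, `W_s` one vector per word and sense).

W_g = theano.shared(np.random.randn(10, 4).astype(theano.config.floatX))
W_s = theano.shared(np.random.randn(10, 3, 4).astype(theano.config.floatX))
ctx = T.zeros((4,), dtype=theano.config.floatX)

# sense -1 falls back to the global vector
vec = get_vector(2, -1, W_g, W_s)
# replace the contribution of word 2: sense -1 (global) -> sense 1
ctx = change_context_vec(ctx, 1, -1, 2, W_g, W_s)
print(ctx.eval().shape)  # (4,)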
def __theano_build__(self):
    E, V, U, W, b, c = self.E, self.V, self.U, self.W, self.b, self.c

    x = T.ivector('x')
    y = T.ivector('y')

    def forward_prop_step(x_t, s_t1_prev, s_t2_prev, s_t3_prev):
        # Word embedding layer
        x_e = E[:, x_t]

        # GRU Layer 1
        z_t1 = T.nnet.hard_sigmoid(U[0].dot(x_e) + W[0].dot(s_t1_prev) + b[0])
        r_t1 = T.nnet.hard_sigmoid(U[1].dot(x_e) + W[1].dot(s_t1_prev) + b[1])
        c_t1 = T.tanh(U[2].dot(x_e) + W[2].dot(s_t1_prev * r_t1) + b[2])
        s_t1 = (T.ones_like(z_t1) - z_t1) * c_t1 + z_t1 * s_t1_prev

        # GRU Layer 2
        z_t2 = T.nnet.hard_sigmoid(U[3].dot(s_t1) + W[3].dot(s_t2_prev) + b[3])
        r_t2 = T.nnet.hard_sigmoid(U[4].dot(s_t1) + W[4].dot(s_t2_prev) + b[4])
        c_t2 = T.tanh(U[5].dot(s_t1) + W[5].dot(s_t2_prev * r_t2) + b[5])
        s_t2 = (T.ones_like(z_t2) - z_t2) * c_t2 + z_t2 * s_t2_prev

        # GRU Layer 3
        z_t3 = T.nnet.hard_sigmoid(U[6].dot(s_t2) + W[6].dot(s_t3_prev) + b[6])
        r_t3 = T.nnet.hard_sigmoid(U[7].dot(s_t2) + W[7].dot(s_t3_prev) + b[7])
        c_t3 = T.tanh(U[8].dot(s_t2) + W[8].dot(s_t3_prev * r_t3) + b[8])
        s_t3 = (T.ones_like(z_t3) - z_t3) * c_t3 + z_t3 * s_t3_prev

        # Final output calculation
        # Theano's softmax returns a matrix with one row, we only need the row
        o_t = T.nnet.softmax(V.dot(s_t3) + c)[0]

        return [o_t, s_t1, s_t2, s_t3]

    [o, s, s2, s3], updates = theano.scan(
        forward_prop_step,
        sequences=x,
        truncate_gradient=self.bptt_truncate,
        outputs_info=[None,
                      dict(initial=T.zeros(self.hidden_dim)),
                      dict(initial=T.zeros(self.hidden_dim)),
                      dict(initial=T.zeros(self.hidden_dim))])

    prediction = T.argmax(o, axis=1)
    o_error = T.sum(T.nnet.categorical_crossentropy(o, y))
    p_o = printing.Print('o_error')

    # Total cost (could add regularization here)
    cost = p_o(o_error)

    # Gradients
    dE = T.grad(cost, E)
    dU = T.grad(cost, U)
    dW = T.grad(cost, W)
    db = T.grad(cost, b)
    dV = T.grad(cost, V)
    dc = T.grad(cost, c)

    # Assign functions
    self.predict = theano.function([x], [o], allow_input_downcast=True)
    self.predict_class = theano.function([x], prediction, allow_input_downcast=True)
    self.ce_error = theano.function([x, y], cost, allow_input_downcast=True)
    self.bptt = theano.function([x, y], [dE, dU, dW, db, dV, dc], allow_input_downcast=True)

    # SGD parameters
    learning_rate = T.scalar('learning_rate')
    decay = T.scalar('decay')

    # rmsprop cache updates
    mE = decay * self.mE + (1 - decay) * dE**2
    mU = decay * self.mU + (1 - decay) * dU**2
    mW = decay * self.mW + (1 - decay) * dW**2
    mV = decay * self.mV + (1 - decay) * dV**2
    mb = decay * self.mb + (1 - decay) * db**2
    mc = decay * self.mc + (1 - decay) * dc**2

    self.sgd_step = theano.function(
        [x, y, learning_rate, theano.In(decay, value=0.9)],
        [],
        updates=[(E, E - learning_rate * dE / T.sqrt(mE + 1e-6)),
                 (U, U - learning_rate * dU / T.sqrt(mU + 1e-6)),
                 (W, W - learning_rate * dW / T.sqrt(mW + 1e-6)),
                 (V, V - learning_rate * dV / T.sqrt(mV + 1e-6)),
                 (b, b - learning_rate * db / T.sqrt(mb + 1e-6)),
                 (c, c - learning_rate * dc / T.sqrt(mc + 1e-6)),
                 (self.mE, mE), (self.mU, mU), (self.mW, mW),
                 (self.mV, mV), (self.mb, mb), (self.mc, mc)],
        allow_input_downcast=True)
def _sinkhorn_log(Mu, Nu, C, options): """ Theano symbolic version. Note that the gradient wrt Mu, Nu and C is computed as if the transport plan "gamma(Mu,Nu,C)" was piecewise constant. """ # First, load the parameters. epsilon = options.epsilon # regularization parameter niter = options.niter # max niter in the sinkhorn loop tau = options.tau # use for acceleration rho = options.rho # parameter for unbalanced transport use_dual_cost = options.dual_cost # If False, use the primal cost discard_entropy = options.discard_entropy # If True + primal cost, remove the -eps*H(gamma) discard_KL = options.discard_KL # If True + primal cost, remove the rho*KL(...) grad_override_hack = options.grad_hack display_error = options.display_error # Update exponent : if rho == inf: # balanced transport : no mass creation is allowed. lam = 1. else: # lam = 1 / (1 + epsilon/rho) lam = rho / (rho + epsilon) # First, define the transport plan theano "Op" --------------------------------- # it takes as input three Theano variables : if grad_override_hack: mu = T.vector('mu') nu = T.vector('nu') c = T.matrix('c') else: mu = Mu nu = Nu c = C # Elementary operations .................................................. def ave(u, u1, it): """ Barycenter subroutine, used by kinetic acceleration through extrapolation. tau = 0 -> returns u1. tau < 0 -> returns an extrapolation coming from u. Note that doing it on the "exponentiated" variables would not make any sense. """ t = tau #t = (1. - 1./((it+2.)**2)) * tau return t * u + (1 - t) * u1 def M(u, v): """ M_ij = (-c_ij + u_i + v_j) / epsilon """ u_col = u.dimshuffle( 0, 'x' ) # theano syntax to make a vector broadcastable in the 2nd dimension v_row = v.dimshuffle( 'x', 0 ) # theano syntax to make a vector broadcastable in the 1st dimension return (-c + u_col + v_row) / epsilon lse = lambda A: T.log(T.sum(T.exp(A), axis=1) + 1e-6 ) # slight modif to prevent NaN # Actual Sinkhorn loop .................................................. # Iteration step : def sinkhorn_step(nit, u, v, foo): u1 = u # useful to check the update u = ave(u, lam * (epsilon * (T.log(mu) - lse(M(u, v))) + u), nit[0]) v = ave(v, lam * (epsilon * (T.log(nu) - lse(M(u, v).T)) + v), nit[0]) if rho == inf: err = T.sum(abs(T.sum(T.exp(M(u, v)), 1) - mu)) else: err = T.sum(abs(u - u1)) return (u, v, err), theano.scan_module.until( err < 1e-4) # "break" the scan loop if error < tol # Scan = "For loop" : iternumbers = np.arange(niter, dtype=config.floatX) iternumbers = stack((iternumbers, iternumbers), 1) """ result, updates = theano.scan_checkpoints(fn = sinkhorn_step, # Iterated routine sequences = [iternumbers], outputs_info = [(0. * mu), (0. * nu)], # Starting estimates for [u,v] save_every_N = niter, padding=False ) # Efficient memory management, at an additional computational cost """ err0 = np.arange(1, dtype=config.floatX)[0] result, updates = theano.scan( fn=sinkhorn_step, # Iterated routine sequences=[iternumbers], outputs_info=[(0. * mu), (0. * nu), err0] # Starting estimates for [u,v] #n_steps = niter # Number of iterations ) u, v = result[0][-1], result[1][ -1] # We only keep the final dual variables gamma = T.exp(M(u, v)) # Eventual transport plan g = diag(a)*K*diag(b) # Gradient override ..................................................... if grad_override_hack: # We give U,V,Gamma, albeit with a "hacked" explicit (i.e. autodiff-free) derivative # HERE, WE USE A DEV VERSION which allows : # - grad overrides # - inlining (for GPU integration) # See pull request 5255 on Theano's Github. 
if use_dual_cost: hack_derivative = lambda x, g: [0 * x[0], 0 * x[1], 0 * x[2]] _transport_plan = OpFromGraph([mu, nu, c], [u, v, gamma], inline=True, grad_overrides=hack_derivative) U, V, Gamma = _transport_plan(Mu, Nu, C) else: null_derivative = lambda x, g: [0 * x[0], 0 * x[1], 0 * x[2]] _transport_plan = OpFromGraph([mu, nu, c], [gamma], inline=True, grad_overrides=null_derivative) Gamma = _transport_plan(Mu, Nu, C) else: U, V, Gamma = u, v, gamma # Final cost computation ................................................. if use_dual_cost: """ print_U = printing.Print('U : ', attrs = [ 'shape' ]) ; U = print_U(U) print_Mu = printing.Print('Mu : ', attrs = [ 'shape' ]) ; Mu = print_Mu(Mu) print_V = printing.Print('V : ', attrs = [ 'shape' ]) ; V = print_V(V) print_Nu = printing.Print('Nu : ', attrs = [ 'shape' ]) ; Nu = print_Nu(Nu) print_G = printing.Print('G : ', attrs = [ 'shape' ]) ; Gamma = print_G(Gamma) """ if grad_override_hack: # allow the first term to have a derivative wrt x plan = T.matrix('plan') cost_matrix = T.matrix('cost_matrix') virtual_cost = T.sum(plan * cost_matrix) #hack_derivative = lambda x,g : [ 0 * x[0], T.grad(virtual_cost, _firstterm = OpFromGraph([plan, cost_matrix], [-epsilon * T.sum(plan)], inline=True, grad_overrides=hack_derivative) cost = _firstterm(Gamma, C) else: cost = -epsilon * T.sum(Gamma) if rho == inf: cost += T.sum(Mu * U) + T.sum(Nu * V) else: cost += - rho * (T.sum( Mu * (T.exp( -U / rho ) - 1) ) \ + T.sum( Nu * (T.exp( -V / rho ) - 1) ) ) else: xlogx = lambda x: x * T.log(x + 1e-6) xlogy0 = lambda x, y: x * T.log(y + 1e-6) H = lambda g: -T.sum(xlogx(g) - g) # Primal : if discard_entropy: cost = T.sum(Gamma * C) else: cost = T.sum(Gamma * C) - epsilon * H(Gamma) KL = lambda h, p: T.sum(xlogy0(h, h / p) - h + p) if rho != inf and not discard_KL: # We add the KL divergences KL_1 = KL(T.sum(Gamma, 1), Mu) KL_2 = KL(T.sum(Gamma, 0), Nu) cost += rho * (KL_1 + KL_2) if display_error: print_err_shape = printing.Print('error : ', attrs=['shape']) errors = print_err_shape(result[2]) print_err = printing.Print('error : ') err_fin = print_err(errors[-1]) cost += .00000001 * err_fin # shameful hack to prevent the pruning of the error-printing node... return [cost, Gamma]