Example #1
        def forward_prop_step(x_t, y_t, s_t_prev):
            # Word embedding layer
            x_e = E_x[:, x_t]
            y_e = E_y[:, y_t]

            def GRU(i, U, W, b, x_0, s_prev):
                pb = printing.Print('b')

                b1 = T.specify_shape((self.conversion_ones * b[i * 3, :]).T, T.shape(x_0))
                b2 = T.specify_shape((self.conversion_ones * b[i * 3 + 1, :]).T, T.shape(x_0))
                b3 = T.specify_shape((self.conversion_ones * b[i * 3 + 2, :]).T, T.shape(x_0))

                z = T.nnet.hard_sigmoid(U[i * 3 + 0].dot(x_0) + W[i * 3 + 0].dot(s_prev) + b1)
                r = T.nnet.hard_sigmoid(U[i * 3 + 1].dot(x_0) + W[i * 3 + 1].dot(s_prev) + b2)
                c = T.tanh(U[i * 3 + 2].dot(x_0) + W[i * 3 + 2].dot(s_prev * r) + b3)

                return (T.ones_like(z) - z) * c + z * s_prev

            p_o = printing.Print('juju')

            s = [[], []]
            # GRU Layer 1
            s[0] = GRU(0, U, W, b, x_e, s_t_prev[0])

            # GRU Layer 2
            s[1] = GRU(1, U, W, b, y_e, s_t_prev[1])

            c_matrix = (self.conversion_ones * c).T
            juju = V.dot(s) + c_matrix
            ot = printing.Print("o_t")
            o_t = T.nnet.softmax(juju.T).T

            return [o_t, s]
Example #2
        def forward_prop_step(x_t, s_t1_prev, s_t2_prev, s_t3_prev):
            p_o = printing.Print('juju')
            # Word embedding layer
            x_e = x_t.dot(E).T + v

            def GRU(i, U, W, b, x_0, s_prev):
                b1 = b[i * 3, :]
                b2 = b[i * 3 + 1, :]
                b3 = b[i * 3 + 2, :]

                z = T.nnet.hard_sigmoid(U[i * 3 + 0].dot(x_0) + W[i * 3 + 0].dot(s_prev) + b1)
                r = T.nnet.hard_sigmoid(U[i * 3 + 1].dot(x_0) + W[i * 3 + 1].dot(s_prev) + b2)
                c = T.tanh(U[i * 3 + 2].dot(x_0) + W[i * 3 + 2].dot(s_prev * r) + b3)

                return ((T.ones_like(z) - z) * c + z * s_prev).astype(theano.config.floatX)

            p_o = printing.Print('juju')
            s = [[], [], []]
            # GRU Layer 1
            s[0] = GRU(0, U, W, b, x_e, s_t1_prev)

            # GRU Layer 2
            s[1] = GRU(1, U, W, b, s[0], s_t2_prev)

            # GRU Layer 3
            s[2] = GRU(2, U, W, b, s[1], s_t3_prev)

            # Final output calculation

            o_t = (V.dot(s[2]) + c)[0]

            return [o_t, s[0], s[1], s[2]]
Example #3
def _ot_matching(q1_x, q1_mu, xt_x, xt_mu, radius) :
	"""
	Given two measures q1 and xt represented by locations/weights arrays, 
	outputs an optimal transport fidelity term and the transport plan.
	"""
	# The Sinkhorn algorithm takes as input three Theano variables :
	c = _squared_distances(q1_x, xt_x) # Wasserstein cost function
	mu = q1_mu ; nu = xt_mu
	
	# Parameters of the Sinkhorn algorithm.
	epsilon            = (.02)**2          # regularization parameter
	rho                = (.5) **2          # unbalanced transport (See PhD Th. of Lenaic Chizat)
	niter              = 10000             # max niter in the sinkhorn loop
	tau                = -.8               # nesterov-like acceleration
	
	lam = rho / (rho + epsilon)            # Update exponent
	
	# Elementary operations .....................................................................
	def ave(u,u1) : 
		"Barycenter subroutine, used by kinetic acceleration through extrapolation."
		return tau * u + (1-tau) * u1 
	def M(u,v)  : 
		"$M_{ij} = (-c_{ij} + u_i + v_j) / \epsilon$"
		return (-c + u.dimshuffle(0,'x') + v.dimshuffle('x',0)) / epsilon
	lse = lambda A    : T.log(T.sum( T.exp(A), axis=1 ) + 1e-6) # slight modif to prevent NaN
	
	# Actual Sinkhorn loop ......................................................................
	# Iteration step :
	def sinkhorn_step(u, v, foo) :
		u1=u # useful to check the update
		u = ave( u, lam * ( epsilon * ( T.log(mu) - lse(M(u,v))   ) + u ) )
		v = ave( v, lam * ( epsilon * ( T.log(nu) - lse(M(u,v).T) ) + v ) )
		err = T.sum(abs(u - u1))
		
		return (u,v,err), theano.scan_module.until(err < 1e-4) # "break" the loop if error < tol
		
	# Scan = "For loop" :
	err0 = np.arange(1, dtype=config.floatX)[0]
	result, updates = theano.scan( fn            = sinkhorn_step,            # Iterated routine
								   outputs_info  = [(0.*mu), (0.*nu), err0], # Starting estimates
								   n_steps       = niter                   # Number of iterations
								 )    
	U, V = result[0][-1], result[1][-1] # We only keep the final dual variables
	Gamma = T.exp( M(U,V) )             # Eventual transport plan g = diag(a)*K*diag(b)
	cost  = T.sum( Gamma * c )         # Simplistic cost, chosen for readability in this tutorial
	if False :
		print_err_shape = printing.Print('error  : ', attrs=['shape'])
		errors          = print_err_shape(result[2])
		print_err  = printing.Print('error  : ') ; err_fin  = print_err(errors[-1])
		cost += .00000001 * err_fin   # hack to prevent the pruning of the error-printing node...
	return [cost, Gamma]
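A minimal sketch of how the graph returned by _ot_matching could be compiled and evaluated. The symbolic input declarations below are illustrative assumptions, not part of the original module:

import theano
import theano.tensor as T

# Assumed symbolic inputs: point clouds (one point per row) and their weight vectors.
q1_x, xt_x = T.matrix('q1_x'), T.matrix('xt_x')
q1_mu, xt_mu = T.vector('q1_mu'), T.vector('xt_mu')

cost, Gamma = _ot_matching(q1_x, q1_mu, xt_x, xt_mu, radius=None)
sinkhorn = theano.function([q1_x, q1_mu, xt_x, xt_mu], [cost, Gamma])
# cost_val, plan = sinkhorn(x_np, mu_np, y_np, nu_np)  # NumPy arrays of dtype config.floatX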
Example #4
    def localConv(doc, dsn, swnv, dww, sww):
        #         t = T.arange(docSentenceSize)
        #         ccc = docs[t.nonzero()]

        t = T.arange(dsn).nonzero()

        t = (T.arange(10000) < dsn).nonzero()
        #         print t
        #         t=T.arange(dsn)
        docSub = doc[t]
        p = printing.Print('docSub')
        docSub = p(docSub)
        swnvSub = swnv[t]

        def sentenceConv(sen, wn, sww):
            t = (T.arange(10000) < wn).nonzero()
            senSub = sen[t]
            convRes = theano.tensor.signal.conv.conv2d(senSub, sww)
            sentence_pool = theano.tensor.signal.downsample.max_pool_2d(
                convRes, (100000, 1)).flatten(1)
            return sentence_pool

        sentenceLayer, _ = theano.scan(
            fn=lambda sen, wn, sww: sentenceConv(sen, wn, sww),
            non_sequences=[sww],
            sequences=[docSub, swnvSub])

        convRes = theano.tensor.signal.conv.conv2d(sentenceLayer, dww)

        sentence_pool = theano.tensor.signal.downsample.max_pool_2d(
            convRes, (100000, 1)).flatten(1)
        return sentence_pool
Example #5
    def get_cost_updates(self, corruption_level, learning_rate):
        """ This function computes the cost and the updates for one trainng
        step of the dA """

        tilde_x = self.get_corrupted_input(self.x, corruption_level)
        y = self.get_hidden_values2(tilde_x)
        z = self.get_reconstructed_input(y)
        # note : we sum over the size of a datapoint; if we are using
        #        minibatches, L will be a vector, with one entry per
        #        example in minibatch
        diff = (self.x - z) * (self.x - z)
        #print diff.eval()
        L = T.sum(diff, axis=1)
        L = printing.Print('L')(L)
        # L = - T.sum(self.x * T.log(z) + (1 - self.x) * T.log(1 - z), axis=1)
        # note : L is now a vector, where each element is the
        #        cross-entropy cost of the reconstruction of the
        #        corresponding example of the minibatch. We need to
        #        compute the average of all these to get the cost of
        #        the minibatch
        cost = T.mean(L)

        # compute the gradients of the cost of the `dA` with respect
        # to its parameters
        gparams = T.grad(cost, self.params)
        # generate the list of updates
        updates = [(param, param - learning_rate * gparam)
                   for param, gparam in zip(self.params, gparams)]

        return (cost, updates)
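For context, a sketch of how the returned (cost, updates) pair is typically compiled into a training step, following the standard Theano dA tutorial pattern; da, train_set_x and batch_size are assumed names that do not appear in the snippet above:

import theano
import theano.tensor as T

index = T.lscalar('index')    # minibatch index
cost, updates = da.get_cost_updates(corruption_level=0.3, learning_rate=0.1)
train_da = theano.function(
    [index], cost, updates=updates,
    givens={da.x: train_set_x[index * batch_size:(index + 1) * batch_size]})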
Example #6
        def forward_prop_step(x_t, s_t1_prev, s_t2_prev, s_t3_prev):
            p_o = printing.Print('juju')
            # Word embedding layer
            x_t = T.reshape(x_t, (T.shape(x_t)[0], 1))
            x_e = x_t.dot(E).T + v
            # conversion matrix hoisted out of GRU so it is also in scope for c_matrix below
            coversion_ones = T.ones((1, self.mini_batch_dim))

            def GRU(i, U, W, b, x_0, s_prev):

                b1 = T.reshape(b[i * 3, :],
                               (T.shape(b)[1], 1)).dot(coversion_ones)
                b2 = T.reshape(b[i * 3 + 1, :],
                               (T.shape(b)[1], 1)).dot(coversion_ones)
                b3 = T.reshape(b[i * 3 + 2, :],
                               (T.shape(b)[1], 1)).dot(coversion_ones)

                z = T.nnet.hard_sigmoid(U[i * 3 + 0].dot(x_0) +
                                        W[i * 3 + 0].dot(s_prev) + b1)
                r = T.nnet.hard_sigmoid(U[i * 3 + 1].dot(x_0) +
                                        W[i * 3 + 1].dot(s_prev) + b2)
                c = T.tanh(U[i * 3 + 2].dot(x_0) +
                           W[i * 3 + 2].dot(s_prev * r) + b3)

                return ((T.ones_like(z) - z) * c + z * s_prev).astype(
                    theano.config.floatX)

            p_o = printing.Print('juju')
            s = [[], [], []]
            # GRU Layer 1
            s[0] = GRU(0, U, W, b, x_e, s_t1_prev)

            # GRU Layer 2
            s[1] = GRU(1, U, W, b, s[0], s_t2_prev)

            # GRU Layer 3
            s[2] = GRU(2, U, W, b, s[1], s_t3_prev)

            # Final output calculation

            c_matrix = (coversion_ones.dot(c)).T
            o_t = ((V).dot(s[2]) + c_matrix)[0]

            return [o_t, s[0], s[1], s[2]]
Example #7
            def GRU(i, U, W, b, x_0, s_prev):
                pb = printing.Print('b')

                b1 = T.specify_shape((self.conversion_ones * b[i * 3, :]).T, T.shape(x_0))
                b2 = T.specify_shape((self.conversion_ones * b[i * 3 + 1, :]).T, T.shape(x_0))
                b3 = T.specify_shape((self.conversion_ones * b[i * 3 + 2, :]).T, T.shape(x_0))

                z = T.nnet.hard_sigmoid(U[i * 3 + 0].dot(x_0) + W[i * 3 + 0].dot(s_prev) + b1)
                r = T.nnet.hard_sigmoid(U[i * 3 + 1].dot(x_0) + W[i * 3 + 1].dot(s_prev) + b2)
                c = T.tanh(U[i * 3 + 2].dot(x_0) + W[i * 3 + 2].dot(s_prev * r) + b3)

                return (T.ones_like(z) - z) * c + z * s_prev
Example #8
def masked_categorical_crossentropy(output, target, mask, from_logits=False):
    if from_logits:
        output = T.nnet.softmax(output)
    else:
        # scale preds so that the class probas of each sample sum to 1
        output /= output.sum(axis=-1, keepdims=True)
    # avoid numerical instability with _EPSILON clipping
    output = T.clip(output, _EPSILON, 1.0 - _EPSILON)

    objective = -T.sum(target * T.log(output), axis=output.ndim - 1)

    objective = T.set_subtensor(
        objective[T.or_(T.eq(target[:, :, mask], 1), T.eq(target[:, :, 0],
                                                          1)).nonzero()], 0.0)

    return printing.Print('Objective', global_fn=_debug_fn)(objective)
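_debug_fn is referenced above but not defined in the snippet. Theano calls global_fn with the Print op and the runtime value, so a hypothetical callback with a compatible signature could look like this:

def _debug_fn(op, value):
    # op.message is the string passed to printing.Print ('Objective' above);
    # value is the evaluated NumPy array flowing through the node.
    print('%s: shape=%s mean=%s' % (op.message, value.shape, value.mean()))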
Example #9
def put_hook(variable, hook_fn):
    """Put a hook on a Theano variables.

    Ensures that the hook function is executed every time when the value
    of the Theano variable is available.

    Parameters
    ----------
    variable : :class:`~tensor.TensorVariable`
        The variable to put a hook on.
    hook_fn : function
        The hook function. Should take a single argument: the variable's
        value.

    """
    return printing.Print(global_fn=lambda _, x: hook_fn(x))(variable)
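An illustrative use of put_hook (the names below are assumptions): the hook fires as a side effect every time the wrapped variable's value is computed, while the graph output itself is unchanged.

import theano
import theano.tensor as T

def show_sum(value):
    print('sum = %s' % value.sum())

v = T.vector('v')
v_hooked = put_hook(v, show_sum)
f = theano.function([v], v_hooked * 2)
f([1.0, 2.0, 3.0])   # prints "sum = 6.0" and returns [2. 4. 6.]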
Example #10
def print_tensor(message, variable):
    """A small helper function that makes printing Theano variables a little bit
    easier.

    :type message: str
    :param message: message, typically the variable name

    :type variable: TensorVariable
    :param variable: any tensor variable to be printed

    :rtype: TensorVariable
    :returns: a tensor variable to be used further down the graph in place of
              ``variable``
    """

    print_op = printing.Print(message)
    return print_op(variable)
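A short usage sketch (assumed names): wrap an intermediate variable and keep building the graph from the returned variable so that the print node actually stays in the compiled function.

import theano
import theano.tensor as T

x = T.matrix('x')
x_printed = print_tensor('x values', x)   # value is printed each time it is computed
probs = T.nnet.softmax(x_printed)
f = theano.function([x], probs)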
Example #11
    def get_train_fn(self):
        X = T.dmatrix("X")
        Y = self.get_output_values(self.get_hidden_values(X))
        self.f1 = theano.function([X], Y)

        advantage = T.dmatrix("advantage")
        loss = -T.sum(advantage * Y)
        #updates = self.get_cost_updates(X, loss)

        grad_params = T.grad(loss, self.params)
        param_printing_op = printing.Print("Param")
        param_printing = param_printing_op(grad_params[0])
        updates = [(param, param - learning_rate * grad_param)
                   for param, grad_param in zip(self.params, grad_params)]
        #updates = self.RMSprop(loss, self.params)

        self.f2 = theano.function([X, advantage], [param_printing],
                                  updates=updates)
Example #12
        def forward_prop_step(x_t, s_t1_prev, s_t2_prev, s_t3_prev):
            # Word embedding layer
            x_e = E[:, x_t]

            # def GRU(i, U, W, b, x_0, s_prev):
            #     z = T.nnet.hard_sigmoid(x_0.dot(U[i * 3 + 0].T) + s_prev.dot(W[i * 3 + 0].T) + b[i * 3 + 0])
            #     r = T.nnet.hard_sigmoid(x_0.dot(U[i * 3 + 1].T) + s_prev.dot(W[i * 3 + 1].T) + b[i * 3 + 1])
            #     c = T.tanh(x_0.dot(U[i * 3 + 2].T) + (s_prev * r).dot(W[i * 3 + 2].T) + b[i * 3 + 2])
            #     return (T.ones_like(z) - z) * c + z * s_prev

            def GRU(i, U, W, b, x_0, s_prev):
                b1 = T.specify_shape((coversion_ones*b[i * 3,:]).T, T.shape(x_0))
                b2 = T.specify_shape((coversion_ones*b[i * 3 + 1 ,:]).T, T.shape(x_0))
                b3 = T.specify_shape((coversion_ones*b[i * 3 + 2,:]).T, T.shape(x_0))

                z = T.nnet.hard_sigmoid(U[i * 3 + 0].dot(x_0) + W[i * 3 + 0].dot(s_prev) + b1)
                r = T.nnet.hard_sigmoid(U[i * 3 + 1].dot(x_0) + W[i * 3 + 1].dot(s_prev) + b2)
                c = T.tanh(U[i * 3 + 2].dot(x_0) + W[i * 3 + 2].dot(s_prev * r) + b3)

                return (T.ones_like(z) - z) * c + z * s_prev

            p_o = printing.Print('juju')
            s = [s_t1_prev, s_t2_prev, s_t3_prev]
            # GRU Layer 1
            s[0] = GRU(0, U, W, b, x_e, s_t1_prev)

            # GRU Layer 2
            s[1] = GRU(1, U, W, b, s[0], s_t2_prev)

            # GRU Layer 3
            s[2] = GRU(2, U, W, b, s[1], s_t3_prev)

            # Final output calculation
            # Theano's softmax returns a matrix with one row, we only need the row

            c_matrix = (coversion_ones * c).T
            juju = V.dot(s[2]) + c_matrix

            o_t = T.nnet.softmax(juju.T).T

            return [o_t, s[0], s[1], s[2]]
Example #13
 def model_layers(self, x, *args):
     """
     Defines the model for all layers
     :param x: input
     :param args: list of inputs (from the previous time step) for the LSTM layers
     """
     args = list(args)
     num_arg = 0
     outputs = []
     num_passage = args[-1]
     for k in range(len(self.layers)):
         debug_printing = pr.Print('Progress', global_fn=self.print_callback)(num_passage+1)
         layer = self.layers[k]
         if layer['type'] == 'simple':  # if it is a simple layer, use the simple model
             x = self.model_simple_layer(x, k)
         elif layer['type'] == 'lstm':  # otherwise use the LSTM one
             h, c = self.model_lstm_layer(x, args[num_arg], args[num_arg+1], k)
             x = h  # the output is h
             num_arg += 2
             outputs.append(h)  # new h
             outputs.append(c)  # new c
     outputs = [x] + outputs  # the outputs are the final output x plus the intermediate values to feed back to the network at the next time step
     outputs.append(debug_printing)
     return tuple(outputs)  # x (output), vals_t, ...
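self.print_callback is not shown in the snippet; since printing.Print invokes global_fn with the op and the runtime value, a hypothetical method with a compatible signature would be:

 def print_callback(self, op, value):
     # 'value' is num_passage + 1 evaluated at run time; op.message is 'Progress'.
     print('%s: pass %d' % (op.message, int(value)))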
Example #14
    def build(self,
              dropout,
              char_dim,
              char_lstm_dim,
              char_bidirect,
              word_dim,
              word_lstm_dim,
              word_bidirect,
              lr_method,
              pre_emb,
              crf,
              cap_dim,
              training=True,
              **kwargs
              ):
        """
        Build the network.
        """
        # Training parameters
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_tag)

        # Number of capitalization features
        if cap_dim:
            n_cap = 4

        # Network variables
        is_train = T.iscalar('is_train')
        word_ids = T.ivector(name='word_ids')
        char_for_ids = T.imatrix(name='char_for_ids')
        char_rev_ids = T.imatrix(name='char_rev_ids')
        char_pos_ids = T.ivector(name='char_pos_ids')
        tag_ids = T.ivector(name='tag_ids')
        if cap_dim:
            cap_ids = T.ivector(name='cap_ids')

        # Sentence length
        s_len = (word_ids if word_dim else char_pos_ids).shape[0]

        # Final input (all word features)
        input_dim = 0
        inputs = []

        

        #
        # Word inputs
        #
        if word_dim:
            input_dim += word_dim
            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
            word_input = word_layer.link(word_ids)
            inputs.append(word_input)

            # Initialize with pretrained embeddings
            if pre_emb and training:
                
                # Randomly generates new weights
                new_weights = word_layer.embeddings.get_value()
                print 'Loading pretrained embeddings from %s...' % pre_emb
                
                # Here is where we will substitute pyemblib read function.
                # Syntax: get_embedding_dict(emb_path, emb_format, first_n, vocab)
                emb_format = pyemblib2.Format.Word2Vec
                pretrained = get_embedding_dict(pre_emb, emb_format, 0, None)
                ''' 
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')):
                    line = line.rstrip().split()
                    if len(line) == word_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]
                        ).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print 'WARNING: %i invalid lines' % emb_invalid
                '''
                
                c_found = 0
                c_lower = 0
                c_zeros = 0

                # Lookup table initialization
                for i in xrange(n_words):
                    word = self.id_to_word[i]
                    if word in pretrained:
                        new_weights[i] = pretrained[word]
                        c_found += 1
                    elif word.lower() in pretrained:
                        new_weights[i] = pretrained[word.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', word.lower()) in pretrained:
                        new_weights[i] = pretrained[
                            re.sub('\d', '0', word.lower())
                        ]
                        c_zeros += 1
                
                # This is it, this is what needs to be printed.
                # "word_layer.embeddings" is a "theano.shared" object 
                word_layer.embeddings.set_value(new_weights)
                print 'Loaded %i pretrained embeddings.' % len(pretrained)
                print ('%i / %i (%.4f%%) words have been initialized with '
                       'pretrained embeddings.') % (
                            c_found + c_lower + c_zeros, n_words,
                            100. * (c_found + c_lower + c_zeros) / n_words
                      )
                print ('%i found directly, %i after lowercasing, '
                       '%i after lowercasing + zero.') % (
                          c_found, c_lower, c_zeros
                      )

        #
        # Chars inputs
        #
        if char_dim:
            input_dim += char_lstm_dim
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')

            char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=True,
                                 name='char_lstm_for')
            char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=True,
                                 name='char_lstm_rev')

            char_lstm_for.link(char_layer.link(char_for_ids))
            char_lstm_rev.link(char_layer.link(char_rev_ids))

            char_for_output = char_lstm_for.h.dimshuffle((1, 0, 2))[
                T.arange(s_len), char_pos_ids
            ]
            char_rev_output = char_lstm_rev.h.dimshuffle((1, 0, 2))[
                T.arange(s_len), char_pos_ids
            ]

            inputs.append(char_for_output)
            if char_bidirect:
                inputs.append(char_rev_output)
                input_dim += char_lstm_dim

        #
        # Capitalization feature
        #
        if cap_dim:
            input_dim += cap_dim
            cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer')
            inputs.append(cap_layer.link(cap_ids))

        # Prepare final input
        inputs = T.concatenate(inputs, axis=1) if len(inputs) != 1 else inputs[0]

        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            input_test = (1 - dropout) * inputs
            inputs = T.switch(T.neq(is_train, 0), input_train, input_test)

        # LSTM for words
        word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False,
                             name='word_lstm_rev')
        word_lstm_for.link(inputs)
        word_lstm_rev.link(inputs[::-1, :])
        word_for_output = word_lstm_for.h
        word_rev_output = word_lstm_rev.h[::-1, :]
        if word_bidirect:
            final_output = T.concatenate(
                [word_for_output, word_rev_output],
                axis=1
            )
            tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim,
                                     name='tanh_layer', activation='tanh')
            final_output = tanh_layer.link(final_output)
        else:
            final_output = word_for_output

        # Sentence to Named Entity tags - Score
        final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer',
                                  activation=(None if crf else 'softmax'))
        tags_scores = final_layer.link(final_output)

        # No CRF
        if not crf:
            cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        # CRF
        else:
            transitions = shared((n_tags + 2, n_tags + 2), 'transitions')

            small = -1000
            b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
            e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
            observations = T.concatenate(
                [tags_scores, small * T.ones((s_len, 2))],
                axis=1
            )
            observations = T.concatenate(
                [b_s, observations, e_s],
                axis=0
            )

            # Score from tags
            real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()

            # Score from transitions
            b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
            e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
            padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)
            real_path_score += transitions[
                padded_tags_ids[T.arange(s_len + 1)],
                padded_tags_ids[T.arange(s_len + 1) + 1]
            ].sum()

            all_paths_scores = forward(observations, transitions)
            cost = - (real_path_score - all_paths_scores)

        # Network parameters
        params = []
        if word_dim:
            self.add_component(word_layer)

            # Supposedly the commented-out line below will stop
            # the model from updating the pretrained embeddings.
 
            # params.extend(word_layer.params)
        if char_dim:
            self.add_component(char_layer)
            self.add_component(char_lstm_for)
            params.extend(char_layer.params)
            params.extend(char_lstm_for.params)
            if char_bidirect:
                self.add_component(char_lstm_rev)
                params.extend(char_lstm_rev.params)
        self.add_component(word_lstm_for)
        params.extend(word_lstm_for.params)
        if word_bidirect:
            self.add_component(word_lstm_rev)
            params.extend(word_lstm_rev.params)
        if cap_dim:
            self.add_component(cap_layer)
            params.extend(cap_layer.params)
        self.add_component(final_layer)
        params.extend(final_layer.params)
        if crf:
            self.add_component(transitions)
            params.append(transitions)
        if word_bidirect:
            self.add_component(tanh_layer)
            params.extend(tanh_layer.params)

        # Prepare train and eval inputs
        eval_inputs = []
        if word_dim:
            eval_inputs.append(word_ids)
        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_bidirect:
                eval_inputs.append(char_rev_ids)
            eval_inputs.append(char_pos_ids)
        if cap_dim:
            eval_inputs.append(cap_ids)
        train_inputs = eval_inputs + [tag_ids]

        # Parse optimization method parameters
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}

        # Compile training function
        print 'Compiling...'
        if training:
            
            # "params" supposedly contains the pretrained embedding matrix that we are updating. 
            # Find the "get_updates" function and figure out what it does.  
            updates = Optimization(clip=5.0).get_updates(lr_method_name, cost, params, **lr_method_parameters)
            f_train = theano.function(
                inputs=train_inputs,
                outputs=cost,
                updates=updates,
                givens=({is_train: np.cast['int32'](1)} if dropout else {})
            )
            #========================================
            # FUNCTION TO PRINT PRETRAINED EMBEDDINGS
            # The function below takes one argument, which it prints
            # along with the specified print message.
            print_matrix = T.dmatrix() 
            print_op = printing.Print('print message') 
            printed_x = print_op(print_matrix)
            f_print = function([print_matrix], printed_x) 
            #========================================
        else:
            f_train = None
            f_print = None

        # We return a tuple of things used to print the embedding so that it looks nicer. 
        print_tuple = [f_print, word_layer.embeddings]

        # Compile evaluation function
        if not crf:
            f_eval = theano.function(
                inputs=eval_inputs,
                outputs=tags_scores,
                givens=({is_train: np.cast['int32'](0)} if dropout else {})
            )
        else:
            f_eval = theano.function(
                inputs=eval_inputs,
                outputs=forward(observations, transitions, viterbi=True,
                                return_alpha=False, return_best_sequence=True),
                givens=({is_train: np.cast['int32'](0)} if dropout else {})
            )

        return f_train, f_eval, print_tuple
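A sketch of how the returned print_tuple could be used from the training script to inspect the pretrained embedding matrix; model and parameters are assumed names, and f_print only exists when the network was built with training=True:

f_train, f_eval, print_tuple = model.build(**parameters)
f_print, embeddings = print_tuple
f_print(embeddings.get_value())   # dumps the current embedding matrix via printing.Print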
Example #15
 def p(self, j, name):
     return printing.Print(name)(j)
Example #16
 def printme(self, name, mat):
     return printing.Print('vector')(mat)
Example #17
 def printdim(self, name, mat):
     return printing.Print(name, attrs=['shape'])(mat)
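The attrs argument selects which attributes of the runtime value are printed; ['shape'] shows only the array's shape instead of its contents. A tiny illustrative sketch (assumed names):

import theano
import theano.tensor as T
from theano import printing

m = T.matrix('m')
m_dbg = printing.Print('m', attrs=['shape', 'dtype'])(m)
f = theano.function([m], m_dbg.sum())   # prints "m shape = ..." and "m dtype = ..." on every call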
Example #18
 def print_me(self, name, mat):
     mat = printing.Print('vector')(mat)
     return mat
Example #19
 def print_dim(self, name, mat):
     mat = printing.Print(name, attrs=['shape'])(mat)
     return mat
Example #20
def print_shape(A):
    print_op = printf.Print('vector', attrs=['shape'])
    printed = print_op(A)
    f = function([A], printed)
    return f(A)
Example #21
    def __theano_build__(self):
        E_x, E_y, V, U, W, b, c = self.E_x, self.E_y, self.V, self.U, self.W, self.b, self.c

        x = T.ivector('x')
        y = T.ivector('y')
        x_label = T.ivector('x_label')
        y_label = T.ivector('y_label')

        def forward_prop_step(x_t, y_t, s_t_prev):
            # Word embedding layer
            x_e = E_x[:, x_t]
            y_e = E_y[:, y_t]

            def GRU(i, U, W, b, x_0, s_prev):
                pb = printing.Print('b')

                b1 = T.specify_shape((self.conversion_ones * b[i * 3, :]).T, T.shape(x_0))
                b2 = T.specify_shape((self.conversion_ones * b[i * 3 + 1, :]).T, T.shape(x_0))
                b3 = T.specify_shape((self.conversion_ones * b[i * 3 + 2, :]).T, T.shape(x_0))

                z = T.nnet.hard_sigmoid(U[i * 3 + 0].dot(x_0) + W[i * 3 + 0].dot(s_prev) + b1)
                r = T.nnet.hard_sigmoid(U[i * 3 + 1].dot(x_0) + W[i * 3 + 1].dot(s_prev) + b2)
                c = T.tanh(U[i * 3 + 2].dot(x_0) + W[i * 3 + 2].dot(s_prev * r) + b3)

                return (T.ones_like(z) - z) * c + z * s_prev

            p_o = printing.Print('juju')

            s = [[], []]
            # GRU Layer 1
            s[0] = GRU(0, U, W, b, x_e, s_t_prev[0])

            # GRU Layer 2
            s[1] = GRU(1, U, W, b, y_e, s_t_prev[1])

            c_matrix = (self.conversion_ones * c).T
            juju = V.dot(s) + c_matrix
            ot = printing.Print("o_t")
            o_t = T.nnet.softmax(juju.T).T

            return [o_t, s]

        [o, s], updates = theano.scan(
            forward_prop_step,
            sequences=x,
            truncate_gradient=self.bptt_truncate,
            outputs_info=[None,
                          dict(initial=T.zeros(self.hidden_dim)),
                          dict(initial=T.zeros(self.hidden_dim)),
                          dict(initial=T.zeros(self.hidden_dim))])

        prediction = T.argmax(o, axis=1)
        o_error = T.sum(T.nnet.categorical_crossentropy(o, y))
        p_o = printing.Print('o_error')
        # Total cost (could add regularization here)
        cost = p_o(o_error)

        # Gradients
        dE_x = T.grad(cost, E_x)
        dE_y = T.grad(cost, E_y)
        dU = T.grad(cost, U)
        dW = T.grad(cost, W)
        db = T.grad(cost, b)
        dV = T.grad(cost, V)
        dc = T.grad(cost, c)

        # Assign functions
        self.predict = theano.function([x], [o], allow_input_downcast=True)
        self.predict_class = theano.function([x], prediction, allow_input_downcast=True)
        self.ce_error = theano.function([x, y], cost, allow_input_downcast=True)
        self.bptt = theano.function([x, y], [dE_x, dE_y, dU, dW, db, dV, dc], allow_input_downcast=True)

        # SGD parameters
        learning_rate = T.scalar('learning_rate')
        decay = T.scalar('decay')

        # rmsprop cache updates
        mE_x = decay * self.mE_x + (1 - decay) * dE_x ** 2
        mE_y = decay * self.mE_y + (1 - decay) * dE_y ** 2
        mU = decay * self.mU + (1 - decay) * dU ** 2
        mW = decay * self.mW + (1 - decay) * dW ** 2
        mV = decay * self.mV + (1 - decay) * dV ** 2
        mb = decay * self.mb + (1 - decay) * db ** 2
        mc = decay * self.mc + (1 - decay) * dc ** 2

        self.sgd_step = theano.function(
            [x, y, learning_rate, theano.In(decay, value=0.9)],
            [],
            updates=[(E_x, E_x - learning_rate * dE_x / T.sqrt(mE_x + 1e-6)),
                     (E_y, E_y - learning_rate * dE_y / T.sqrt(mE_y + 1e-6)),
                     (U, U - learning_rate * dU / T.sqrt(mU + 1e-6)),
                     (W, W - learning_rate * dW / T.sqrt(mW + 1e-6)),
                     (V, V - learning_rate * dV / T.sqrt(mV + 1e-6)),
                     (b, b - learning_rate * db / T.sqrt(mb + 1e-6)),
                     (c, c - learning_rate * dc / T.sqrt(mc + 1e-6)),
                     (self.mE_x, mE_x),
                     (self.mE_y, mE_y),
                     (self.mU, mU),
                     (self.mW, mW),
                     (self.mV, mV),
                     (self.mb, mb),
                     (self.mc, mc)
                     ], allow_input_downcast=True)
Example #22
    def create_gradientfunctions(self, train_data, train_labels, val_data,
                                 val_labels):
        """This function takes as input the whole dataset and creates the entire model"""
        def encodingstep(x_t, h_t):
            z_t = T.nnet.sigmoid(
                T.dot(x_t, self.params['U_z']) +
                T.dot(h_t, self.params['W_z']).squeeze() +
                self.params['b_z'].squeeze())
            r_t = T.nnet.sigmoid(
                T.dot(x_t, self.params['U_r']) +
                T.dot(h_t, self.params['W_r']).squeeze() +
                self.params['b_r'].squeeze())
            h = T.tanh(
                T.dot(x_t, self.params['U_h']) +
                T.dot(h_t * r_t, self.params['W_h']) +
                self.params['b_h'].squeeze())
            new_h_t = (1 - z_t) * h + z_t * h_t
            return new_h_t

        x = T.tensor3("x")

        h0_enc = T.matrix("h0_enc")
        result, _ = theano.scan(encodingstep, sequences=x, outputs_info=h0_enc)

        h_encoder = result[-1]

        #log sigma encoder is squared
        mu_encoder = T.dot(
            h_encoder, self.params["W_hmu"]) + self.params["b_hmu"].squeeze()
        log_sigma_encoder = T.dot(
            h_encoder,
            self.params["W_hsigma"]) + self.params["b_hsigma"].squeeze()

        #Use a very wide prior to make it possible to learn something with Z
        #logpz = 0.005 * T.sum(1 + log_sigma_encoder - mu_encoder**2 - T.exp(log_sigma_encoder), axis = 1)
        logpz = 0.5 * T.sum(
            1 + log_sigma_encoder - mu_encoder**2 - T.exp(log_sigma_encoder),
            axis=1)

        if "gpu" in theano.config.device:
            srng = theano.sandbox.cuda.rng_curand.CURAND_RandomStreams()
        else:
            srng = T.shared_randomstreams.RandomStreams()

        #Reparametrize Z
        eps = srng.normal((x.shape[1], self.latent_variables),
                          avg=0.0,
                          std=1.0,
                          dtype=theano.config.floatX)
        z = mu_encoder + T.exp(0.5 * log_sigma_encoder) * eps

        h0_dec = T.tanh(
            T.dot(z, self.params["W_zh"]) + self.params["b_zh"].squeeze())

        def decodingstep(x_t, h_t):
            z_dec_t = T.nnet.sigmoid(
                T.dot(x_t, self.params['U_dec_z']) +
                T.dot(h_t, self.params['W_dec_z']) +
                self.params['b_dec_z'].squeeze())
            r_dec_t = T.nnet.sigmoid(
                T.dot(x_t, self.params['U_dec_r']) +
                T.dot(h_t, self.params['W_dec_r']) +
                self.params['b_dec_r'].squeeze())
            h = T.tanh(
                T.dot(x_t, self.params['U_dec_h']) +
                T.dot(h_t * r_dec_t, self.params['W_dec_h']) +
                self.params['b_dec_h'].squeeze())
            new_h_t = (1 - z_dec_t) * h + z_dec_t * h_t
            new_x_t = T.tanh(
                h.dot(self.params["W_hx"]) + self.params["b_hx"].squeeze())
            return new_x_t, new_h_t

        x0 = T.matrix("x0")
        [y, _], _ = theano.scan(decodingstep,
                                n_steps=x.shape[0],
                                outputs_info=[x0, h0_dec])

        # Clip y to avoid NaNs, necessary when lowerbound goes to 0
        # 128 x 8 x 35
        y = T.clip(y, -1 + 1e-6, 1 - 1e-6)
        logpxz = -T.sum(T.pow(y - x, 2), axis=0)
        logpxz = T.mean(logpxz, axis=1)

        #Average over batch dimension
        logpx = T.mean(logpxz + logpz)

        #Driver output
        batch_start = T.iscalar('batch_start')
        batch_end = T.iscalar('batch_end')
        labels = T.ivector('labels')
        train_labels = theano.shared(train_labels.astype('int32'))
        val_labels = theano.shared(val_labels.astype('int32'))
        keep_prob = T.scalar(dtype=theano.config.floatX)

        mask = self.srng.binomial(p=keep_prob,
                                  size=(self.hidden_units_encoder, )).astype(
                                      theano.config.floatX) / keep_prob
        printer = printing.Print('')

        driver_output = T.nnet.softmax(
            T.dot(h_encoder * mask, self.params['W_driver']) +
            self.params['b_driver'].squeeze())

        max_minus_min = (driver_output.max(axis=0) -
                         driver_output.min(axis=0)).sum()
        var = (driver_output.var(axis=0)).sum()
        mean = (driver_output.mean(axis=0)).sum()

        cross_entropy = T.nnet.categorical_crossentropy(driver_output, labels)

        driver_loss = (-T.mean(cross_entropy))
        l1_loss = (-T.sum([T.sum(abs(v)) for v in self.params.values()]))
        l2_loss = (-T.sum([T.sum(v**2) for v in self.params.values()]))

        #Compute all the gradients
        total_loss = ((1 - self.lamda1) * logpx + self.lamda1 * driver_loss +
                      self.lamda_l2 * l2_loss + self.lamda_l1 * l1_loss)
        gradients = T.grad(total_loss,
                           self.params.values(),
                           disconnected_inputs='ignore')

        #Let Theano handle the updates on parameters for speed
        updates = OrderedDict()
        epoch = T.iscalar("epoch")
        gamma = (T.sqrt(1 - (1 - self.b2)**epoch) /
                 (1 - (1 - self.b1)**epoch)).astype(theano.config.floatX)

        #Adam
        for parameter, gradient, m, v in zip(self.params.values(), gradients,
                                             self.m.values(), self.v.values()):
            new_m = self.b1 * gradient + (1 - self.b1) * m
            new_v = self.b2 * (gradient**2) + (1 - self.b2) * v

            updates[
                parameter] = parameter + self.learning_rate * gamma * new_m / (
                    T.sqrt(new_v) + 1e-8)
            updates[m] = new_m
            updates[v] = new_v

        train_data = theano.shared(train_data.transpose(1, 0, 2)).astype(
            theano.config.floatX)

        givens = {
            h0_enc:
            T.zeros((batch_end - batch_start,
                     self.hidden_units_encoder)).astype(theano.config.floatX),
            x0:
            T.zeros((batch_end - batch_start,
                     self.features)).astype(theano.config.floatX),
            x:
            train_data[:, batch_start:batch_end, :],
            labels:
            train_labels[batch_start:batch_end],
            keep_prob:
            self.keep_prob
        }

        self.updatefunction = theano.function([epoch, batch_start, batch_end],
                                              [logpxz.mean(), driver_loss],
                                              updates=updates,
                                              givens=givens,
                                              allow_input_downcast=True)

        x_val = theano.shared(val_data.transpose(1, 0, 2)).astype(
            theano.config.floatX)
        givens[x] = x_val[:, batch_start:batch_end, :]
        givens[labels] = val_labels[batch_start:batch_end]
        givens[keep_prob] = np.array(1.0).astype(theano.config.floatX)
        self.likelihood = theano.function(
            [batch_start, batch_end],
            [logpxz.mean(), driver_loss, max_minus_min, var, mean],
            givens=givens)

        x_test = T.tensor3("x_test")
        test_givens = {
            x:
            x_test,
            h0_enc:
            T.zeros((x_test.shape[1],
                     self.hidden_units_encoder)).astype(theano.config.floatX),
        }

        self.encoder = theano.function([x_test], h_encoder, givens=test_givens)
        h_e = T.matrix('h_e')
        self.driver_predict = theano.function([h_e],
                                              driver_output,
                                              givens={
                                                  h_encoder:
                                                  h_e,
                                                  keep_prob:
                                                  np.array(1.0).astype(
                                                      theano.config.floatX)
                                              })

        return True
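Note that the printer op above is instantiated but never applied, so it prints nothing; to see values it would have to wrap a variable that stays in the compiled graph, for example (illustrative only):

driver_output = printer(driver_output)   # identity on the value; prints it whenever the graph is evaluated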
Example #23
test_data = [[[1, 2], [2, 3], [3, 4]], [[2, 1], [2, 2], [3, 3]]]
shared_x = theano.shared(np.asarray(test_data, dtype=theano.config.floatX),
                         borrow=True)

inpt = T.matrix("inpt")

cost = inpt * inpt

i = T.lscalar("i")
fn = theano.function([i], cost, givens={inpt: shared_x[i]})
z = fn(0)
#print z

test_q = np.array([[1, 2], [2, 3]])

x = T.lmatrix("x")

hello_world_op = printing.Print('hello world')
printed_x = hello_world_op(x)
f = theano.function([x], printed_x)

shared_xx = shared_x.reshape((6, 2))
shared_q = theano.shared(np.asarray(test_q, dtype=theano.config.floatX),
                         borrow=True)
print x.shape.eval({x: test_q})
print theano.function(inputs=[x], outputs=x.shape)(test_q)
print test_q.shape

fx = T.dmatrix("f")
print theano.function(inputs=[fx], outputs=fx.shape)(shared_xx.eval())
Example #24
 def p(j, name):
     return printing.Print(name)(j)
Example #25
    def __init__(self, data_dir, word2vec, word_vector_size, truncate_gradient,
                 learning_rate, dim, cnn_dim, cnn_dim_fc, story_len, patches,
                 mode, answer_module, memory_hops, batch_size, l2,
                 normalize_attention, batch_norm, dropout, **kwargs):

        print "==> not used params in DMN class:", kwargs.keys()

        self.data_dir = data_dir
        self.learning_rate = learning_rate

        self.truncate_gradient = truncate_gradient
        self.word2vec = word2vec
        self.word_vector_size = word_vector_size
        self.dim = dim
        self.cnn_dim = cnn_dim
        self.cnn_dim_fc = cnn_dim_fc
        self.story_len = story_len
        self.mode = mode
        self.patches = patches
        self.answer_module = answer_module
        self.memory_hops = memory_hops
        self.batch_size = batch_size
        self.l2 = l2
        self.normalize_attention = normalize_attention
        self.batch_norm = batch_norm
        self.dropout = dropout

        self.vocab, self.ivocab = self._load_vocab(self.data_dir)

        self.train_story = None
        self.test_story = None
        self.train_dict_story, self.train_lmdb_env_fc, self.train_lmdb_env_conv = self._process_input_sind(
            self.data_dir, 'train')
        self.test_dict_story, self.test_lmdb_env_fc, self.test_lmdb_env_conv = self._process_input_sind(
            self.data_dir, 'val')

        self.train_story = self.train_dict_story.keys()
        self.test_story = self.test_dict_story.keys()
        self.vocab_size = len(self.vocab)

        # Since this is pretty expensive, we will pass a story each time.
        # We assume that the input has been processed such that the sequences of patches
        # are snake like path.

        self.input_var = T.tensor4(
            'input_var')  # (batch_size, seq_len, patches, cnn_dim)
        self.q_var = T.matrix('q_var')  # Now, it's a batch * image_size.
        self.answer_var = T.imatrix(
            'answer_var')  # answer of example in minibatch
        self.answer_mask = T.matrix('answer_mask')
        self.answer_inp_var = T.tensor3(
            'answer_inp_var')  # answer of example in minibatch

        print "==> building input module"
        self.W_inp_emb_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim,
                                                         self.cnn_dim))
        #self.b_inp_emb_in = nn_utils.constant_param(value=0.0, shape=(self.dim,))
        # First, we embed the visual features before sending it to the bi-GRUs.

        inp_rhp = T.reshape(
            self.input_var,
            (self.batch_size * self.story_len * self.patches, self.cnn_dim))
        inp_rhp_dimshuffled = inp_rhp.dimshuffle(1, 0)
        inp_rhp_emb = T.dot(self.W_inp_emb_in, inp_rhp_dimshuffled)
        inp_rhp_emb_dimshuffled = inp_rhp_emb.dimshuffle(1, 0)
        inp_emb_raw = T.reshape(
            inp_rhp_emb_dimshuffled,
            (self.batch_size, self.story_len, self.patches, self.cnn_dim))
        inp_emb = T.tanh(
            inp_emb_raw
        )  # Just follow the paper DMN for visual and textual QA.

        # Now, we use a bi-directional GRU to produce the input.
        # Forward GRU.
        self.inp_dim = self.dim / 2  # since we have forward and backward
        self.W_inpf_res_in = nn_utils.normal_param(std=0.1,
                                                   shape=(self.inp_dim,
                                                          self.cnn_dim))
        self.W_inpf_res_hid = nn_utils.normal_param(std=0.1,
                                                    shape=(self.inp_dim,
                                                           self.inp_dim))
        self.b_inpf_res = nn_utils.constant_param(value=0.0,
                                                  shape=(self.inp_dim, ))

        self.W_inpf_upd_in = nn_utils.normal_param(std=0.1,
                                                   shape=(self.inp_dim,
                                                          self.cnn_dim))
        self.W_inpf_upd_hid = nn_utils.normal_param(std=0.1,
                                                    shape=(self.inp_dim,
                                                           self.inp_dim))
        self.b_inpf_upd = nn_utils.constant_param(value=0.0,
                                                  shape=(self.inp_dim, ))

        self.W_inpf_hid_in = nn_utils.normal_param(std=0.1,
                                                   shape=(self.inp_dim,
                                                          self.cnn_dim))
        self.W_inpf_hid_hid = nn_utils.normal_param(std=0.1,
                                                    shape=(self.inp_dim,
                                                           self.inp_dim))
        self.b_inpf_hid = nn_utils.constant_param(value=0.0,
                                                  shape=(self.inp_dim, ))
        # Backward GRU.
        self.W_inpb_res_in = nn_utils.normal_param(std=0.1,
                                                   shape=(self.inp_dim,
                                                          self.cnn_dim))
        self.W_inpb_res_hid = nn_utils.normal_param(std=0.1,
                                                    shape=(self.inp_dim,
                                                           self.inp_dim))
        self.b_inpb_res = nn_utils.constant_param(value=0.0,
                                                  shape=(self.inp_dim, ))

        self.W_inpb_upd_in = nn_utils.normal_param(std=0.1,
                                                   shape=(self.inp_dim,
                                                          self.cnn_dim))
        self.W_inpb_upd_hid = nn_utils.normal_param(std=0.1,
                                                    shape=(self.inp_dim,
                                                           self.inp_dim))
        self.b_inpb_upd = nn_utils.constant_param(value=0.0,
                                                  shape=(self.inp_dim, ))

        self.W_inpb_hid_in = nn_utils.normal_param(std=0.1,
                                                   shape=(self.inp_dim,
                                                          self.cnn_dim))
        self.W_inpb_hid_hid = nn_utils.normal_param(std=0.1,
                                                    shape=(self.inp_dim,
                                                           self.inp_dim))
        self.b_inpb_hid = nn_utils.constant_param(value=0.0,
                                                  shape=(self.inp_dim, ))

        # Now, we use the GRU to build the inputs.
        # Two levels of nested scan are unnecessary. It would become too complicated. Just use this one.
        inp_dummy = theano.shared(
            np.zeros((self.inp_dim, self.story_len), dtype=floatX))
        for i in range(self.batch_size):
            if i == 0:
                inp_1st_f, _ = theano.scan(
                    fn=self.input_gru_step_forward,
                    sequences=inp_emb[i, :].dimshuffle(1, 2, 0),
                    outputs_info=T.zeros_like(inp_dummy),
                    truncate_gradient=self.truncate_gradient)

                inp_1st_b, _ = theano.scan(
                    fn=self.input_gru_step_backward,
                    sequences=inp_emb[i, :, ::-1, :].dimshuffle(1, 2, 0),
                    outputs_info=T.zeros_like(inp_dummy),
                    truncate_gradient=self.truncate_gradient)
                # Now, combine them.
                inp_1st = T.concatenate([
                    inp_1st_f.dimshuffle(2, 0, 1),
                    inp_1st_b.dimshuffle(2, 0, 1)
                ],
                                        axis=-1)
                self.inp_c = inp_1st.dimshuffle('x', 0, 1, 2)
            else:
                inp_f, _ = theano.scan(
                    fn=self.input_gru_step_forward,
                    sequences=inp_emb[i, :].dimshuffle(1, 2, 0),
                    outputs_info=T.zeros_like(inp_dummy),
                    truncate_gradient=self.truncate_gradient)

                inp_b, _ = theano.scan(
                    fn=self.input_gru_step_backward,
                    sequences=inp_emb[i, :, ::-1, :].dimshuffle(1, 2, 0),
                    outputs_info=T.zeros_like(inp_dummy),
                    truncate_gradient=self.truncate_gradient)
                # Now, combine them.
                inp_fb = T.concatenate(
                    [inp_f.dimshuffle(2, 0, 1),
                     inp_b.dimshuffle(2, 0, 1)],
                    axis=-1)
                self.inp_c = T.concatenate(
                    [self.inp_c, inp_fb.dimshuffle('x', 0, 1, 2)], axis=0)
        # Done, now self.inp_c should be batch_size x story_len x patches x cnn_dim
        # Eventually, we can flatten them.
        # Now, the input dimension is 1024 because we have forward and backward.
        inp_c_t = T.reshape(
            self.inp_c,
            (self.batch_size, self.story_len * self.patches, self.dim))
        inp_c_t_dimshuffled = inp_c_t.dimshuffle(0, 'x', 1, 2)
        inp_batch = T.repeat(inp_c_t_dimshuffled, self.story_len, axis=1)
        # Now, its ready for all the 5 images in the same story.
        # 50 * 980 * 512
        self.inp_batch = T.reshape(inp_batch,
                                   (inp_batch.shape[0] * inp_batch.shape[1],
                                    inp_batch.shape[2], inp_batch.shape[3]))
        self.inp_batch_dimshuffled = self.inp_batch.dimshuffle(
            1, 2, 0)  # 980 x 512 x 50

        # It's very simple now, the input module just need to map from cnn_dim to dim.
        logging.info('self.cnn_dim = %d', self.cnn_dim)

        print "==> building question module"
        # First is for the global glimpse.

        q_var_3 = T.reshape(self.q_var,
                            (self.batch_size, self.story_len, self.cnn_dim_fc))

        q_var_shuffled = q_var_3.dimshuffle(
            1, 2, 0)  # now: story_len * image_size * batch_size

        # This is the RNN used to produce the Global Glimpse
        self.W_qf_res_in = nn_utils.normal_param(std=0.1,
                                                 shape=(self.dim,
                                                        self.cnn_dim_fc))
        self.W_qf_res_hid = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.b_qf_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_qf_upd_in = nn_utils.normal_param(std=0.1,
                                                 shape=(self.dim,
                                                        self.cnn_dim_fc))
        self.W_qf_upd_hid = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.b_qf_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_qf_hid_in = nn_utils.normal_param(std=0.1,
                                                 shape=(self.dim,
                                                        self.cnn_dim_fc))
        self.W_qf_hid_hid = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.b_qf_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
        inp_dummy = theano.shared(
            np.zeros((self.dim, self.batch_size), dtype=floatX))

        q_glb, _ = theano.scan(fn=self.q_gru_step_forward,
                               sequences=q_var_shuffled,
                               outputs_info=[T.zeros_like(inp_dummy)],
                               truncate_gradient=self.truncate_gradient)
        q_glb_shuffled = q_glb.dimshuffle(2, 0,
                                          1)  # batch_size * seq_len * dim
        q_glb_last = q_glb_shuffled[:, -1, :]  # batch_size * dim

        # Now, we also need to add the global glimpse, thus we need to use the rnn to build the attention glimpse.
        # Now, share the parameter with the input module.
        self.W_inp_emb_q = nn_utils.normal_param(std=0.1,
                                                 shape=(self.dim,
                                                        self.cnn_dim_fc))
        self.b_inp_emb_q = nn_utils.normal_param(std=0.1, shape=(self.dim, ))
        q_var_shuffled = self.q_var.dimshuffle(1, 0)

        inp_q = T.dot(
            self.W_inp_emb_q, q_var_shuffled) + self.b_inp_emb_q.dimshuffle(
                0, 'x')  # 512 x 50
        self.q_q = T.tanh(
            inp_q
        )  # Since this is used to initialize the memory, we need to make it tanh.

        print "==> creating parameters for memory module"
        self.W_mem_res_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_res_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_mem_upd_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_upd_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_mem_hid_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_hid_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        #self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        self.W_1 = nn_utils.normal_param(std=0.1,
                                         shape=(self.dim, 7 * self.dim + 0))
        self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim))
        self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
        self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, ))

        print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops
        memory = [self.q_q.copy()]
        for iter in range(1, self.memory_hops + 1):
            #m = printing.Print('mem')(memory[iter-1])
            current_episode = self.new_episode(memory[iter - 1])
            #current_episode = self.new_episode(m)
            #current_episode = printing.Print('current_episode')(current_episode)
            memory.append(
                self.GRU_update(memory[iter - 1], current_episode,
                                self.W_mem_res_in, self.W_mem_res_hid,
                                self.b_mem_res, self.W_mem_upd_in,
                                self.W_mem_upd_hid, self.b_mem_upd,
                                self.W_mem_hid_in, self.W_mem_hid_hid,
                                self.b_mem_hid))

        last_mem_raw = memory[-1].dimshuffle((1, 0))

        net = layers.InputLayer(shape=(self.batch_size * self.story_len,
                                       self.dim),
                                input_var=last_mem_raw)

        if self.batch_norm:
            net = layers.BatchNormLayer(incoming=net)
        if self.dropout > 0 and self.mode == 'train':
            net = layers.DropoutLayer(net, p=self.dropout)
        last_mem = layers.get_output(net).dimshuffle((1, 0))

        logging.info('last_mem size')
        print last_mem.shape.eval({
            self.input_var:
            np.random.rand(10, 5, 196, 512).astype('float32'),
            self.q_var:
            np.random.rand(50, 4096).astype('float32')
        })

        print "==> building answer module"

        answer_inp_var_shuffled = self.answer_inp_var.dimshuffle(1, 2, 0)
        # Now we need to map last_mem to a new space.
        self.W_mem_emb = nn_utils.normal_param(std=0.1,
                                               shape=(self.dim, self.dim * 3))
        self.W_inp_emb = nn_utils.normal_param(std=0.1,
                                               shape=(self.dim,
                                                      self.vocab_size + 1))

        def _dot2(x, W):
            return T.dot(W, x)

        answer_inp_var_shuffled_emb, _ = theano.scan(
            fn=_dot2,
            sequences=answer_inp_var_shuffled,
            non_sequences=self.W_inp_emb,
            truncate_gradient=self.truncate_gradient)  # seq x dim x batch

        # Now we also need to embed the image and use it to build the memory input for the answer module.
        #q_q_shuffled = self.q_q.dimshuffle(1,0) # dim * batch.
        q_glb_dim = q_glb_last.dimshuffle(0, 'x', 1)  # batch_size * 1 * dim
        q_glb_repmat = T.repeat(q_glb_dim, self.story_len,
                                1)  # batch_size * len * dim
        q_glb_rhp = T.reshape(q_glb_repmat,
                              (q_glb_repmat.shape[0] * q_glb_repmat.shape[1],
                               q_glb_repmat.shape[2]))

        init_ans = T.concatenate(
            [self.q_q, last_mem,
             q_glb_rhp.dimshuffle(1, 0)], axis=0)

        mem_ans = T.dot(self.W_mem_emb, init_ans)  # dim x batchsize.
        mem_ans = printing.Print('mem_ans')(mem_ans)  # debug print of the answer-memory embedding

        mem_ans_dim = mem_ans.dimshuffle('x', 0, 1)

        answer_inp = T.concatenate([mem_ans_dim, answer_inp_var_shuffled_emb],
                                   axis=0)

        # Now that we have both embeddings, we can feed them to the answer RNN.

        # We also need an initial hidden state and a projection back to the vocabulary.

        dummy = theano.shared(
            np.zeros((self.dim, self.batch_size * self.story_len),
                     dtype=floatX))

        self.W_a = nn_utils.normal_param(std=0.1,
                                         shape=(self.vocab_size + 1, self.dim))

        self.W_ans_res_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_ans_res_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_ans_upd_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_ans_upd_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_ans_hid_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_ans_hid_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        logging.info('answer_inp size')

        #print answer_inp.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'),
        #    self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32'),
        #    self.q_var: np.random.rand(10, 4096).astype('float32')})

        #last_mem = printing.Print('prob_sm')(last_mem)
        results, _ = theano.scan(fn=self.answer_gru_step,
                                 sequences=answer_inp,
                                 outputs_info=[dummy],
                                 truncate_gradient=self.truncate_gradient)
        # Assume there is a start token
        #print results.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'),
        #    self.q_var: np.random.rand(10, 4096).astype('float32'),
        #    self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')}, on_unused_input='ignore')
        results = results[1:-1, :, :]  # get rid of the last token as well as the first one (image)
        #print results.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'),
        #    self.q_var: np.random.rand(10, 4096).astype('float32'),
        #    self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')}, on_unused_input='ignore')

        # Now we need to transform the RNN outputs into vocabulary probabilities.

        prob, _ = theano.scan(fn=lambda x, w: T.dot(w, x),
                              sequences=results,
                              non_sequences=self.W_a,
                              truncate_gradient=self.truncate_gradient)

        prob_shuffled = prob.dimshuffle(2, 0, 1)  # b * len * vocab

        logging.info("prob shape.")
        #print prob.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'),
        #    self.q_var: np.random.rand(10, 4096).astype('float32'),
        #    self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')})

        n = prob_shuffled.shape[0] * prob_shuffled.shape[1]
        prob_rhp = T.reshape(prob_shuffled, (n, prob_shuffled.shape[2]))
        prob_sm = nn_utils.softmax_(prob_rhp)
        self.prediction = prob_sm

        mask = T.reshape(self.answer_mask, (n, ))
        lbl = T.reshape(self.answer_var, (n, ))

        self.params = [
            self.W_inp_emb_in,  #self.b_inp_emb_in, 
            self.W_inpf_res_in,
            self.W_inpf_res_hid,
            self.b_inpf_res,
            self.W_inpf_upd_in,
            self.W_inpf_upd_hid,
            self.b_inpf_upd,
            self.W_inpf_hid_in,
            self.W_inpf_hid_hid,
            self.b_inpf_hid,
            self.W_inpb_res_in,
            self.W_inpb_res_hid,
            self.b_inpb_res,
            self.W_inpb_upd_in,
            self.W_inpb_upd_hid,
            self.b_inpb_upd,
            self.W_inpb_hid_in,
            self.W_inpb_hid_hid,
            self.b_inpb_hid,
            self.W_qf_res_in,
            self.W_qf_res_hid,
            self.b_qf_res,
            self.W_qf_upd_in,
            self.W_qf_upd_hid,
            self.b_qf_upd,
            self.W_qf_hid_in,
            self.W_qf_hid_hid,
            self.b_qf_hid,
            self.W_inp_emb_q,
            self.b_inp_emb_q,
            self.W_mem_res_in,
            self.W_mem_res_hid,
            self.b_mem_res,
            self.W_mem_upd_in,
            self.W_mem_upd_hid,
            self.b_mem_upd,
            self.W_mem_hid_in,
            self.W_mem_hid_hid,
            self.b_mem_hid,  #self.W_b
            self.W_1,
            self.W_2,
            self.b_1,
            self.b_2,
            self.W_a,
            self.W_mem_emb,
            self.W_inp_emb,
            self.W_ans_res_in,
            self.W_ans_res_hid,
            self.b_ans_res,
            self.W_ans_upd_in,
            self.W_ans_upd_hid,
            self.b_ans_upd,
            self.W_ans_hid_in,
            self.W_ans_hid_hid,
            self.b_ans_hid,
        ]

        print "==> building loss layer and computing updates"
        loss_vec = T.nnet.categorical_crossentropy(prob_sm, lbl)
        self.loss_ce = (mask * loss_vec).sum() / mask.sum()

        #self.loss_ce = T.nnet.categorical_crossentropy(results_rhp, lbl)

        if self.l2 > 0:
            self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params)
        else:
            self.loss_l2 = 0

        self.loss = self.loss_ce + self.loss_l2

        updates = lasagne.updates.adadelta(self.loss,
                                           self.params,
                                           learning_rate=self.learning_rate)
        #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.001)

        if self.mode == 'train':
            print "==> compiling train_fn"
            self.train_fn = theano.function(
                inputs=[
                    self.input_var, self.q_var, self.answer_var,
                    self.answer_mask, self.answer_inp_var
                ],
                outputs=[self.prediction, self.loss],
                updates=updates)

        print "==> compiling test_fn"
        self.test_fn = theano.function(inputs=[
            self.input_var, self.q_var, self.answer_var, self.answer_mask,
            self.answer_inp_var
        ],
                                       outputs=[self.prediction, self.loss])
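# A minimal NumPy sketch of the masked cross-entropy computed in the example above:
# per-token negative log-likelihood, averaged only over positions where the answer
# mask is 1 (function and argument names here are illustrative).
import numpy as np

def masked_xent(prob_sm, lbl, mask):
    # prob_sm: (n, vocab) softmax rows; lbl: (n,) integer labels; mask: (n,) 0/1 flags
    nll = -np.log(prob_sm[np.arange(len(lbl)), lbl] + 1e-8)
    return (mask * nll).sum() / mask.sum()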
Example #26
import theano.printing as TP  # needed for TP.Print below


def watch(x):
    """Drop into an ipdb shell whenever the wrapped symbolic variable is evaluated."""
    def func(_, x):
        import ipdb; ipdb.set_trace()
    return TP.Print(global_fn=func)(x)
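# A hypothetical usage sketch for watch(): evaluating the wrapped expression drops
# into an ipdb shell via the Print op's global_fn hook (assumes ipdb is installed).
import theano
import theano.tensor as T

x = T.vector('x')
f = theano.function([x], watch(x * 2))
# f([1.0, 2.0])  # uncomment to break into the debugger when the graph is evaluated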
Example #27
    def __theano_build__(self):
        E, V, U, W, b, c, v = self.E, self.V, self.U, self.W, self.b, self.c, self.v
        x = T.dvector('x')
        y = T.dvector('y')

        def forward_prop_step(x_t, s_t1_prev, s_t2_prev, s_t3_prev):
            p_o = printing.Print('juju')
            # Word embedding layer
            x_e = x_t.dot(E).T + v

            def GRU(i, U, W, b, x_0, s_prev):
                b1 = b[i * 3, :]
                b2 = b[i * 3 + 1, :]
                b3 = b[i * 3 + 2, :]

                z = T.nnet.hard_sigmoid(U[i * 3 + 0].dot(x_0) + W[i * 3 + 0].dot(s_prev) + b1)
                r = T.nnet.hard_sigmoid(U[i * 3 + 1].dot(x_0) + W[i * 3 + 1].dot(s_prev) + b2)
                c = T.tanh(U[i * 3 + 2].dot(x_0) + W[i * 3 + 2].dot(s_prev * r) + b3)

                return ((T.ones_like(z) - z) * c + z * s_prev).astype(theano.config.floatX)

            p_o = printing.Print('juju')
            s = [[], [], []]
            # GRU Layer 1
            s[0] = GRU(0, U, W, b, x_e, s_t1_prev)

            # GRU Layer 2
            s[1] = GRU(1, U, W, b, s[0], s_t2_prev)

            # GRU Layer 3
            s[2] = GRU(2, U, W, b, s[1], s_t3_prev)

            # Final output calculation

            o_t = (V.dot(s[2]) + c)[0]

            return [o_t, s[0], s[1], s[2]]

        # p_o = printing.Print('prediction')
        [o, s, s2, s3], updates = theano.scan(
            forward_prop_step,
            sequences=x,
            truncate_gradient=self.bptt_truncate,
            outputs_info=[None,
                          dict(initial=T.zeros(self.hidden_dim)),
                          dict(initial=T.zeros(self.hidden_dim)),
                          dict(initial=T.zeros(self.hidden_dim))])

        p_o = printing.Print('o')
        # p_y = printing.Print('y')
        prediction = o
        e = prediction - y
        o_last = o[-1]
        o_error = T.sum(T.pow(prediction.T - y, 2)) / (2 * T.shape(y)[0])  # y is a vector, so its length is shape[0]
        # Total cost (could add regularization here)
        cost = o_error

        # Gradients
        dE = T.grad(cost, E)
        dU = T.grad(cost, U)
        dW = T.grad(cost, W)
        db = T.grad(cost, b)
        dV = T.grad(cost, V)
        dc = T.grad(cost, c)
        dv = T.grad(cost, v)

        # Assign functions
        self.predict = theano.function([x], [o])
        self.predict_last = theano.function([x], [o_last])
        self.predict_class = theano.function([x, y], [prediction, e], allow_input_downcast=True)
        self.error = theano.function([x, y], e)
        self.ce_error = theano.function([x, y], cost, allow_input_downcast=True)
        self.bptt = theano.function([x, y], [dE, dU, dW, db, dV, dc], allow_input_downcast=True)

        # SGD parameters
        learning_rate = T.scalar('learning_rate')
        decay = T.scalar('decay')

        # rmsprop cache updates
        mE = (decay * self.mE + (1 - decay) * dE ** 2).astype(theano.config.floatX)
        mU = (decay * self.mU + (1 - decay) * dU ** 2).astype(theano.config.floatX)
        mW = (decay * self.mW + (1 - decay) * dW ** 2).astype(theano.config.floatX)
        mV = (decay * self.mV + (1 - decay) * dV ** 2).astype(theano.config.floatX)
        mb = (decay * self.mb + (1 - decay) * db ** 2).astype(theano.config.floatX)
        mc = (decay * self.mc + (1 - decay) * dc ** 2).astype(theano.config.floatX)
        mv = (decay * self.mv + (1 - decay) * dv ** 2).astype(theano.config.floatX)

        self.sgd_step = theano.function(
            [x, y, learning_rate, theano.In(decay, value=0.9)],
            [],
            updates=[(E, E - (learning_rate * dE / T.sqrt(mE + 1e-6)).astype(theano.config.floatX)),
                     (U, U - (learning_rate * dU / T.sqrt(mU + 1e-6)).astype(theano.config.floatX)),
                     (W, W - (learning_rate * dW / T.sqrt(mW + 1e-6)).astype(theano.config.floatX)),
                     (V, V - (learning_rate * dV / T.sqrt(mV + 1e-6)).astype(theano.config.floatX)),
                     (b, b - (learning_rate * db / T.sqrt(mb + 1e-6)).astype(theano.config.floatX)),
                     (c, c - (learning_rate * dc / T.sqrt(mc + 1e-6)).astype(theano.config.floatX)),
                     # also step v, whose gradient dv and rmsprop cache mv are computed above
                     (v, v - (learning_rate * dv / T.sqrt(mv + 1e-6)).astype(theano.config.floatX)),
                     (self.mE, mE),
                     (self.mU, mU),
                     (self.mW, mW),
                     (self.mV, mV),
                     (self.mb, mb),
                     (self.mc, mc),
                     (self.mv, mv)
                     ], allow_input_downcast=True)
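# A small NumPy sketch of the rmsprop-style update used by sgd_step above: keep a
# running average of squared gradients and scale each step by its square root
# (decay and the 1e-6 stabilizer mirror the values above; names are illustrative).
import numpy as np

def rmsprop_step(param, grad, cache, learning_rate, decay=0.9):
    cache = decay * cache + (1 - decay) * grad ** 2
    param = param - learning_rate * grad / np.sqrt(cache + 1e-6)
    return param, cache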
Example #28
from keras import backend as K
from keras.models import Sequential
from keras.engine.topology import Layer
from keras.engine import InputSpec
from keras import initializations, activations
import theano
import theano.tensor as T
import theano.printing as printing
from keras.backend.common import _EPSILON
from theano.tensor.shared_randomstreams import RandomStreams
import numpy as np
epsilon = 0.1
p = printing.Print('x')

# USE THE FLAGS BELOW FOR DEBUGGING
# theano.config.optimizer = 'None'
# theano.config.exception_verbosity = 'high'
# theano.config.optimizer = 'fast_compile'


def get_vector(curr_word, new_sense, W_g, W_s):
    cond = T.eq(new_sense, -1)
    return T.switch(cond, W_g[curr_word], W_s[curr_word, new_sense])


# update the sense of a word in the context vector
def change_context_vec(vect, new_sense, prev_sense, curr_word, W_g, W_s):
    return vect - get_vector(curr_word, prev_sense, W_g, W_s) + get_vector(
        curr_word, new_sense, W_g, W_s)
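# A minimal sketch of how get_vector/change_context_vec might be exercised, assuming
# W_g has shape (vocab, dim) with one global vector per word and W_s has shape
# (vocab, n_senses, dim) with per-sense vectors; a sense index of -1 selects W_g.
_W_g = theano.shared(np.random.randn(5, 3).astype('float32'))
_W_s = theano.shared(np.random.randn(5, 2, 3).astype('float32'))
_word, _sense = T.iscalar('word'), T.iscalar('sense')
_pick = theano.function([_word, _sense], get_vector(_word, _sense, _W_g, _W_s))
# _pick(2, -1) returns _W_g[2]; _pick(2, 1) returns _W_s[2, 1]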

Example #29
    def __theano_build__(self):
        E, V, U, W, b, c = self.E, self.V, self.U, self.W, self.b, self.c

        x = T.ivector('x')
        y = T.ivector('y')

        def forward_prop_step(x_t, s_t1_prev, s_t2_prev, s_t3_prev):
            # Word embedding layer
            x_e = E[:, x_t]

            # GRU Layer 1
            z_t1 = T.nnet.hard_sigmoid(U[0].dot(x_e) + W[0].dot(s_t1_prev) +
                                       b[0])
            r_t1 = T.nnet.hard_sigmoid(U[1].dot(x_e) + W[1].dot(s_t1_prev) +
                                       b[1])
            c_t1 = T.tanh(U[2].dot(x_e) + W[2].dot(s_t1_prev * r_t1) + b[2])
            s_t1 = (T.ones_like(z_t1) - z_t1) * c_t1 + z_t1 * s_t1_prev

            # GRU Layer 2
            z_t2 = T.nnet.hard_sigmoid(U[3].dot(s_t1) + W[3].dot(s_t2_prev) +
                                       b[3])
            r_t2 = T.nnet.hard_sigmoid(U[4].dot(s_t1) + W[4].dot(s_t2_prev) +
                                       b[4])
            c_t2 = T.tanh(U[5].dot(s_t1) + W[5].dot(s_t2_prev * r_t2) + b[5])
            s_t2 = (T.ones_like(z_t2) - z_t2) * c_t2 + z_t2 * s_t2_prev

            # GRU Layer 3
            z_t3 = T.nnet.hard_sigmoid(U[6].dot(s_t2) + W[6].dot(s_t3_prev) +
                                       b[6])
            r_t3 = T.nnet.hard_sigmoid(U[7].dot(s_t2) + W[7].dot(s_t3_prev) +
                                       b[7])
            c_t3 = T.tanh(U[8].dot(s_t2) + W[8].dot(s_t3_prev * r_t3) + b[8])
            s_t3 = (T.ones_like(z_t3) - z_t3) * c_t3 + z_t3 * s_t3_prev

            # Final output calculation
            # Theano's softmax returns a matrix with one row, we only need the row
            o_t = T.nnet.softmax(V.dot(s_t3) + c)[0]

            return [o_t, s_t1, s_t2, s_t3]

        [o, s, s2,
         s3], updates = theano.scan(forward_prop_step,
                                    sequences=x,
                                    truncate_gradient=self.bptt_truncate,
                                    outputs_info=[
                                        None,
                                        dict(initial=T.zeros(self.hidden_dim)),
                                        dict(initial=T.zeros(self.hidden_dim)),
                                        dict(initial=T.zeros(self.hidden_dim))
                                    ])

        prediction = T.argmax(o, axis=1)
        o_error = T.sum(T.nnet.categorical_crossentropy(o, y))
        p_o = printing.Print('o_error')
        # Total cost (could add regularization here)
        cost = p_o(o_error)

        # Gradients
        dE = T.grad(cost, E)
        dU = T.grad(cost, U)
        dW = T.grad(cost, W)
        db = T.grad(cost, b)
        dV = T.grad(cost, V)
        dc = T.grad(cost, c)

        # Assign functions
        self.predict = theano.function([x], [o], allow_input_downcast=True)
        self.predict_class = theano.function([x],
                                             prediction,
                                             allow_input_downcast=True)
        self.ce_error = theano.function([x, y],
                                        cost,
                                        allow_input_downcast=True)
        self.bptt = theano.function([x, y], [dE, dU, dW, db, dV, dc],
                                    allow_input_downcast=True)

        # SGD parameters
        learning_rate = T.scalar('learning_rate')
        decay = T.scalar('decay')

        # rmsprop cache updates
        mE = decay * self.mE + (1 - decay) * dE**2
        mU = decay * self.mU + (1 - decay) * dU**2
        mW = decay * self.mW + (1 - decay) * dW**2
        mV = decay * self.mV + (1 - decay) * dV**2
        mb = decay * self.mb + (1 - decay) * db**2
        mc = decay * self.mc + (1 - decay) * dc**2

        self.sgd_step = theano.function(
            [x, y, learning_rate,
             theano.In(decay, value=0.9)], [],
            updates=[(E, E - learning_rate * dE / T.sqrt(mE + 1e-6)),
                     (U, U - learning_rate * dU / T.sqrt(mU + 1e-6)),
                     (W, W - learning_rate * dW / T.sqrt(mW + 1e-6)),
                     (V, V - learning_rate * dV / T.sqrt(mV + 1e-6)),
                     (b, b - learning_rate * db / T.sqrt(mb + 1e-6)),
                     (c, c - learning_rate * dc / T.sqrt(mc + 1e-6)),
                     (self.mE, mE), (self.mU, mU), (self.mW, mW),
                     (self.mV, mV), (self.mb, mb), (self.mc, mc)],
            allow_input_downcast=True)
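# A reference NumPy sketch of a single GRU step, mirroring the gate equations of
# GRU Layer 1 above (a plain logistic stands in for hard_sigmoid; the shapes of U,
# W and b are illustrative assumptions: (3, hidden, in), (3, hidden, hidden), (3, hidden)).
import numpy as np

def sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))

def gru_step(x_e, s_prev, U, W, b):
    z = sigmoid(U[0].dot(x_e) + W[0].dot(s_prev) + b[0])      # update gate
    r = sigmoid(U[1].dot(x_e) + W[1].dot(s_prev) + b[1])      # reset gate
    c = np.tanh(U[2].dot(x_e) + W[2].dot(s_prev * r) + b[2])  # candidate state
    return (1.0 - z) * c + z * s_prev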
Example #30
def _sinkhorn_log(Mu, Nu, C, options):
    """
	Theano symbolic version.
	Note that the gradient wrt Mu, Nu and C is computed as if
	the transport plan "gamma(Mu,Nu,C)" was piecewise constant.
	"""
    # First, load the parameters.
    epsilon = options.epsilon  # regularization parameter
    niter = options.niter  # max niter in the sinkhorn loop
    tau = options.tau  # used for Nesterov-like acceleration
    rho = options.rho  # parameter for unbalanced transport
    use_dual_cost = options.dual_cost  # If False, use the primal cost
    discard_entropy = options.discard_entropy  # If True + primal cost, remove the -eps*H(gamma)
    discard_KL = options.discard_KL  # If True + primal cost, remove the rho*KL(...)
    grad_override_hack = options.grad_hack
    display_error = options.display_error

    # Update exponent :
    if rho == inf:  # balanced transport : no mass creation is allowed.
        lam = 1.
    else:  # lam = 1 / (1 + epsilon/rho)
        lam = rho / (rho + epsilon)

    # First, define the transport plan theano "Op" ---------------------------------
    # it takes as input three Theano variables :
    if grad_override_hack:
        mu = T.vector('mu')
        nu = T.vector('nu')
        c = T.matrix('c')
    else:
        mu = Mu
        nu = Nu
        c = C

    # Elementary operations ..................................................
    def ave(u, u1, it):
        """
		Barycenter subroutine, used by kinetic acceleration through extrapolation.
		tau = 0 -> returns u1.
		tau < 0 -> returns an extrapolation coming from u.
		
		Note that doing it on the "exponentiated" variables would not make any sense.
		"""
        t = tau  #t = (1. - 1./((it+2.)**2)) * tau
        return t * u + (1 - t) * u1

    def M(u, v):
        """
		M_ij = (-c_ij + u_i + v_j) / epsilon
		"""
        u_col = u.dimshuffle(
            0, 'x'
        )  # theano syntax to make a vector broadcastable in the 2nd dimension
        v_row = v.dimshuffle(
            'x', 0
        )  # theano syntax to make a vector broadcastable in the 1st dimension
        return (-c + u_col + v_row) / epsilon

    lse = lambda A: T.log(T.sum(T.exp(A), axis=1) + 1e-6)  # slight modif to prevent NaN

    # Actual Sinkhorn loop ..................................................
    # Iteration step :
    def sinkhorn_step(nit, u, v, foo):  # foo receives the previous error and is unused
        u1 = u  # useful to check the update
        u = ave(u, lam * (epsilon * (T.log(mu) - lse(M(u, v))) + u), nit[0])
        v = ave(v, lam * (epsilon * (T.log(nu) - lse(M(u, v).T)) + v), nit[0])
        if rho == inf:
            err = T.sum(abs(T.sum(T.exp(M(u, v)), 1) - mu))
        else:
            err = T.sum(abs(u - u1))

        return (u, v, err), theano.scan_module.until(
            err < 1e-4)  # "break" the scan loop if error < tol

    # Scan = "For loop" :
    iternumbers = np.arange(niter, dtype=config.floatX)
    iternumbers = stack((iternumbers, iternumbers), 1)
    """
	result, updates = theano.scan_checkpoints(fn            = sinkhorn_step,              # Iterated routine
											  sequences     = [iternumbers],
											  outputs_info  = [(0. * mu), (0. * nu)],     # Starting estimates for [u,v]
											  save_every_N  = niter, padding=False )      # Efficient memory management, at an additional computational cost
	"""
    err0 = np.arange(1, dtype=config.floatX)[0]
    result, updates = theano.scan(
        fn=sinkhorn_step,  # Iterated routine
        sequences=[iternumbers],
        outputs_info=[(0. * mu), (0. * nu),
                      err0]  # Starting estimates for [u, v, err]
        #n_steps       = niter                       # Number of iterations
    )

    u, v = result[0][-1], result[1][
        -1]  # We only keep the final dual variables
    gamma = T.exp(M(u, v))  # Eventual transport plan g = diag(a)*K*diag(b)

    # Gradient override .....................................................

    if grad_override_hack:  # We give U,V,Gamma, albeit with a "hacked" explicit (i.e. autodiff-free) derivative
        # HERE, WE USE A DEV VERSION which allows :
        # - grad overrides
        # - inlining (for GPU integration)
        # See pull request 5255 on Theano's Github.
        if use_dual_cost:
            hack_derivative = lambda x, g: [0 * x[0], 0 * x[1], 0 * x[2]]
            _transport_plan = OpFromGraph([mu, nu, c], [u, v, gamma],
                                          inline=True,
                                          grad_overrides=hack_derivative)
            U, V, Gamma = _transport_plan(Mu, Nu, C)
        else:
            null_derivative = lambda x, g: [0 * x[0], 0 * x[1], 0 * x[2]]
            _transport_plan = OpFromGraph([mu, nu, c], [gamma],
                                          inline=True,
                                          grad_overrides=null_derivative)
            Gamma = _transport_plan(Mu, Nu, C)
    else:
        U, V, Gamma = u, v, gamma

    # Final cost computation .................................................
    if use_dual_cost:
        """
		print_U  = printing.Print('U  : ', attrs = [ 'shape' ]) ; U  = print_U(U)
		print_Mu = printing.Print('Mu : ', attrs = [ 'shape' ]) ; Mu = print_Mu(Mu)
		print_V  = printing.Print('V  : ', attrs = [ 'shape' ]) ; V  = print_V(V)
		print_Nu = printing.Print('Nu : ', attrs = [ 'shape' ]) ; Nu = print_Nu(Nu)
		print_G  = printing.Print('G  : ', attrs = [ 'shape' ]) ; Gamma = print_G(Gamma)
		"""
        if grad_override_hack:  # allow the first term to have a derivative wrt x

            plan = T.matrix('plan')
            cost_matrix = T.matrix('cost_matrix')
            virtual_cost = T.sum(plan * cost_matrix)
            #hack_derivative = lambda x,g : [ 0 * x[0], T.grad(virtual_cost,
            _firstterm = OpFromGraph([plan, cost_matrix],
                                     [-epsilon * T.sum(plan)],
                                     inline=True,
                                     grad_overrides=hack_derivative)
            cost = _firstterm(Gamma, C)
        else:
            cost = -epsilon * T.sum(Gamma)

        if rho == inf:
            cost += T.sum(Mu * U) + T.sum(Nu * V)
        else:
            cost += - rho * (T.sum( Mu * (T.exp( -U / rho ) - 1) ) \
                  + T.sum( Nu * (T.exp( -V / rho ) - 1) ) )
    else:
        xlogx = lambda x: x * T.log(x + 1e-6)
        xlogy0 = lambda x, y: x * T.log(y + 1e-6)
        H = lambda g: -T.sum(xlogx(g) - g)
        # Primal :
        if discard_entropy:
            cost = T.sum(Gamma * C)
        else:
            cost = T.sum(Gamma * C) - epsilon * H(Gamma)

        KL = lambda h, p: T.sum(xlogy0(h, h / p) - h + p)
        if rho != inf and not discard_KL:
            # We add the KL divergences
            KL_1 = KL(T.sum(Gamma, 1), Mu)
            KL_2 = KL(T.sum(Gamma, 0), Nu)
            cost += rho * (KL_1 + KL_2)

    if display_error:
        print_err_shape = printing.Print('error  : ', attrs=['shape'])
        errors = print_err_shape(result[2])
        print_err = printing.Print('error  : ')
        err_fin = print_err(errors[-1])
        cost += .00000001 * err_fin  # shameful hack to prevent the pruning of the error-printing node...

    return [cost, Gamma]
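# A minimal NumPy sketch of the balanced (rho = inf), non-accelerated (tau = 0)
# log-domain Sinkhorn loop implemented symbolically above; C is the cost matrix,
# mu and nu the two histograms, and all names/defaults here are illustrative.
import numpy as np

def sinkhorn_log_np(mu, nu, C, epsilon=1e-2, niter=100):
    u, v = np.zeros_like(mu), np.zeros_like(nu)
    M = lambda u, v: (-C + u[:, None] + v[None, :]) / epsilon
    lse = lambda A: np.log(np.exp(A).sum(axis=1) + 1e-6)
    for _ in range(niter):
        u = epsilon * (np.log(mu) - lse(M(u, v))) + u
        v = epsilon * (np.log(nu) - lse(M(u, v).T)) + v
    gamma = np.exp(M(u, v))            # transport plan
    return np.sum(gamma * C), gamma    # primal cost without the entropy term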