Example #1
# Standard-library imports this fragment needs; TextData, CharRNN, load_crnn,
# Adagrad, setup_plot, callback and epoch_callback come from the surrounding project.
import time
from functools import partial
from os.path import basename

import matplotlib.pyplot as plt


def main(args):
    data = TextData(args.source)
    if args.checkpoint:
        rnn = load_crnn(args.checkpoint)
    else:
        rnn = CharRNN(in_out_size=data.num_classes, state_size=args.state)
    opt = Adagrad(rnn, 0.1, stateful=True, clip=5)

    setup_plot()

    sequence_pairs = list(data.get_seqs(25))
    print('Training on {}:\n'
          '- {} total chars\n'
          '- {} unique chars\n'
          '- {} sequences of length 25'.format(args.source, data.tot_chars,
                                               data.num_classes,
                                               len(sequence_pairs)))
    opt.train(sequence_pairs,
              epochs=40,
              callback=partial(callback, data=data, start=time.time()),
              callback_every=4321,
              epoch_callback=epoch_callback)

    plt.savefig('plots/{}.png'.format(basename(args.source)))
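
Every snippet on this page hands its weight updates to some Adagrad variant. As a reminder of what that optimizer actually does, here is a minimal NumPy sketch of the core per-parameter rule (the learning rate and epsilon values are illustrative, not taken from any example above):

import numpy as np

def adagrad_step(param, grad, cache, lr=0.1, eps=1e-8):
    '''One Adagrad update: divide the step by the root of the accumulated
    squared gradients, so frequently-updated coordinates take ever smaller steps.'''
    cache += grad ** 2
    param -= lr * grad / (np.sqrt(cache) + eps)
    return param, cache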
Example #2
class DependencyRNN:
    '''
    class for dependency RNN for QANTA
    '''
    def __init__(self, d, V, r, answer_idxs, embeddings=None, seed=0):
        '''
        d = dimensionality of embeddings
        V = size of vocabulary
        r = number of dependency relations
        answer_idxs = list of indices into the embeddings matrix for all the answers
        embeddings = pre-trained word embeddings
        seed = for random number generator for reproducibility
        '''
        
        self.d = d

        rnge = sqrt(6) / sqrt(201)
        rnge_we = sqrt(6) / sqrt(51)

        np.random.seed(seed)
        
        #|V| x d embedding matrix
        #(cast inside value= so We stays a shared variable; calling .astype on
        # the result of theano.shared would return a plain symbolic cast instead)
        if embeddings is None:
            self.We = theano.shared(name='embeddings',
                                    value=(np.random.rand(V, d) * 2 * rnge_we - rnge_we
                                           ).astype(theano.config.floatX))
        else:
            self.We = theano.shared(name='embeddings',
                                    value=np.asarray(embeddings, dtype=theano.config.floatX))
            
        #r x d x d tensor (matrix for each dependency relation)
        self.Wr = theano.shared(name='dependencies',
                                value=(np.random.rand(r, d, d) * 2 * rnge - rnge
                                       ).astype(theano.config.floatX))
        
        #d x d map from embedding to hidden vector
        self.Wv = theano.shared(name='Wv',
                                value=(np.random.rand(d, d) * 2 * rnge - rnge
                                       ).astype(theano.config.floatX))
        
        #d long bias vector
        self.b = theano.shared(name='b',
                               value=np.zeros(d, dtype=theano.config.floatX))
        
        self.params = [self.We, self.Wr, self.Wv, self.b]
        
        self.answer_idxs = np.array(answer_idxs, dtype=np.int32)
        #uniform weight on every answer but one; train() zeroes the correct
        #answer's entry before sampling, which makes the row sum to 1
        self.ans_probabilities = np.ones(self.answer_idxs.shape[0])/(self.answer_idxs.shape[0]-1)
        self.ans_lookup = {j:i for i,j in enumerate(self.answer_idxs)}
        self._answers = {}
        
        self.descender = Adagrad(self.params)

        def normalized_tanh(x):
            '''returns tanh(x) / ||tanh(x)||'''
            temp = T.tanh(x)
            return temp / T.sqrt((temp ** 2).sum())

        self.f = normalized_tanh


        #one scan step: accumulate the hinge term for a single wrong answer
        def helperFunction(x_z, prior_result, x_c, h_n):
            return prior_result + T.maximum(0.0, (1.0 - T.dot(x_c, h_n)) + T.dot(x_z, h_n))

        #need to calculate both the input to its parent node and the error at this step
        def recurrence(n, hidden_states, hidden_sums, cost, x, r, p, wrong_ans, corr_ans):
            '''
            function called below by scan over the nodes in the dependency parse

            n - this is the index of the current node
            hidden_states - a list of hidden_states for every node, to be updated
            hidden_sums - sum over the children of dot product of the hidden nodes and the relation matrix
            cost - the total cost so far for this tree
            x - a list of word embeddings (x[n] will access the embedding for the current word)
            r - a list of relation matrices (r[n] will access the current matrix)
            p - a list of parent node indices
            wrong_ans - a list of randomly sampled word embeddings for wrong answers
            corr_ans - the word embedding for the correct answer

            You need to calculate 3 things:
            1) The value of hidden_states[n] : h_n = f(W_v \dot x_n + b + sum_n)
            2) The updated value of hidden_sums[p[n]] : hidden_sums[p[n]] + W_r(n) \dot h_n
            3) The updated cost :
            for a single node, this is \sum_{z \in wrong_ans} max(0, 1 - x_c \dot h_n + x_z \dot h_n)

            you need to return the updates to hidden_states, hidden_sums, and cost
            (in that order)
            '''

            h_n = self.f( T.dot(self.Wv, x[n]) + self.b + hidden_sums[n] )
            newStates = T.set_subtensor( hidden_states[n], h_n )

            #accumulate this node's message into its parent's running sum,
            #as the docstring specifies: hidden_sums[p[n]] + W_r(n) . h_n
            newSum = hidden_sums[p[n]] + T.dot(r[n], h_n)
            newSums = T.set_subtensor(hidden_sums[p[n]], newSum)
            # update cost
            rr, updates = theano.scan(
                    fn=helperFunction,
                    sequences=wrong_ans,
                    outputs_info=T.as_tensor_variable(np.asarray(0, theano.config.floatX)),
                    non_sequences=[corr_ans, h_n]
                    )
            final_result = rr[-1] + cost


            return newStates, newSums, final_result

        idxs = T.ivector('idxs')
        x = self.We[idxs]

        rel_idxs = T.ivector('rel_idxs')
        r = self.Wr[rel_idxs]

        p = T.ivector('parents')

        wrong_idxs = T.ivector('wrong_idxs')
        wrong_ans = self.We[wrong_idxs]

        corr_idx = T.iscalar('corr_idx') # index of answer
        corr_ans = self.We[corr_idx]

        hidden_states = T.zeros((idxs.shape[0], d), dtype=theano.config.floatX)
        #needs to be sent_length + 1 to store final sum
        hidden_sums = T.zeros((idxs.shape[0]+1, d), dtype=theano.config.floatX)

        [h, s, cost], updates = theano.scan(fn=recurrence,
                                            sequences=T.arange(x.shape[0]),
                                            outputs_info=[hidden_states,
                                                          hidden_sums,
                                                          T.as_tensor_variable(np.asarray(0, theano.config.floatX))],
                                            non_sequences=[x, r, p, wrong_ans, corr_ans])
        final_states = h[-1]
        self.states = theano.function(inputs=[idxs, rel_idxs, p, wrong_idxs, corr_idx], outputs=final_states)

        final_cost = cost[-1] #no regularization
        gradients = T.grad(final_cost, self.params)
        self.cost_and_grad = theano.function(inputs=[idxs, rel_idxs, p, wrong_idxs, corr_idx], outputs=[final_cost] + gradients)

    def gradient_descent(self, new_gradients):
        self.descender.gradient_descent(*new_gradients)

    #batch consists of tuples of word indices, relation indices, parent indices, and an answer index
    def train(self, batch, num_wrong_ans=100):
        total_cost_and_grad = None
        total_nodes = 0.

        #accumulate cost and gradients over every tree in the batch

        for datum in batch:
            idxs, rel_idxs, p, corr_idx = datum

            #sample new wrong answers for every point (make sure not to sample the correct answer)
            self.ans_probabilities[self.ans_lookup[corr_idx]] = 0
            wrong_idxs = self.answer_idxs[np.random.choice(self.answer_idxs.shape[0],
                                                           num_wrong_ans,
                                                           False,
                                                           self.ans_probabilities)]
            self.ans_probabilities[self.ans_lookup[corr_idx]] = 1./(self.ans_probabilities.shape[0]-1)

            cost_and_grad = self.cost_and_grad(idxs, rel_idxs, p, wrong_idxs, corr_idx)
            if total_cost_and_grad is None:
                total_cost_and_grad = [0] + [np.zeros(i.shape) for i in cost_and_grad[1:]]
            for i in range(len(cost_and_grad)):
                total_cost_and_grad[i] += cost_and_grad[i]
            total_nodes += len(idxs)

        #update gradients from total_cost_and_grad[1:]
        self.gradient_descent([i/total_nodes for i in total_cost_and_grad[1:]])

        return total_cost_and_grad[0]/total_nodes

    def reset_weights(self):
        self.descender.reset_weights()

    def transform(self, batch, stop_indices=None):
        features = []
        for idxs,rel_idxs,p in batch:
            #wrong answers and the answer index do not affect the hidden
            #states, so pass typed dummies
            h = self.states(idxs, rel_idxs, p, np.zeros(0, dtype=np.int32), 0)
            x = np.zeros(self.d)
            count = 0.0
            for i,s in enumerate(h):
                if stop_indices is None or idxs[i] not in stop_indices:
                    x += s
                    count += 1
            features.append(x / count)
            
        return np.array(features)

    def save(self, filename, answers):
        '''save all the weights and hyperparameters to a file'''
        kwds = {}
        for param in self.params:
            kwds[param.name] = param.get_value()
        kwds['answer_idxs'] = self.answer_idxs
        
        with open(filename, 'wb') as f:
            np.savez(f, **kwds)

        embeddings = self.We.get_value()
        for answer in answers:
            self._answers[answer] = embeddings[answers[answer]].tolist()

        with open(filename + '.json', 'w') as f:
            json.dump(self._answers, f)

    @classmethod
    def load(cls, filename):
        '''load pre-trained weights from a file'''
        with open(filename, 'rb') as f:  #np.load needs a binary-mode handle
            npzfile = np.load(f)

            d = npzfile['embeddings'].shape[1]
            V = npzfile['embeddings'].shape[0]
            r = npzfile['dependencies'].shape[0]

            model = cls(d, V, r, npzfile['answer_idxs'])

            for param in model.params:
                param.set_value(npzfile[param.name])

        with open(filename + '.json') as f:
            model._answers = json.load(f)

        return model

    @property
    def answers(self):
        return self._answers
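
The constructor above compiles everything the instance methods need, so the class can be driven directly. A hedged usage sketch (the dimensions, indices, and the single toy tree are all made up for illustration; they are not from the original code):

import numpy as np

#hypothetical toy setup: 50-dim embeddings, 1000-word vocabulary,
#40 dependency relations, answers stored at embedding rows 1-3
rnn = DependencyRNN(d=50, V=1000, r=40, answer_idxs=[1, 2, 3])

#one dependency tree: word indices, relation indices, parent indices and the
#correct answer's embedding row; nodes are ordered so children precede their
#parent, and the root's parent points at the extra hidden_sums slot
idxs = np.array([4, 5, 6], dtype=np.int32)
rel_idxs = np.array([0, 1, 2], dtype=np.int32)
parents = np.array([2, 2, 3], dtype=np.int32)

avg_cost = rnn.train([(idxs, rel_idxs, parents, 2)], num_wrong_ans=2)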
Example #3
model.add(Dense(1, activation='sigmoid'))

plot_model(model, to_file='model_imdb.png', show_shapes=True)

results_acc = []
result_acc = []
results_loss = []
result_loss = []
test_acc_results = []
test_loss_results = []
optimizers = [
    Adam(lr=0.001, amsgrad=True),
    AAdam(lr=0.001, amsgrad=True),
    Adam(lr=0.001, amsgrad=False),
    AAdam(lr=0.001, amsgrad=False),
    Adagrad(),
    AAdagrad(),
    SGD(),
    ASGD()
]

for opt in optimizers:

    model.compile(loss='binary_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])
    #reload the same starting weights so every optimizer begins from an
    #identical state (the commented line below saved them once up front)
    #model.save_weights('initial_weights_imdb.h5')
    model.load_weights('initial_weights_imdb.h5')
    initial_weights = model.get_weights()
    result_acc = []
    result_loss = []
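
The fragment breaks off just before the per-optimizer training itself. Given the accumulator lists declared above, the loop body presumably continues along these lines (a hedged sketch only: the fit arguments and the x_train/y_train-style names are assumptions, not from the original):

    #hypothetical continuation: train, record the history, evaluate
    history = model.fit(x_train, y_train,
                        epochs=20,
                        batch_size=512,
                        validation_data=(x_val, y_val))
    result_acc = history.history['acc']
    result_loss = history.history['loss']
    results_acc.append(result_acc)
    results_loss.append(result_loss)

    test_loss, test_acc = model.evaluate(x_test, y_test)
    test_loss_results.append(test_loss)
    test_acc_results.append(test_acc)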
Example #4
class DependencyRNN:
    '''
    class for dependency RNN for QANTA
    '''
    def __init__(self, d, V, r, answer_idxs, embeddings=None, seed=0):
        '''
        d = dimensionality of embeddings
        V = size of vocabulary
        r = number of dependency relations
        answer_idxs = list of indices into the embeddings matrix for all the answers
        embeddings = pre-trained word embeddings
        seed = for random number generator for reproducibility
        '''
        
        self.d = d

        rnge = sqrt(6) / sqrt(201)
        rnge_we = sqrt(6) / sqrt(51)

        np.random.seed(seed)
        
        #|V| x d embedding matrix (cast inside value= so We stays a shared variable)
        if embeddings is None:
            self.We = theano.shared(name='embeddings',
                                    value=(np.random.rand(V, d) * 2 * rnge_we - rnge_we
                                           ).astype(theano.config.floatX))
        else:
            self.We = theano.shared(name='embeddings',
                                    value=np.asarray(embeddings, dtype=theano.config.floatX))
            
        #r x d x d tensor (matrix for each dependency relation)
        self.Wr = theano.shared(name='dependencies',
                                value=(np.random.rand(r, d, d) * 2 * rnge - rnge
                                       ).astype(theano.config.floatX))
        
        #d x d map from embedding to hidden vector
        self.Wv = theano.shared(name='Wv',
                                value=(np.random.rand(d, d) * 2 * rnge - rnge
                                       ).astype(theano.config.floatX))
        
        #d long bias vector
        self.b = theano.shared(name='b',
                               value=np.zeros(d, dtype=theano.config.floatX))
        
        self.params = [self.We, self.Wr, self.Wv, self.b]
        
        self.answer_idxs = np.array(answer_idxs, dtype=np.int32)
        self.ans_probabilities = np.ones(self.answer_idxs.shape[0])/(self.answer_idxs.shape[0]-1)
        self.ans_lookup = {j:i for i,j in enumerate(self.answer_idxs)}
        self._answers = {}
        
        self.descender = Adagrad(self.params)

        def normalized_tanh(x):
            '''returns tanh(x) / ||tanh(x)||'''
            tanh_x = T.tanh(x)
            return tanh_x / T.sqrt(T.sum(tanh_x ** 2))
            
        self.f = normalized_tanh

        #need to calculate both the input to its parent node and the error at this step
        def recurrence(n, hidden_states, hidden_sums, cost, x, r, p, wrong_ans, corr_ans):
            '''
            function called below by scan over the nodes in the dependency parse
            
            n - this is the index of the current node
            hidden_states - a list of hidden_states for every node, to be updated
            hidden_sums - sum over the children of dot product of the hidden nodes and the relation matrix
            cost - the total cost so far for this tree
            x - a list of word embeddings (x[n] will access the embedding for the current word)
            r - a list of relation matrices (r[n] will access the current matrix)
            p - a list of parent node indices
            wrong_ans - a list of randomly sampled word embeddings for wrong answers
            corr_ans - the word embedding for the correct answer
            '''
            h_n = self.f(T.dot(self.Wv, x[n]) + self.b + hidden_sums[n])

            update_new = T.set_subtensor(hidden_sums[p[n]], hidden_sums[p[n]]+ T.dot(r[n], h_n))

            update_hidden_new = T.set_subtensor(hidden_states[n], h_n)

            output, updates = theano.scan(fn=lambda xz, xc, h: T.maximum(0., 1. - T.dot(xc, h) + T.dot(xz, h)),
                                          outputs_info=None, sequences=wrong_ans, non_sequences=[corr_ans, h_n])

            output_sum = T.as_tensor_variable(output.sum())

            new_cost = cost + output_sum

            return update_hidden_new, update_new, new_cost

        idxs = T.ivector('idxs')
        x = self.We[idxs]

        rel_idxs = T.ivector('rel_idxs')
        r = self.Wr[rel_idxs]

        p = T.ivector('parents')

        wrong_idxs = T.ivector('wrong_idxs')
        wrong_ans = self.We[wrong_idxs]

        corr_idx = T.iscalar('corr_idx') # index of answer
        corr_ans = self.We[corr_idx]

        hidden_states = T.zeros((idxs.shape[0], d), dtype=theano.config.floatX)
        #needs to be sent_length + 1 to store final sum
        hidden_sums = T.zeros((idxs.shape[0]+1, d), dtype=theano.config.floatX)

        [h, s, cost], updates = theano.scan(fn=recurrence,
                                            sequences=T.arange(x.shape[0]),
                                            outputs_info=[hidden_states,
                                                          hidden_sums,
                                                          T.as_tensor_variable(np.asarray(0, theano.config.floatX))],
                                            non_sequences=[x, r, p, wrong_ans, corr_ans])
        final_states = h[-1]
        self.states = theano.function(inputs=[idxs, rel_idxs, p, wrong_idxs, corr_idx], outputs=final_states)

        final_cost = cost[-1] #no regularization
        gradients = T.grad(final_cost, self.params)
        self.cost_and_grad = theano.function(inputs=[idxs, rel_idxs, p, wrong_idxs, corr_idx], outputs=[final_cost] + gradients)

    def gradient_descent(self, new_gradients):
        self.descender.gradient_descent(*new_gradients)

    #batch consists of tuples of word indices, relation indices, parent indices, and an answer index
    def train(self, batch, num_wrong_ans=100):
        total_cost_and_grad = None
        total_nodes = 0.

        #accumulate cost and gradients over every tree in the batch

        for datum in batch:
            idxs, rel_idxs, p, corr_idx = datum

            #sample new wrong answers for every point (make sure not to sample the correct answer)
            self.ans_probabilities[self.ans_lookup[corr_idx]] = 0
            wrong_idxs = self.answer_idxs[np.random.choice(self.answer_idxs.shape[0],
                                                           num_wrong_ans,
                                                           False,
                                                           self.ans_probabilities)]
            self.ans_probabilities[self.ans_lookup[corr_idx]] = 1./(self.ans_probabilities.shape[0]-1)

            cost_and_grad = self.cost_and_grad(idxs, rel_idxs, p, wrong_idxs, corr_idx)
            if total_cost_and_grad is None:
                total_cost_and_grad = [0] + [np.zeros(i.shape) for i in cost_and_grad[1:]]
            for i in range(len(cost_and_grad)):
                total_cost_and_grad[i] += cost_and_grad[i]
            total_nodes += len(idxs)

        #update gradients from total_cost_and_grad[1:]
        self.gradient_descent([i/total_nodes for i in total_cost_and_grad[1:]])

        return total_cost_and_grad[0]/total_nodes

    def reset_weights(self):
        self.descender.reset_weights()

    def transform(self, batch, stop_indices=None):
        features = []
        for idxs,rel_idxs,p in batch:
            #wrong answers and the answer index do not affect the hidden states
            h = self.states(idxs, rel_idxs, p, np.zeros(0, dtype=np.int32), 0)
            x = np.zeros(self.d)
            count = 0.0
            for i,s in enumerate(h):
                if stop_indices is None or idxs[i] not in stop_indices:
                    x += s
                    count += 1
            features.append(x / count)
            
        return np.array(features)

    def save(self, filename, answers):
        '''save all the weights and hyperparameters to a file'''
        kwds = {}
        for param in self.params:
            kwds[param.name] = param.get_value()
        kwds['answer_idxs'] = self.answer_idxs
        
        with open(filename, 'wb') as f:
            np.savez(f, **kwds)

        embeddings = self.We.get_value()
        for answer in answers:
            self._answers[answer] = embeddings[answers[answer]].tolist()

        with open(filename + '.json', 'w') as f:
            json.dump(self._answers, f)

    @classmethod
    def load(cls, filename):
        '''load pre-trained weights from a file'''
        with open(filename, 'rb') as f:  #np.load needs a binary-mode handle
            npzfile = np.load(f)

            d = npzfile['embeddings'].shape[1]
            V = npzfile['embeddings'].shape[0]
            r = npzfile['dependencies'].shape[0]

            model = cls(d, V, r, npzfile['answer_idxs'])

            for param in model.params:
                param.set_value(npzfile[param.name])

        with open(filename + '.json') as f:
            model._answers = json.load(f)

        return model

    @property
    def answers(self):
        return self._answers
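
Because this variant also carries the persistence helpers, a save/load round trip is straightforward. A minimal sketch (the file name, dimensions, and answers mapping are illustrative only):

#hypothetical round trip
rnn = DependencyRNN(d=50, V=1000, r=40, answer_idxs=[1, 2, 3])
rnn.save('qanta_weights', answers={'some_answer': 1})
restored = DependencyRNN.load('qanta_weights')
assert 'some_answer' in restored.answers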
Example #5
network.add(layers.Dense(512, activation='relu', input_shape=(28 * 28, )))
network.add(layers.Dense(10, activation='softmax'))

results_acc = []
result_acc = []
results_loss = []
result_loss = []
test_acc_results = []
test_loss_results = []
nonzero_weights = []

optimizers = [
    GRDA(lr=.005, c=.02),
    SGD(lr=.005, nesterov=False),
    SGD(lr=.005, nesterov=True),
    Adagrad(lr=.005),
    Adam(lr=.005, amsgrad=False),
    Adam(lr=.005, amsgrad=True)
]
allcounts = np.sum([x.size for x in network.get_weights()])

for opt in optimizers:
    network.compile(optimizer=opt,
                    loss='categorical_crossentropy',
                    metrics=['accuracy'])
    #note: with these three lines commented out, each optimizer resumes from
    #the weights the previous run left behind rather than from a fresh start
    #network.save_weights('initial_weights.h5')
    #network.load_weights('initial_weights.h5')
    #initial_weights = network.get_weights()
    result_acc = []
    result_loss = []
    test_loss = []
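
The allcounts total and the nonzero_weights list suggest this experiment tracks how sparse each optimizer leaves the network (GRDA in particular is designed to drive weights to exactly zero). A hedged sketch of how that fraction might be recorded after training with each optimizer:

    #hypothetical: record the fraction of weights this optimizer left non-zero
    nonzero = np.sum([np.count_nonzero(w) for w in network.get_weights()])
    nonzero_weights.append(nonzero / allcounts)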
Example #6
model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                       args.nlayers, args.dropout, args.tied,
                       not args.dense).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = None
if args.opt == 'Adam':
    optimizer = Adam_Base(model.parameters(), betas=(0.9, 0.999))
elif args.opt == 'SGD':
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
elif args.opt == 'Adagrad':
    optimizer = torch.optim.Adagrad(model.parameters(), args.lr)
elif args.opt == 'SGD_CS':
    optimizer = SGD(model.parameters(), args.lr, momentum=0.9, nesterov=True)
elif args.opt == 'Adagrad_CS':
    optimizer = Adagrad(model.parameters(), args.lr)
elif args.opt == 'Adam_CS':
    optimizer = Adam(model.parameters(), betas=(0.9, 0.999))
elif args.opt == 'RMSprop_CS':
    optimizer = RMSprop(model.parameters())
elif args.opt == 'Adam_MNK':
    optimizer = Adam_MNK(model.parameters(), betas=(0.9, 0.999))
elif args.opt == 'Adam_MNK_Embed':
    optimizer = Adam_MNK_Embed(model.parameters(), betas=(0.9, 0.999))
elif args.opt == 'Adagrad_SM3':
    optimizer = Adagrad_SM3(model.parameters(), args.lr)
elif args.opt == 'Adagrad_SM3_II_Embed':
    optimizer = Adagrad_SM3_II_Embed(model.parameters(), args.lr)
else:
    assert args.opt == 'Adagrad_SM3_II'
    optimizer = Adagrad_SM3_II(model.parameters(), args.lr)
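
The if/elif ladder above grows by two lines for every new optimizer variant. A hedged alternative sketch that maps names to constructors instead (it assumes exactly the classes the original script already imports):

#hypothetical refactor of the ladder above
opt_table = {
    'Adam': lambda p: Adam_Base(p, betas=(0.9, 0.999)),
    'SGD': lambda p: torch.optim.SGD(p, lr=0.01),
    'Adagrad': lambda p: torch.optim.Adagrad(p, args.lr),
    'SGD_CS': lambda p: SGD(p, args.lr, momentum=0.9, nesterov=True),
    'Adagrad_CS': lambda p: Adagrad(p, args.lr),
    'Adam_CS': lambda p: Adam(p, betas=(0.9, 0.999)),
    'RMSprop_CS': lambda p: RMSprop(p),
    'Adam_MNK': lambda p: Adam_MNK(p, betas=(0.9, 0.999)),
    'Adam_MNK_Embed': lambda p: Adam_MNK_Embed(p, betas=(0.9, 0.999)),
    'Adagrad_SM3': lambda p: Adagrad_SM3(p, args.lr),
    'Adagrad_SM3_II_Embed': lambda p: Adagrad_SM3_II_Embed(p, args.lr),
    'Adagrad_SM3_II': lambda p: Adagrad_SM3_II(p, args.lr),
}
optimizer = opt_table[args.opt](model.parameters())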
Example #7
    def __init__(self, d, V, r, answer_idxs, embeddings=None, seed=0):
        '''
        d = dimensionality of embeddings
        V = size of vocabulary
        r = number of dependency relations
        answer_idxs = list of indices into the embeddings matrix for all the answers
        embeddings = pre-trained word embeddings
        seed = for random number generator for reproducibility
        '''

        self.d = d

        rnge = sqrt(6) / sqrt(201)
        rnge_we = sqrt(6) / sqrt(51)

        np.random.seed(seed)

        #|V| x d embedding matrix (cast inside value= so We stays a shared variable)
        if embeddings is None:
            self.We = theano.shared(name='embeddings',
                                    value=(np.random.rand(V, d) * 2 * rnge_we -
                                           rnge_we).astype(theano.config.floatX))
        else:
            self.We = theano.shared(name='embeddings',
                                    value=np.asarray(
                                        embeddings, dtype=theano.config.floatX))

        #r x d x d tensor (matrix for each dependency relation)
        self.Wr = theano.shared(name='dependencies',
                                value=(np.random.rand(r, d, d) * 2 * rnge -
                                       rnge).astype(theano.config.floatX))

        #d x d map from embedding to hidden vector
        self.Wv = theano.shared(name='Wv',
                                value=(np.random.rand(d, d) * 2 * rnge -
                                       rnge).astype(theano.config.floatX))

        #d long bias vector
        self.b = theano.shared(name='b',
                               value=np.zeros(d, dtype=theano.config.floatX))

        #note that We is excluded here, so the embeddings receive no gradient updates
        self.params = [self.Wr, self.Wv, self.b]

        self.answer_idxs = np.array(answer_idxs, dtype=np.int32)
        self.ans_probabilities = np.ones(
            self.answer_idxs.shape[0]) / (self.answer_idxs.shape[0] - 1)
        self.ans_lookup = {j: i for i, j in enumerate(self.answer_idxs)}
        self._answers = {}

        self.descender = Adagrad(self.params)

        def normalized_tanh(x):
            '''returns tanh(x) / ||tanh(x)||'''
            tanh = T.tanh(x)
            return tanh / T.sqrt((tanh ** 2).sum())
        self.f = normalized_tanh

        #need to calculate both the input to its parent node and the error at this step
        def recurrence(n, hidden_states, hidden_sums, cost, x, r, p, wrong_ans,
                       corr_ans):
            '''
            function called below by scan over the nodes in the dependency parse
            
            n - this is the index of the current node
            hidden_states - a list of hidden_states for every node, to be updated
            hidden_sums - sum over the children of dot product of the hidden nodes and the relation matrix
            cost - the total cost so far for this tree
            x - a list of word embeddings (x[n] will access the embedding for the current word)
            r - a list of relation matrices (r[n] will access the current matrix)
            p - a list of parent node indices
            wrong_ans - a list of randomly sampled word embeddings for wrong answers
            corr_ans - the word embedding for the correct answer

            You need to calculate 3 things:
            1) The value of hidden_states[n] : h_n = f(W_v \dot x_n + b + sum_n)
            2) The updated value of hidden_sums[p[n]] : hidden_sums[p[n]] + W_r(n) \dot h_n
            3) The updated cost :
            for a single node, this is \sum_{z \in wrong_ans} max(0, 1 - x_c \dot h_n + x_z \dot h_n)
            
            you need to return the updates to hidden_states, hidden_sums, and cost
            (in that order)
            '''
            #hidden_states[n]:
            h_n = self.f(T.dot(self.Wv, x[n]) + self.b + hidden_sums[n])

            #updated value of hidden_sums[p[n]]:
            h_p_n = T.set_subtensor(hidden_sums[p[n]],
                                    hidden_sums[p[n]] + T.dot(r[n], h_n))

            #updated cost: hinge terms summed over the sampled wrong answers
            def costupdate(z, prev, base, h_n):
                return prev + T.maximum(0, base + T.dot(z, h_n))

            base = 1 - T.dot(corr_ans, h_n)
            result, s_updates = theano.scan(fn=costupdate,
                                            sequences=wrong_ans,
                                            outputs_info=T.as_tensor_variable(
                                                np.asarray(
                                                    0, theano.config.floatX)),
                                            non_sequences=[base, h_n])
            #avoid shadowing the non-sequence x from the enclosing scope
            new_cost = result[-1] + cost

            return (T.set_subtensor(hidden_states[n], h_n), h_p_n, new_cost)

        idxs = T.ivector('idxs')
        x = self.We[idxs]

        rel_idxs = T.ivector('rel_idxs')
        r = self.Wr[rel_idxs]

        p = T.ivector('parents')

        wrong_idxs = T.ivector('wrong_idxs')
        wrong_ans = self.We[wrong_idxs]

        corr_idx = T.iscalar('corr_idx')  # index of answer
        corr_ans = self.We[corr_idx]

        hidden_states = T.zeros((idxs.shape[0], d), dtype=theano.config.floatX)

        #needs to be sent_length + 1 to store final sum
        hidden_sums = T.zeros((idxs.shape[0] + 1, d),
                              dtype=theano.config.floatX)

        [h, s, cost], updates = theano.scan(
            fn=recurrence,
            sequences=T.arange(x.shape[0]),
            outputs_info=[
                hidden_states, hidden_sums,
                T.as_tensor_variable(np.asarray(0, theano.config.floatX))
            ],
            non_sequences=[x, r, p, wrong_ans, corr_ans])
        final_states = h[-1]
        self.states = theano.function(
            inputs=[idxs, rel_idxs, p, wrong_idxs, corr_idx],
            outputs=final_states)

        final_cost = cost[-1]  #no regularization
        gradients = T.grad(final_cost, self.params)
        self.cost_and_grad = theano.function(
            inputs=[idxs, rel_idxs, p, wrong_idxs, corr_idx],
            outputs=[final_cost] + gradients)
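
For reference, the per-node hinge cost that every recurrence above implements with a scan can be written out eagerly in NumPy, which is handy when sanity-checking the symbolic versions (the toy arrays below are made up):

import numpy as np

def node_cost(h_n, corr_ans, wrong_ans):
    '''sum over z in wrong_ans of max(0, 1 - x_c . h_n + x_z . h_n)'''
    base = 1.0 - corr_ans.dot(h_n)
    return sum(max(0.0, base + x_z.dot(h_n)) for x_z in wrong_ans)

rng = np.random.RandomState(0)
h_n, corr_ans = rng.rand(5), rng.rand(5)
wrong_ans = rng.rand(3, 5)  #three sampled wrong answers
print(node_cost(h_n, corr_ans, wrong_ans))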