def adadelta(self, lr, tparams, grads, model_input, cost, givens=None):
    """
    An adaptive learning rate optimizer.

    Parameters
    ----------
    lr : Theano SharedVariable
        Initial learning rate
    tparams : Theano SharedVariable
        Model parameters
    grads : Theano variable
        Gradients of cost w.r.t. the parameters
    model_input : list of Theano variables
        Model inputs
    cost : Theano variable
        Objective function to minimize

    Notes
    -----
    For more information, see [ADADELTA]_.

    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
       Rate Method*, arXiv:1212.5701.
    """
    # Shared variables holding the current gradient and the running averages
    # of the squared gradients and squared updates.
    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                 name='%s_rup2' % k)
                   for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    # Store the fresh gradients and update the running average of g^2.
    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(model_input, cost, updates=zgup + rg2up,
                                    name='adadelta_f_grad_shared',
                                    givens=givens)

    # ADADELTA step: scale the gradient by RMS[delta x] / RMS[g].
    updir = [-T.sqrt(ru2 + 1e-6) / T.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]

    f_update = theano.function([lr], [], updates=ru2up + param_up,
                               on_unused_input='ignore',
                               name='adadelta_f_update',
                               givens=givens)

    return f_grad_shared, f_update
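# A minimal usage sketch (an illustration, not part of the original module): the
# two compiled functions returned above are meant to be driven from a training
# loop. ``opt`` stands for an instance of the class defining adadelta, and ``x``,
# ``mask``, ``y``, ``minibatches`` and ``lrate`` are hypothetical stand-ins for
# whatever ``model_input`` and the learning-rate value actually are.
#
#     f_grad_shared, f_update = opt.adadelta(lr, tparams, grads, [x, mask, y], cost)
#     for x_batch, mask_batch, y_batch in minibatches:
#         batch_cost = f_grad_shared(x_batch, mask_batch, y_batch)  # forward/backward pass, accumulators updated
#         f_update(lrate)                                           # apply the ADADELTA step to the parameters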
def adadelta(self, lr, tparams, grads, model_input, cost, givens=None):
    """
    An adaptive learning rate optimizer.

    Parameters
    ----------
    lr : Theano SharedVariable
        Initial learning rate
    tparams : Theano SharedVariable
        Model parameters
    grads : Theano variable
        Gradients of cost w.r.t. the parameters
    model_input : list of Theano variables
        Model inputs
    cost : Theano variable
        Objective function to minimize

    Notes
    -----
    For more information, see [ADADELTA]_.

    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
       Rate Method*, arXiv:1212.5701.
    """
    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                 name='%s_rup2' % k)
                   for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(model_input, cost, updates=zgup + rg2up,
                                    name='adadelta_f_grad_shared',
                                    givens=givens)

    updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]

    f_update = theano.function([lr], [], updates=ru2up + param_up,
                               on_unused_input='ignore',
                               name='adadelta_f_update',
                               givens=givens)

    return f_grad_shared, f_update
def getOutput(self, inputs):
    """
    Get outputs of encoder layer.
    Return all of the hidden status.
    """
    (self.sentence, self.mask) = inputs
    assert self.mask is not None

    n_steps = self.sentence.shape[0]
    if self.sentence.ndim == 3:
        n_samples = self.sentence.shape[1]
    else:
        n_samples = 1

    # Initial hidden state is all zeros.
    last_h = tensor.alloc(numpy_floatX(0.), n_samples, self.hidden_status_dim)
    # Project the whole input sequence once, outside the recurrence.
    state_below = tensor.dot(self.sentence, self.node.get_params_W())

    results, _ = theano.scan(self.node.node_update,
                             sequences=[self.mask, state_below],
                             outputs_info=[last_h],
                             name=self._p(self.prefix, '_scan'),
                             n_steps=n_steps)
    hidden_status_outputs = results
    return hidden_status_outputs
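# For readers unfamiliar with theano.scan: the scan above is conceptually
# equivalent to the plain-Python sketch below (an illustration only, not part
# of the original code). One recurrent step is taken per time step, and the
# mask lets node_update carry the previous state through padded positions.
#
#     h = last_h
#     hidden_status_outputs = []
#     for t in range(n_steps):
#         h = self.node.node_update(mask[t], state_below[t], h)
#         hidden_status_outputs.append(h)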
def get_output(self, inputs):
    """
    Get outputs of encoder layer.
    Return all of the hidden status.
    """
    (self.sentence, self.mask,
     self.encoder_hidden_status, self.question_mask) = inputs
    assert self.mask is not None

    n_steps = self.sentence.shape[0]
    if self.sentence.ndim == 3:
        n_samples = self.sentence.shape[1]
    else:
        n_samples = 1

    # last_s = tensor.dot(self.encoder_hidden_status[0, :, self.hidden_status_dim:],
    #                     self.params[self._p(self.prefix, 'Ws')])
    last_s = tensor.alloc(numpy_floatX(0.), n_samples, self.hidden_status_dim)
    state_below = self.sentence

    def upd(am_, x_, s_, h_, qm_):
        # Attend over the encoder hidden status, then feed the context vector
        # together with the current input into the recurrent node.
        c, alpha = self.attention_node.node_update(s_, h_, qm_)
        x_ = tensor.dot(tensor.concatenate([x_, c], axis=1),
                        self.node.get_params_W())
        s = self.node.node_update(am_, x_, s_)
        return s, c, alpha

    results, _ = theano.scan(upd,
                             sequences=[self.mask, state_below],
                             outputs_info=[last_s, None, None],
                             non_sequences=[self.encoder_hidden_status,
                                            self.question_mask],
                             name=self._p(self.prefix, '_scan'),
                             n_steps=n_steps)
    hidden_status_outputs, context_outputs, alpha_outputs = results
    return hidden_status_outputs, context_outputs, alpha_outputs
def get_output(self, inputs):
    """
    Get outputs of encoder layer.
    Return all of the hidden status.
    """
    (self.sentence, self.mask) = inputs
    assert self.mask is not None

    n_steps = self.sentence.shape[0]
    if self.sentence.ndim == 3:
        n_samples = self.sentence.shape[1]
    else:
        n_samples = 1

    hidden_states_list = [self.sentence]
    for idx in range(self.n_layers):
        sentence = hidden_states_list[idx]
        state_below = tensor.dot(sentence, self.node_list[idx].get_params_W())
        last_h = tensor.alloc(numpy_floatX(0.), n_samples, self.hidden_status_dim)
        results, _ = theano.scan(self.node_list[idx].node_update,
                                 sequences=[self.mask, state_below],
                                 outputs_info=[last_h],
                                 name=self._p(self.prefix, '_scan' + str(idx)),
                                 n_steps=n_steps)
        hidden_states_list.append(results)

    hidden_status_outputs = hidden_states_list
    return hidden_status_outputs
def get_output(self, inputs):
    """
    Get outputs of encoder layer.
    Return all of the hidden status.
    """
    (self.sentence, self.mask, self.question_children,
     self.question_children_mask, self.max_offset) = inputs
    assert self.mask is not None

    n_steps = self.sentence.shape[0]
    if self.sentence.ndim == 3:
        n_samples = self.sentence.shape[1]
    else:
        n_samples = 1

    queue_buffer = tensor.alloc(numpy_floatX(0.), n_samples,
                                self.max_offset, self.hidden_status_dim)
    state_below = tensor.dot(self.sentence, self.node.get_params_W())
    non_seq = self.node.get_non_seq_parameter(n_samples)
    self.question_children_mask = self.question_children_mask.dimshuffle([0, 1, 2, 'x'])

    results, _ = theano.scan(self.node.node_update,
                             sequences=[self.mask, state_below,
                                        self.question_children,
                                        self.question_children_mask],
                             outputs_info=[queue_buffer, queue_buffer[:, -1, :]],
                             non_sequences=non_seq,
                             name=self._p(self.prefix, '_scan'),
                             n_steps=n_steps)
    hidden_status_outputs = results[1]
    return hidden_status_outputs
def get_output(self, inputs):
    """
    Get outputs of encoder layer.
    Return all of the hidden status.
    """
    if len(inputs) == 4:
        (self.sentence, self.mask, self.forward_hidden_status, self.direction) = inputs
    if len(inputs) == 5:
        (self.sentence, self.mask, self.forward_hidden_status, self.direction, _) = inputs
    assert self.mask is not None

    n_steps = self.sentence.shape[0]
    if self.sentence.ndim == 3:
        n_samples = self.sentence.shape[1]
    else:
        n_samples = 1

    if len(inputs) == 4:
        last_h = tensor.alloc(numpy_floatX(0.), n_samples,
                              self.layer_number, self.hidden_status_dim)
    if len(inputs) == 5:
        last_h = inputs[4]
        last_h = tensor.alloc(last_h, self.layer_number,
                              last_h.shape[0], last_h.shape[1]).dimshuffle([1, 0, 2])

    state_below = tensor.dot(
        tensor.concatenate(
            [self.sentence,
             tensor.alloc(self.forward_hidden_status[-1, :, :],
                          self.sentence.shape[0],
                          self.forward_hidden_status.shape[1],
                          self.hidden_status_dim)],
            axis=2),
        self.node.get_params_W())

    results, _ = theano.scan(self.node.node_update,
                             sequences=[self.mask, state_below, self.direction],
                             outputs_info=[last_h],
                             name=self._p(self.prefix, '_scan'),
                             n_steps=n_steps)
    hidden_status_outputs = results
    # p = printing.Print('hidden_status_outputs')
    # hidden_status_outputs = p(hidden_status_outputs)
    return hidden_status_outputs
def rmsprop(self, lr, tparams, grads, model_input, cost, givens=None):
    """
    A variant of SGD that scales the step size by a running average of the
    recent step norms.

    Parameters
    ----------
    lr : Theano SharedVariable
        Initial learning rate
    tparams : Theano SharedVariable
        Model parameters
    grads : Theano variable
        Gradients of cost w.r.t. the parameters
    model_input : list of Theano variables
        Model inputs
    cost : Theano variable
        Objective function to minimize

    Notes
    -----
    For more information, see [Hint2014]_.

    .. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*,
       lecture 6a,
       http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
    """
    # Shared variables holding the current gradient and the running averages
    # of the gradient and the squared gradient.
    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                   name='%s_rgrad' % k)
                     for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(model_input, cost,
                                    updates=zgup + rgup + rg2up,
                                    name='rmsprop_f_grad_shared',
                                    givens=givens)

    # Momentum-style update direction, scaled by an estimate of the gradient RMS.
    updir = [theano.shared(p.get_value() * numpy_floatX(0.),
                           name='%s_updir' % k)
             for k, p in tparams.iteritems()]
    updir_new = [(ud, 0.9 * ud - 1e-4 * zg / T.sqrt(rg2 - rg ** 2 + 1e-4))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads,
                                            running_grads, running_grads2)]
    param_up = [(p, p + udn[1]) for p, udn in zip(tparams.values(), updir_new)]

    f_update = theano.function([lr], [], updates=updir_new + param_up,
                               on_unused_input='ignore',
                               name='rmsprop_f_update',
                               givens=givens)

    return f_grad_shared, f_update
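# For reference, the per-parameter update rule built by the graph above,
# written as a NumPy-style sketch (an illustration with plain arrays in place
# of shared variables, not part of the original code):
#
#     Eg   = 0.95 * Eg  + 0.05 * g          # running mean of the gradient
#     Eg2  = 0.95 * Eg2 + 0.05 * g ** 2     # running mean of the squared gradient
#     step = 0.9 * step - 1e-4 * g / np.sqrt(Eg2 - Eg ** 2 + 1e-4)
#     p    = p + step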
def rmsprop(self, lr, tparams, grads, model_input, cost, givens=None):
    """
    A variant of SGD that scales the step size by a running average of the
    recent step norms.

    Parameters
    ----------
    lr : Theano SharedVariable
        Initial learning rate
    tparams : Theano SharedVariable
        Model parameters
    grads : Theano variable
        Gradients of cost w.r.t. the parameters
    model_input : list of Theano variables
        Model inputs
    cost : Theano variable
        Objective function to minimize

    Notes
    -----
    For more information, see [Hint2014]_.

    .. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*,
       lecture 6a,
       http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
    """
    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                   name='%s_rgrad' % k)
                     for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(model_input, cost,
                                    updates=zgup + rgup + rg2up,
                                    name='rmsprop_f_grad_shared',
                                    givens=givens)

    updir = [theano.shared(p.get_value() * numpy_floatX(0.),
                           name='%s_updir' % k)
             for k, p in tparams.iteritems()]
    updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads,
                                            running_grads, running_grads2)]
    param_up = [(p, p + udn[1]) for p, udn in zip(tparams.values(), updir_new)]

    f_update = theano.function([lr], [], updates=updir_new + param_up,
                               on_unused_input='ignore',
                               name='rmsprop_f_update',
                               givens=givens)

    return f_grad_shared, f_update