def get_output(self, train=False):
    """Build the symbolic output of this recurrent layer.

    The input is projected through each ``W_<name>`` / ``b_<name>`` pair and
    the resulting sequences, together with the padded mask, are fed to
    ``self._step`` via ``theano.scan``.  The matching ``U_<name>`` matrices
    are handed over as non-sequences.

    Returns the whole output sequence (first two axes swapped back) when
    ``self.return_sequences`` is truthy, otherwise only the last step.
    """
    inp = self.get_input(train)
    mask = self.get_padded_shuffled_mask(train, inp, pad=1)
    # Swap the first two axes so that scan iterates over the former axis 1.
    inp = inp.dimshuffle((1, 0, 2))

    # Order matters: it defines the positional arguments seen by _step.
    names = ('sum', 'max', 'min', 'subt', 'mul', 'res', 'one',
             'i', 'f', 'o', 'c')

    projections = [
        T.dot(inp, getattr(self, 'W_' + name)) + getattr(self, 'b_' + name)
        for name in names
    ]
    recurrent_weights = [getattr(self, 'U_' + name) for name in names]

    def zero_state():
        # Fresh zero matrix of shape (inp.shape[1], output_dim).
        return T.unbroadcast(
            alloc_zeros_matrix(inp.shape[1], self.output_dim), 1)

    (outputs, memories), updates = theano.scan(
        self._step,
        sequences=projections + [mask],
        outputs_info=[zero_state(), zero_state()],
        non_sequences=recurrent_weights,
        truncate_gradient=self.truncate_gradient)

    if not self.return_sequences:
        return outputs[-1]
    return outputs.dimshuffle((1, 0, 2))
def get_output(self, train=False):
    """Build the symbolic output of this recurrent layer.

    Besides the three input projections and the padded mask, scan also
    receives a copy of the input that has one all-zero step prepended along
    the leading axis.

    Returns the whole output sequence (first two axes swapped back) when
    ``self.return_sequences`` is truthy, otherwise only the last step.
    """
    inp = self.get_input(train)
    mask = self.get_padded_shuffled_mask(train, inp, pad=1)
    # Swap the first two axes so that scan iterates over the former axis 1.
    inp = inp.dimshuffle((1, 0, 2))

    # Prepend one zero step to the (shuffled) input.
    zeros = T.zeros_like(inp)
    inp_prev = T.concatenate(([zeros[0]], inp), axis=0)

    proj_f = T.dot(inp, self.W_xf) + self.b_f
    proj_z = T.dot(inp, self.W_xz) + self.b_z
    proj_o = T.dot(inp, self.W_xo) + self.b_o

    h_init = T.unbroadcast(
        alloc_zeros_matrix(inp.shape[1], self.output_dim), 1)
    c_init = T.unbroadcast(
        alloc_zeros_matrix(inp.shape[1], self.output_dim), 1)

    (outputs, cells), updates = theano.scan(
        self._step,
        sequences=[proj_f, proj_z, proj_o, mask, inp_prev],
        outputs_info=[h_init, c_init],
        non_sequences=[self.U_hf, self.U_xz, self.U_xo],
        truncate_gradient=self.truncate_gradient,
        go_backwards=self.go_backwards)

    if not self.return_sequences:
        return outputs[-1]
    return outputs.dimshuffle((1, 0, 2))
def lstm_cost(self, words):
    """Return a softmax over ``Lprime`` applied to the last scan output.

    ``words`` indexes into ``self.L``; the looked-up rows are projected
    through the four input weight/bias pairs and scanned with ``self._step``.
    """
    embedded = self.L[words]

    in_i = T.dot(embedded, self.W_i) + self.b_i
    in_f = T.dot(embedded, self.W_f) + self.b_f
    in_c = T.dot(embedded, self.W_c) + self.b_c
    in_o = T.dot(embedded, self.W_o) + self.b_o

    initial_states = [
        alloc_zeros_matrix(self.n_lstm_embed),
        alloc_zeros_matrix(self.n_lstm_embed),
    ]
    recurrent_weights = [self.U_i, self.U_f, self.U_o, self.U_c]

    (outputs, memories), updates = theano.scan(
        self._step,
        sequences=[in_i, in_f, in_c, in_o],
        outputs_info=initial_states,
        non_sequences=recurrent_weights,
        truncate_gradient=-1,
    )

    logits = T.dot(self.Lprime, outputs[-1])
    return T.nnet.softmax(logits)
def _get_initial_states(self, X):
    """Return zero-valued initial ``(canvas, init_enc, init_dec)`` states.

    The canvas has the same shape as ``X``; the other two states have shape
    ``(X.shape[0], self.h_dim)``.

    NOTE: variants that add ``self.init_canvas`` / ``self.init_h_enc`` /
    ``self.init_h_dec`` to these zeros were previously present as
    commented-out code and are not applied here.
    """
    n_rows = X.shape[0]
    canvas = alloc_zeros_matrix(*X.shape)
    init_enc = alloc_zeros_matrix(n_rows, self.h_dim)
    init_dec = alloc_zeros_matrix(n_rows, self.h_dim)
    return canvas, init_enc, init_dec
def get_output(self, train=False):
    """Build the symbolic output of this recurrent layer.

    ``self._step`` is scanned over the leading axis of the shuffled input.
    The second recurrent output is fed back with two taps: the previous step
    and the step ``self.sh`` positions back.

    Returns the second scan output: the whole sequence (first two axes
    swapped back) when ``self.return_sequences`` is truthy, otherwise only
    the last step.
    """
    inp = self.get_input(train)
    # Swap the first two axes so that scan iterates over the former axis 1.
    inp = inp.dimshuffle((1, 0, 2))

    first_init = alloc_zeros_matrix(inp.shape[1], self.output_dim)
    # The multi-tap output needs self.sh initial steps.
    second_init = {
        'initial': alloc_zeros_matrix(self.sh, inp.shape[1], self.output_dim),
        'taps': [-1, -self.sh],
    }

    # See: http://deeplearning.net/software/theano/library/scan.html
    (H1, H2), updates = theano.scan(
        self._step,
        sequences=[inp],
        outputs_info=[first_init, second_init])

    if not self.return_sequences:
        return H2[-1]
    return H2.dimshuffle((1, 0, 2))
def get_output(self, train=False):
    """Build the symbolic output of this recurrent layer.

    The shuffled input is scanned directly with ``self._step``; both
    recurrent states start as zero matrices.

    Returns the first scan output: the whole sequence (first two axes
    swapped back) when ``self.return_sequences`` is truthy, otherwise only
    the last step.
    """
    inp = self.get_input(train)
    # Swap the first two axes so that scan iterates over the former axis 1.
    inp = inp.dimshuffle((1, 0, 2))

    initial_states = [
        alloc_zeros_matrix(inp.shape[1], self.output_dim),
        alloc_zeros_matrix(inp.shape[1], self.output_dim),
    ]

    (H1, C1), updates = theano.scan(
        self._step,
        sequences=[inp],
        outputs_info=initial_states,
        truncate_gradient=self.truncate_gradient)

    if not self.return_sequences:
        return H1[-1]
    return H1.dimshuffle((1, 0, 2))
def get_output(self, train):
    '''Build the symbolic output of this recurrent layer.

    The input is projected with ``self.W`` / ``self.b`` and scanned with
    ``self._step``, which receives the step index and the projected slice
    for every step.

    Parameters
    ----------
    train : forwarded unchanged to ``self.get_input``.

    Returns
    -------
    The whole output sequence (first two axes swapped back) when
    ``self.return_sequences`` is truthy, otherwise only the last step.
    '''
    inp = self.get_input(train)
    # Swap the first two axes so that scan iterates over the former axis 1.
    inp = inp.dimshuffle((1, 0, 2))

    projected = E.tools.TT.dot(inp, self.W) + self.b
    step_indices = E.tools.TT.arange(projected.shape[0])
    initial_state = alloc_zeros_matrix(inp.shape[1], self.output_dim)

    outputs, updates = theano.scan(
        self._step,
        sequences=[step_indices, projected],
        outputs_info=initial_state,
        truncate_gradient=self.truncate_gradient,
    )

    if not self.return_sequences:
        return outputs[-1]
    return outputs.dimshuffle((1, 0, 2))
def get_output(self, train=False):
    """Build the symbolic output of this recurrent layer.

    The shuffled input is scanned directly with ``self._step``; both
    recurrent states start as unbroadcast zero matrices.

    Returns the second scan output: the whole sequence (first two axes
    swapped back) when ``self.return_sequences`` is truthy, otherwise only
    the last step.
    """
    inp = self.get_input(train)
    # Swap the first two axes so that scan iterates over the former axis 1.
    inp = inp.dimshuffle((1, 0, 2))

    def zero_state():
        return T.unbroadcast(
            alloc_zeros_matrix(inp.shape[1], self.output_dim), 1)

    # See: http://deeplearning.net/software/theano/library/scan.html
    (H1, H2), updates = theano.scan(
        self._step,
        sequences=[inp],
        outputs_info=[zero_state(), zero_state()],
        truncate_gradient=self.truncate_gradient)

    if not self.return_sequences:
        return H2[-1]
    return H2.dimshuffle((1, 0, 2))
def batchargmax(tensor, maximums):
    """Scan ``batchargmax_helper`` over ``tensor`` and ``maximums`` in lockstep.

    The recurrent state starts as zeros sized by ``tensor.shape[2]``; the
    full sequence of scan results is returned.
    """
    initial = alloc_zeros_matrix(tensor.shape[2])
    scanned, updates = theano.scan(
        fn=batchargmax_helper,
        sequences=[tensor, maximums],
        outputs_info=[initial],
    )
    return scanned
def argmax(tensor, maximums):
    """Scan ``argmax2args_step`` over ``tensor`` and ``maximums``.

    Two recurrent states are carried: a zero vector sized by
    ``tensor.shape[1]`` and a shared scalar starting at -9999999.0.
    Only the final value of the first state is returned.
    """
    initial_value = alloc_zeros_matrix(tensor.shape[1])
    # Very low starting score, cast to the configured float type.
    initial_score = theano.shared(
        np.cast[theano.config.floatX](-9999999.0))

    (values, scores), updates = theano.scan(
        fn=argmax2args_step,
        sequences=[tensor, maximums],
        outputs_info=[initial_value, initial_score],
    )
    return values[-1]
def get_forward_output(self, train):
    """Return the full forward-pass output sequence.

    The input is projected through the four input weight/bias pairs and
    scanned with ``self._forward_step``.  The result always has its first
    two axes swapped back before being returned.
    """
    inp = self.get_input(train)
    # Swap the first two axes so that scan iterates over the former axis 1.
    inp = inp.dimshuffle((1, 0, 2))

    in_i = T.dot(inp, self.W_i) + self.b_i
    in_f = T.dot(inp, self.W_f) + self.b_f
    in_c = T.dot(inp, self.W_c) + self.b_c
    in_o = T.dot(inp, self.W_o) + self.b_o

    def zero_state():
        return T.unbroadcast(
            alloc_zeros_matrix(inp.shape[1], self.output_dim), 1)

    (outputs, memories), updates = theano.scan(
        self._forward_step,
        sequences=[in_i, in_f, in_o, in_c],
        outputs_info=[zero_state(), zero_state()],
        non_sequences=[self.U_i, self.U_f, self.U_o, self.U_c],
        truncate_gradient=self.truncate_gradient,
    )
    return outputs.dimshuffle((1, 0, 2))
def add_state(self, name, dim):
    """Register a new zero-initialised state called ``name``.

    Parameters
    ----------
    name : identifier of the state; must not already be in
        ``self.namespace``.
    dim : size of the second axis of the state matrix.

    Raises
    ------
    Exception
        If ``name`` is already registered.
    """
    if name in self.namespace:
        raise Exception('Duplicate node identifier: ' + name)
    self.namespace.add(name)
    self.state_order.append(name)

    inps = self.input
    if isinstance(inps, dict):
        # dict.values() is not indexable on Python 3; next(iter(...)) picks
        # the same first value on both Python 2 and Python 3.
        first_input = next(iter(inps.values()))
    else:
        first_input = inps
    # The leading axis of the input determines the state's first dimension.
    batch_size = first_input.shape[0]

    self.states[name] = T.unbroadcast(alloc_zeros_matrix(batch_size, dim), 1)
    self.state_config.append({'name': name, 'dim': dim})
def get_gates(self, train=False):
    """Run ``self._debug_step`` over the input and expose the gate values.

    Returns ``(outputs, gates, memories)`` exactly as produced by scan
    (no axis swap is applied to the results).
    """
    inp = self.get_input(train)
    mask = self.get_padded_shuffled_mask(train, inp, pad=1)
    # Swap the first two axes so that scan iterates over the former axis 1.
    inp = inp.dimshuffle((1, 0, 2))

    in_g = T.dot(inp, self.W_g) + self.b_g
    in_c = T.dot(inp, self.W_c) + self.b_c
    in_o = T.dot(inp, self.W_o) + self.b_o

    out_init = T.unbroadcast(
        alloc_zeros_matrix(inp.shape[1], self.output_dim), 1)
    mem_init = T.unbroadcast(
        alloc_zeros_matrix(inp.shape[1], self.output_dim), 1)
    # The gate state carries an extra trailing axis of size 2.
    gate_init = T.unbroadcast(
        alloc_zeros_matrix(inp.shape[1], self.output_dim, 2), 1)

    (outputs, memories, gates), updates = theano.scan(
        self._debug_step,
        sequences=[in_g, in_o, in_c, mask],
        outputs_info=[out_init, mem_init, gate_init],
        non_sequences=[self.U_g, self.U_o, self.U_c],
        truncate_gradient=self.truncate_gradient)

    return outputs, gates, memories
def debug_output(self, train = False, get_tuple = False):
    """Return the scan results plus intermediate tensors for inspection.

    The layer input is a dict with the keys ``'encoder_context'`` and
    ``'recurrent_context'``.  ``get_tuple`` is accepted but not used.

    Returns a 5-tuple: outputs, recurrent input, contexts (these three with
    their first two axes swapped), the raw ``attentionTotal`` scan output
    and the encoder projection through ``self.W_e2a``.
    """
    inputs = self.get_input(train)

    # Keep the first two axes of the encoder context, flatten the rest.
    enc = inputs['encoder_context']
    enc = enc.reshape((enc.shape[0], enc.shape[1], -1))

    # Swap the first two axes so that scan iterates over the former axis 1.
    rec = inputs['recurrent_context'].dimshuffle((1, 0, 2))

    attention_encoder = T.dot(enc, self.W_e2a)

    initial_states = [
        T.unbroadcast(alloc_zeros_matrix(rec.shape[1], rec.shape[0]), 1),
        T.unbroadcast(alloc_zeros_matrix(rec.shape[1], self.enc_dim), 1),
        T.unbroadcast(
            alloc_zeros_matrix(rec.shape[1], rec.shape[0], self.att_dim), 1),
    ]

    (outputs, contexts, attentionTotal), updates = theano.scan(
        self._step,
        sequences=[rec],
        outputs_info=initial_states,
        non_sequences=[enc, attention_encoder],
        truncate_gradient=self.truncate_gradient,
        go_backwards=self.go_backwards)

    return (outputs.dimshuffle((1, 0, 2)),
            rec.dimshuffle((1, 0, 2)),
            contexts.dimshuffle((1, 0, 2)),
            attentionTotal,
            attention_encoder)
def memnn_cost(self, statements, question, ans, pe_matrix):
    """Score four candidate answers with a three-hop memory lookup.

    Memories are computed by scanning ``self._compute_memories`` over
    ``statements``.  The question is embedded by summing rows of
    ``self.weights[0]``; each hop attends over one memory slice, reads from
    the next one and adds ``self.H`` applied to the previous query.  The
    four entries of ``ans`` are embedded with ``self.A`` and compared with
    the final query through ``self.U``.

    Returns the first row of the softmax over the answer scores.
    """
    computed_memories, updates = theano.scan(
        self._compute_memories,
        sequences=[statements],
        outputs_info=[
            alloc_zeros_matrix(self.weights.shape[0], self.n_embedding),
        ],
        non_sequences=[self.weights.dimshuffle(1, 0, 2), pe_matrix],
        truncate_gradient=-1,
    )
    # Swap the first two axes so that memories[k] selects one slice.
    memories = T.stacklists(computed_memories).dimshuffle(1, 0, 2)

    # Question embedding
    query1 = T.sum(self.weights[0][question], axis=0)

    # Hop 1
    attn = T.nnet.softmax(T.dot(query1, memories[0].T))
    read1 = T.dot(attn, memories[1])

    # Hop 2
    query2 = read1 + T.dot(query1, self.H)
    attn = T.nnet.softmax(T.dot(query2, memories[1].T))
    read2 = T.dot(attn, memories[2])

    # Hop 3
    query3 = read2 + T.dot(query2, self.H)
    attn = T.nnet.softmax(T.dot(query3, memories[2].T))
    read3 = T.dot(attn, memories[3])

    # Query used for answer scoring
    query4 = read3 + T.dot(query3, self.H)

    # Answer embeddings, one per candidate
    cand0 = T.sum(self.A[ans[0]], axis=0)
    cand1 = T.sum(self.A[ans[1]], axis=0)
    cand2 = T.sum(self.A[ans[2]], axis=0)
    cand3 = T.sum(self.A[ans[3]], axis=0)
    candidates = T.stack(cand0, cand1, cand2, cand3)

    scores = T.dot(T.dot(query4, self.U.T), T.dot(self.U, candidates.T))
    return T.nnet.softmax(scores)[0]
def get_output(self, train=False):
    """Build the symbolic output of this recurrent layer.

    Five input projections (``sum``, ``i``, ``f``, ``c``, ``o``) and the
    padded mask are scanned with ``self._step``; the matching ``U_*``
    matrices are passed as non-sequences.

    Returns the whole output sequence (first two axes swapped back) when
    ``self.return_sequences`` is truthy, otherwise only the last step.
    """
    inp = self.get_input(train)
    mask = self.get_padded_shuffled_mask(train, inp, pad=1)
    # Swap the first two axes so that scan iterates over the former axis 1.
    inp = inp.dimshuffle((1, 0, 2))

    in_sum = T.dot(inp, self.W_sum) + self.b_sum
    in_i = T.dot(inp, self.W_i) + self.b_i
    in_f = T.dot(inp, self.W_f) + self.b_f
    in_c = T.dot(inp, self.W_c) + self.b_c
    in_o = T.dot(inp, self.W_o) + self.b_o

    def zero_state():
        return T.unbroadcast(
            alloc_zeros_matrix(inp.shape[1], self.output_dim), 1)

    (outputs, memories), updates = theano.scan(
        self._step,
        sequences=[in_sum, in_i, in_f, in_o, in_c, mask],
        outputs_info=[zero_state(), zero_state()],
        non_sequences=[self.U_sum, self.U_i, self.U_f, self.U_o, self.U_c],
        truncate_gradient=self.truncate_gradient)

    if not self.return_sequences:
        return outputs[-1]
    return outputs.dimshuffle((1, 0, 2))
def get_output(self, train):
    """Build the symbolic output of this recurrent layer.

    The input is projected twice (``W``/``b`` and ``W_gate``/``b_gate``).
    ``self._step`` receives the step index plus both projected slices.

    Returns the whole output sequence (first two axes swapped back) when
    ``self.return_sequences`` is truthy, otherwise only the last step.
    """
    inp = self.get_input(train)
    # Swap the first two axes so that scan iterates over the former axis 1.
    inp = inp.dimshuffle((1, 0, 2))

    projected = TT.dot(inp, self.W) + self.b
    projected_gate = TT.dot(inp, self.W_gate) + self.b_gate
    step_indices = E.tools.TT.arange(projected.shape[0])
    initial_state = alloc_zeros_matrix(inp.shape[1], self.output_dim)

    outputs, updates = theano.scan(
        self._step,
        sequences=[step_indices, projected, projected_gate],
        outputs_info=[initial_state],
        truncate_gradient=self.truncate_gradient
    )

    if not self.return_sequences:
        return outputs[-1]
    return outputs.dimshuffle((1, 0, 2))
def get_output(self, train=False):
    """Build the symbolic output of this recurrent layer.

    Two input projections and the padded mask are scanned with
    ``self._step``.  The second recurrent state has an extra
    ``self.n_experts`` axis.

    When sequences are returned and the scan ran backwards, the outputs are
    reversed along the leading axis before the axes are swapped back.
    Otherwise the last scan step is returned.
    """
    inp = self.get_input(train)
    mask = self.get_padded_shuffled_mask(train, inp, pad=1)
    # Swap the first two axes so that scan iterates over the former axis 1.
    inp = inp.dimshuffle((1, 0, 2))

    in_e = T.dot(inp, self.W_x2e) + self.b_x2e
    in_g = T.dot(inp, self.W_x2g) + self.b_x2g

    out_init = T.unbroadcast(
        alloc_zeros_matrix(inp.shape[1], self.output_dim), 1)
    expert_init = T.unbroadcast(
        alloc_zeros_matrix(inp.shape[1], self.n_experts, self.output_dim), 1)

    (outputs, expert_memory), updates = theano.scan(
        self._step,
        sequences=[in_e, in_g, mask],
        outputs_info=[out_init, expert_init],
        truncate_gradient=self.truncate_gradient,
        go_backwards=self.go_backwards,
    )

    if not self.return_sequences:
        return outputs[-1]
    if self.go_backwards:
        # Undo the reversed iteration order.
        outputs = outputs[::-1]
    return outputs.dimshuffle((1, 0, 2))
def get_output(self, train=False):
    """Build the symbolic output of this recurrent layer.

    Two input projections and the padded mask are scanned with
    ``self._step``; ``U_o``, ``U_c``, ``W_maxout`` and ``b_maxout`` are
    passed as non-sequences.

    When sequences are returned and the scan ran backwards, the outputs are
    reversed along the leading axis before the axes are swapped back.
    Otherwise the last scan step is returned.
    """
    inp = self.get_input(train)
    mask = self.get_padded_shuffled_mask(train, inp, pad=1)
    # Swap the first two axes so that scan iterates over the former axis 1.
    inp = inp.dimshuffle((1, 0, 2))

    in_c = T.dot(inp, self.W_c) + self.b_c
    in_o = T.dot(inp, self.W_o) + self.b_o

    def zero_state():
        return T.unbroadcast(
            alloc_zeros_matrix(inp.shape[1], self.output_dim), 1)

    (outputs, memories), updates = theano.scan(
        self._step,
        sequences=[in_o, in_c, mask],
        outputs_info=[zero_state(), zero_state()],
        non_sequences=[self.U_o, self.U_c, self.W_maxout, self.b_maxout],
        truncate_gradient=self.truncate_gradient,
        go_backwards=self.go_backwards)

    if not self.return_sequences:
        return outputs[-1]
    if self.go_backwards:
        # Undo the reversed iteration order.
        outputs = outputs[::-1]
    return outputs.dimshuffle((1, 0, 2))
def get_output(self, train):
    """Build the symbolic output of this recurrent layer.

    The single recurrent output is fed back with taps -1, -2 and -3, so the
    initial state holds three zero steps.

    Returns the whole output sequence (first two axes swapped back) when
    ``self.return_sequences`` is truthy, otherwise only the last step.
    """
    inp = self.get_input(train)
    # Swap the first two axes so that scan iterates over the former axis 1.
    inp = inp.dimshuffle((1, 0, 2))

    projected = TT.dot(inp, self.W) + self.b
    projected_gate = TT.dot(inp, self.W_gate) + self.b_gate

    history = {
        'initial': alloc_zeros_matrix(3, inp.shape[1], self.output_dim),
        'taps': [-1, -2, -3],
    }

    outputs, updates = theano.scan(
        self._step,
        sequences=[projected, projected_gate],
        outputs_info=[history],
        non_sequences=[self.U, self.U_gate],
        truncate_gradient=self.truncate_gradient
    )

    if not self.return_sequences:
        return outputs[-1]
    return outputs.dimshuffle((1, 0, 2))
def get_padded_shuffled_mask(self, train, X, pad=0):
    """Return the input mask with a trailing axis, its first two axes swapped.

    If the layer has no input mask, an all-ones mask shaped like ``X`` with
    its last axis summed away is used.  With ``pad > 0``, ``pad`` zero steps
    are prepended along the leading axis.  The result is cast to
    ``theano.config.floatX``.
    """
    mask = self.get_input_mask(train)
    if mask is None:
        # Summing over the last axis is only used to obtain the right shape.
        mask = T.ones_like(X.sum(axis=-1))

    # Append a broadcastable trailing axis, then swap the first two axes.
    mask = T.addbroadcast(T.shape_padright(mask), -1)
    mask = mask.dimshuffle(1, 0, 2)

    if pad > 0:
        # Left-pad the leading axis with zeros.
        zeros = alloc_zeros_matrix(pad, mask.shape[1], 1)
        mask = T.concatenate([zeros, mask], axis=0)

    return mask.astype(theano.config.floatX)
def memnn_cost(self, statements, question, pe_matrix):
    """Return the output distribution of a three-hop memory lookup.

    Memories are computed by scanning ``self._compute_memories`` over
    ``statements``.  ``question`` is used directly as the first query (no
    embedding lookup).  Each hop attends over one memory slice, reads from
    the next one and adds ``self.H`` applied to the previous query.

    Returns the first row of the softmax over ``self.weights[3]``.
    """
    computed_memories, updates = theano.scan(
        self._compute_memories,
        sequences=statements,
        outputs_info=[
            # NOTE(review): 4800 is hard-coded; presumably the width of the
            # pre-encoded sentence vectors -- confirm against the caller.
            alloc_zeros_matrix(self.weights.shape[0], 4800)
        ],
        non_sequences=[
            self.weights,
            pe_matrix
        ],
        truncate_gradient=-1,
    )

    # Swap the first two axes so that memories[k] selects one slice.
    memories = T.stacklists(computed_memories).dimshuffle(1, 0, 2)

    # The question is already a vector; use it as-is.
    u1 = question

    # Layer 1
    p = T.nnet.softmax(T.dot(u1, memories[0].T))
    o1 = T.dot(p, memories[1])

    # Layer 2
    u2 = o1 + T.dot(u1, self.H)
    p = T.nnet.softmax(T.dot(u2, memories[1].T))
    o2 = T.dot(p, memories[2])

    # Layer 3
    u3 = o2 + T.dot(u2, self.H)
    p = T.nnet.softmax(T.dot(u3, memories[2].T))
    o3 = T.dot(p, memories[3])

    # Final
    output = T.nnet.softmax(T.dot(o3 + u3, self.weights[3].T))

    # Call form prints the same text on Python 2 and is valid on Python 3.
    print("memnn_cost running")

    return output[0]
def get_output(self, train):
    """Build the symbolic output of this recurrent layer.

    Three input projections (``z``, ``r``, ``h``) are scanned with
    ``self._step``; ``U_z``, ``U_r`` and ``U_h`` are passed as
    non-sequences.

    Returns the whole output sequence (first two axes swapped back) when
    ``self.return_sequences`` is truthy, otherwise only the last step.
    """
    inp = self.get_input(train)
    # NOTE(review): only the axis swap is taken to be conditional on a
    # 3-D input -- confirm against the original layout.
    if inp.ndim == 3:
        inp = inp.dimshuffle((1, 0, 2))

    in_z = TT.dot(inp, self.W_z) + self.b_z
    in_r = TT.dot(inp, self.W_r) + self.b_r
    in_h = TT.dot(inp, self.W_h) + self.b_h

    initial_state = alloc_zeros_matrix(inp.shape[1], self.output_dim)

    outputs, updates = theano.scan(
        self._step,
        sequences=[in_z, in_r, in_h],
        outputs_info=initial_state,
        non_sequences=[self.U_z, self.U_r, self.U_h],
        truncate_gradient=self.truncate_gradient
    )

    if not self.return_sequences:
        return outputs[-1]
    return outputs.dimshuffle((1, 0, 2))
def get_output(self, train=False):
    """Build the symbolic output of this recurrent layer.

    The first recurrent state starts from ``self.h_m1`` repeated
    ``X.shape[1]`` times along axis 0 (after the axis swap) instead of
    zeros; the second state starts at zero.

    Returns
    -------
    When ``self.return_sequences`` is truthy, a tuple
    ``(sequence, mask)`` where ``sequence`` is the initial state followed by
    all scan outputs (first two axes swapped back) and ``mask`` is
    ``mask[1:]`` with its first two axes swapped.  Otherwise only the last
    scan output.
    """
    X = self.get_input(train)
    mask = self.get_padded_shuffled_mask(train, X, pad=0)
    # Swap the first two axes so that scan iterates over the former axis 1.
    X = X.dimshuffle((1, 0, 2))
    Y = T.dot(X, self.W) + self.b

    h0 = T.repeat(self.h_m1, X.shape[1], axis=0)
    c0 = T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1)

    [outputs, _], updates = theano.scan(
        self._step,
        sequences=[Y, mask],
        outputs_info=[h0, c0],
        non_sequences=[self.R],
        truncate_gradient=self.truncate_gradient,
        strict=True,
        allow_gc=theano.config.scan.allow_gc)

    if self.return_sequences:
        # T.concatenate takes ONE list of tensors; passing the tensors as
        # separate positional arguments collides with the ``axis`` keyword.
        with_initial = T.concatenate(
            [h0.dimshuffle('x', 0, 1), outputs], axis=0)
        # NOTE(review): the mask was built with pad=0, so mask[1:] is two
        # steps shorter than ``with_initial`` -- confirm this is intended.
        return (with_initial.dimshuffle((1, 0, 2)),
                mask[1:].dimshuffle(1, 0, 2))
    return outputs[-1]
def memnn_cost(self, statements, question, pe_matrix):
    """Return the output distribution of a three-hop memory lookup.

    Memories are computed by scanning ``self._compute_memories`` over
    ``statements``.  The question is embedded by summing rows of
    ``self.weights[0]``; each hop attends over one memory slice, reads from
    the next one and adds ``self.H`` applied to the previous query.

    Returns the first row of the softmax over ``self.weights[3]``.
    """
    computed_memories, updates = theano.scan(
        self._compute_memories,
        sequences=[statements],
        outputs_info=[
            alloc_zeros_matrix(self.weights.shape[0], self.n_embedding),
        ],
        non_sequences=[self.weights.dimshuffle(1, 0, 2), pe_matrix],
        truncate_gradient=-1,
    )
    # Swap the first two axes so that memories[k] selects one slice.
    memories = T.stacklists(computed_memories).dimshuffle(1, 0, 2)

    # Question embedding
    query1 = T.sum(self.weights[0][question], axis=0)

    # Hop 1
    attn = T.nnet.softmax(T.dot(query1, memories[0].T))
    read1 = T.dot(attn, memories[1])

    # Hop 2
    query2 = read1 + T.dot(query1, self.H)
    attn = T.nnet.softmax(T.dot(query2, memories[1].T))
    read2 = T.dot(attn, memories[2])

    # Hop 3
    query3 = read2 + T.dot(query2, self.H)
    attn = T.nnet.softmax(T.dot(query3, memories[2].T))
    read3 = T.dot(attn, memories[3])

    # Final projection onto the last weight slice
    distribution = T.nnet.softmax(T.dot(read3 + query3, self.weights[3].T))
    return distribution[0]
def get_output(self, train=False):
    """Build the symbolic output of this recurrent layer.

    The ``f`` and ``z`` projections are passed through
    ``self.inner_activation`` / ``self.activation`` before the scan; the
    ``o`` projection is left linear.  When ``self.p > 0`` the ``z`` and ``o``
    sequences are either multiplied by a random binomial mask (``train``
    truthy) or scaled by ``1 - self.p`` (otherwise).  The ``f`` sequence is
    never masked or scaled.

    Returns the whole output sequence (first two axes swapped back) when
    ``self.return_sequences`` is truthy, otherwise only the last step.
    """
    inp = self.get_input(train)
    mask = self.get_padded_shuffled_mask(train, inp, pad=1)
    # Swap the first two axes so that scan iterates over the former axis 1.
    inp = inp.dimshuffle((1, 0, 2))

    seq_f = self.inner_activation(T.dot(inp, self.W_xf) + self.b_f)
    seq_z = self.activation(T.dot(inp, self.W_xz) + self.b_z)
    seq_o = T.dot(inp, self.W_xo) + self.b_o

    if self.p > 0:
        keep = 1. - self.p
        if train:
            # Draw the z mask before the o mask to keep the sampling order.
            seq_z = seq_z * self.srng.binomial(
                seq_z.shape, p=keep, dtype=theano.config.floatX)
            seq_o = seq_o * self.srng.binomial(
                seq_o.shape, p=keep, dtype=theano.config.floatX)
        else:
            seq_z = seq_z * keep
            seq_o = seq_o * keep

    h_init = T.unbroadcast(
        alloc_zeros_matrix(inp.shape[1], self.output_dim), 1)

    outputs, updates = theano.scan(
        self._step,
        sequences=[seq_f, seq_z, seq_o, mask],
        outputs_info=[h_init],
        truncate_gradient=self.truncate_gradient,
        go_backwards=self.go_backwards)

    if not self.return_sequences:
        return outputs[-1]
    return outputs.dimshuffle((1, 0, 2))
def get_output(self, train=False):
    """Build the symbolic output of this recurrent layer.

    The first recurrent state starts from ``self.h00`` repeated
    ``X.shape[1]`` times along axis 0 (after the axis swap) instead of
    zeros; the second state starts at zero.

    Returns
    -------
    When ``self.return_sequences`` is truthy, a tuple
    ``(sequence, mask)`` where ``sequence`` is the initial state followed by
    all scan outputs (first two axes swapped back) and ``mask`` is
    ``padded_mask[1:]`` with its first two axes swapped.  Otherwise only the
    last scan output.
    """
    X = self.get_input(train)
    padded_mask = self.get_padded_shuffled_mask(train, X, pad=0)
    # Swap the first two axes so that scan iterates over the former axis 1.
    X = X.dimshuffle((1, 0, 2))

    xi = T.dot(X, self.W_i) + self.b_i
    xf = T.dot(X, self.W_f) + self.b_f
    xc = T.dot(X, self.W_c) + self.b_c
    xo = T.dot(X, self.W_o) + self.b_o

    h0 = T.repeat(self.h00, X.shape[1], axis=0)
    c0 = T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1)

    [outputs, _], updates = theano.scan(
        self._step,
        sequences=[xi, xf, xo, xc, padded_mask],
        outputs_info=[h0, c0],
        non_sequences=[self.U_i, self.U_f, self.U_o, self.U_c],
        truncate_gradient=self.truncate_gradient)

    if self.return_sequences:
        # T.concatenate takes ONE list of tensors; passing the tensors as
        # separate positional arguments collides with the ``axis`` keyword.
        with_initial = T.concatenate(
            [h0.dimshuffle('x', 0, 1), outputs], axis=0)
        # NOTE(review): the mask was built with pad=0, so padded_mask[1:] is
        # two steps shorter than ``with_initial`` -- confirm this is intended.
        return (with_initial.dimshuffle((1, 0, 2)),
                padded_mask[1:].dimshuffle(1, 0, 2))
    return outputs[-1]
def get_initial_states(self, inputs):
    """Return ``(zero_state, u_init)`` for the given inputs.

    The first state is a zero matrix of shape
    ``(inputs.shape[0], self.output_dim)``; the second is drawn uniformly
    from [0, 1) with shape ``(inputs.shape[0], self.causes_dim)``.
    """
    n_rows = inputs.shape[0]
    u_init = theano_rng.uniform(
        low=0, high=1, size=(n_rows, self.causes_dim))
    zero_state = alloc_zeros_matrix(inputs.shape[0], self.output_dim)
    return (zero_state, u_init)
def get_initial_states(self, inputs):
    """Return ``(zero_state, u_init)`` for the given inputs.

    The first state is a zero matrix of shape
    ``(inputs.shape[0], self.output_dim)``; the second is a matrix of shape
    ``(inputs.shape[0], self.causes_dim)`` filled with 0.1.
    """
    causes_zeros = alloc_zeros_matrix(inputs.shape[0], self.causes_dim)
    u_init = causes_zeros + .1
    zero_state = alloc_zeros_matrix(inputs.shape[0], self.output_dim)
    return (zero_state, u_init)
def get_initial_states(self, X):
    """Return a zero initial state.

    Its shape is ``(X.shape[0], self.stack_size, self.code_row,
    self.code_col)``.
    """
    state_shape = (X.shape[0], self.stack_size, self.code_row, self.code_col)
    return alloc_zeros_matrix(*state_shape)
def get_initial_states(self, X):
    """Return a zero initial state of shape ``(X.shape[0], self.output_dim)``."""
    n_rows = X.shape[0]
    initial_state = alloc_zeros_matrix(n_rows, self.output_dim)
    return initial_state