def __init__(self, input_size, hidden_size, output_size):
    """Build the symbolic cost graph: Linear -> LSTM -> Linear -> Logistic,
    scored against the targets with binary cross-entropy.

    Inputs and targets are 3D float tensors ('x' and 'y'); the resulting
    cost and its ComputationGraph are stored on the instance.
    """
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.output_size = output_size
    # Symbolic input sequence and target sequence.
    x = tensor.tensor3('x', dtype=floatX)
    y = tensor.tensor3('y', dtype=floatX)
    # The Blocks LSTM brick consumes pre-computed gate inputs of width
    # 4 * hidden_size, hence the 4x output dimension here.
    x_to_lstm = Linear(name="x_to_lstm", input_dim=input_size,
                       output_dim=4 * hidden_size,
                       weights_init=IsotropicGaussian(),
                       biases_init=Constant(0))
    lstm = LSTM(dim=hidden_size, name="lstm",
                weights_init=IsotropicGaussian(),
                biases_init=Constant(0))
    lstm_to_output = Linear(name="lstm_to_output", input_dim=hidden_size,
                            output_dim=output_size,
                            weights_init=IsotropicGaussian(),
                            biases_init=Constant(0))
    x_transform = x_to_lstm.apply(x)
    # h: hidden states, c: cell states (c is unused below).
    h, c = lstm.apply(x_transform)
    y_hat = lstm_to_output.apply(h)
    # Squash the readout into (0, 1) so it can be scored with cross-entropy.
    y_hat = Logistic(name="y_hat").apply(y_hat)
    self.cost = BinaryCrossEntropy(name="cost").apply(y, y_hat)
    # Parameter initialization happens after the bricks are configured.
    x_to_lstm.initialize()
    lstm.initialize()
    lstm_to_output.initialize()
    self.computation_graph = ComputationGraph(self.cost)
def main(max_seq_length, lstm_dim, batch_size, num_batches, num_epochs):
    """Train a single-layer LSTM binary classifier on generated sequences.

    Builds a Linear -> LSTM -> Linear -> Logistic graph, trains it with
    Adam for `num_epochs` epochs while monitoring the cost on a held-out
    stream of 100 batches, then prints the learned parameters.

    Returns the Blocks ``MainLoop`` so callers can inspect the training log.
    """
    dataset_train = IterableDataset(
        generate_data(max_seq_length, batch_size, num_batches))
    dataset_test = IterableDataset(
        generate_data(max_seq_length, batch_size, 100))
    stream_train = DataStream(dataset=dataset_train)
    stream_test = DataStream(dataset=dataset_test)

    x = T.tensor3('x')
    y = T.matrix('y')

    # we need to provide data for the LSTM layer of size 4 * lstm_dim, see
    # LSTM layer documentation for the explanation
    x_to_h = Linear(1, lstm_dim * 4, name='x_to_h',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))
    lstm = LSTM(lstm_dim, name='lstm',
                weights_init=IsotropicGaussian(),
                biases_init=Constant(0.0))
    h_to_o = Linear(lstm_dim, 1, name='h_to_o',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))

    x_transform = x_to_h.apply(x)
    h, c = lstm.apply(x_transform)

    # only values of hidden units of the last timeframe are used for
    # the classification
    y_hat = h_to_o.apply(h[-1])
    y_hat = Logistic().apply(y_hat)

    cost = BinaryCrossEntropy().apply(y, y_hat)
    cost.name = 'cost'

    lstm.initialize()
    x_to_h.initialize()
    h_to_o.initialize()

    cg = ComputationGraph(cost)
    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=Adam())
    test_monitor = DataStreamMonitoring(variables=[cost],
                                        data_stream=stream_test,
                                        prefix="test")
    train_monitor = TrainingDataMonitoring(variables=[cost], prefix="train",
                                           after_epoch=True)

    main_loop = MainLoop(algorithm, stream_train,
                         extensions=[test_monitor, train_monitor,
                                     FinishAfter(after_n_epochs=num_epochs),
                                     Printing(), ProgressBar()])
    main_loop.run()

    # FIX: the original used Python 2 `print` statements, which are syntax
    # errors under Python 3; the rest of this file already uses print().
    print('Learned weights:')
    for layer in (x_to_h, lstm, h_to_o):
        print("Layer '%s':" % layer.name)
        for param in layer.parameters:
            print(param.name, ': ', param.get_value())
        print()
    return main_loop
def create_model(self):
    """Build the forward graph: embedding lookup -> LSTM -> sigmoid MLP.

    Returns the MLP output for the last valid (unmasked) timestep of each
    sequence in the batch.
    """
    # NOTE(review): y and p are bound here but unused in this method.
    input_dim = self.input_dim
    x = self.x
    y = self.y
    p = self.p
    mask = self.mask
    hidden_dim = self.hidden_dim
    embedding_dim = self.embedding_dim
    lookup = LookupTable(self.dict_size, embedding_dim,
                         weights_init=IsotropicGaussian(0.001),
                         name='LookupTable')
    # The LSTM brick consumes gate inputs of width 4 * hidden_dim.
    x_to_h = Linear(embedding_dim, hidden_dim * 4, name='x_to_h',
                    weights_init=IsotropicGaussian(0.001),
                    biases_init=Constant(0.0))
    lstm = LSTM(hidden_dim, name='lstm',
                weights_init=IsotropicGaussian(0.001),
                biases_init=Constant(0.0))
    h_to_o = MLP([Logistic()], [hidden_dim, 1],
                 weights_init=IsotropicGaussian(0.001),
                 biases_init=Constant(0),
                 name='h_to_o')
    lookup.initialize()
    x_to_h.initialize()
    lstm.initialize()
    h_to_o.initialize()
    embed = lookup.apply(x).reshape(
        (x.shape[0], x.shape[1], self.embedding_dim))
    embed.name = "embed_vec"
    # Transpose to time x batch x features, the layout the LSTM expects.
    x_transform = x_to_h.apply(embed.transpose(1, 0, 2))
    x_transform.name = "Transformed X"
    self.lookup = lookup
    self.x_to_h = x_to_h
    self.lstm = lstm
    self.h_to_o = h_to_o
    # Mask handling currently disabled:
    #if mask is None:
    h, c = lstm.apply(x_transform)
    #else:
    #h, c = lstm.apply(x_transform, mask=mask)
    h.name = "hidden_state"
    c.name = "cell state"
    # only values of hidden units of the last timeframe are used for
    # the classification
    indices = T.sum(mask, axis=0) - 1  # per-example index of last valid step
    rel_hid = h[indices, T.arange(h.shape[1])]
    out = self.h_to_o.apply(rel_hid)
    probs = out
    return probs
class questionEncoder:
    """Bidirectional LSTM encoder for variable-length, zero-padded questions.

    Forward and backward word sequences are projected by separate Linear
    bricks into the 4 * hidden_dim gate inputs the Blocks LSTM expects.
    """

    def __init__(self, word_dim, hidden_dim):
        self.forward_lstm = LSTM(hidden_dim,
                                 name='question_forward_lstm',
                                 weights_init=IsotropicGaussian(0.01),
                                 biases_init=Constant(0))
        self.backward_lstm = LSTM(hidden_dim,
                                  name='question_backward_lstm',
                                  weights_init=IsotropicGaussian(0.01),
                                  biases_init=Constant(0))
        self.x_to_h_forward = Linear(word_dim, hidden_dim * 4,
                                     name='word_x_to_h_forward',
                                     weights_init=IsotropicGaussian(0.01),
                                     biases_init=Constant(0))
        self.x_to_h_backward = Linear(word_dim, hidden_dim * 4,
                                      name='word_x_to_h_backward',
                                      weights_init=IsotropicGaussian(0.01),
                                      biases_init=Constant(0))
        self.forward_lstm.initialize()
        self.backward_lstm.initialize()
        self.x_to_h_forward.initialize()
        self.x_to_h_backward.initialize()

    # variable question length
    # words: batch_size x q x word_dim
    # words_reverse: the reversed sentence of words,
    #                padded with 0 to max length q
    # mask: batch_size
    def apply(self, words, words_reverse, mask_, batch_size):
        mask = mask_.flatten()
        # batch_size x q x hidden_dim
        Wx = self.x_to_h_forward.apply(words)
        Wx_r = self.x_to_h_backward.apply(words_reverse)
        # q x batch_size x hidden_dim
        Wx = Wx.swapaxes(0, 1)
        Wx_r = Wx_r.swapaxes(0, 1)
        # q x batch_size x hidden_dim
        hf, cf = self.forward_lstm.apply(Wx)
        hb, cb = self.backward_lstm.apply(Wx_r)
        # Put the backward states back into forward word order, per example.
        # BUG FIX: T.set_subtensor is functional -- it returns a new variable
        # and does not modify its argument in place.  The original code
        # discarded the result, so the reversal never happened; the result
        # must be rebound to hb.
        for i in range(batch_size):
            hb = T.set_subtensor(hb[0:mask[i] + 1, i, :],
                                 hb[0:mask[i] + 1, i, :][::-1])
        # q x batch_size x (2 x hidden_dim)
        h = T.concatenate([hf, hb], axis=2)
        # batch_size x hidden_dim
        y_q = hf[mask, range(batch_size), :]
        y_1 = hb[0, range(batch_size), :]
        return h.swapaxes(0, 1), y_q, y_1
def example4():
    """LSTM demo -> crashes while the LSTM is being initialized."""
    x = tensor.tensor3('x')
    dim = 3
    # gate_inputs = theano.function([x],x*4)
    # Linear brick producing the 4 * dim gate pre-activations the LSTM needs.
    gate_inputs = Linear(input_dim=dim,
                         output_dim=dim * 4,
                         name="linear",
                         weights_init=initialization.Identity(),
                         biases_init=Constant(2))
    lstm = LSTM(dim=dim,
                activation=Tanh(),
                weights_init=IsotropicGaussian(),
                biases_init=Constant(0))
    gate_inputs.initialize()
    hg = gate_inputs.apply(x)
    #print(gate_inputs.parameters)
    #print(gate_inputs.parameters[1].get_value())
    lstm.initialize()
    h, cells = lstm.apply(hg)
    print(lstm.parameters)
    f = theano.function([x], h)
    # Run the compiled graph on a few constant inputs.
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))
    print(f(4 * np.ones((dim, 1, dim), dtype=theano.config.floatX)))
    print("Good Job!")
    # lstm_output =
    # Initial state: drive the same LSTM from explicit states/cells inputs.
    h0 = tensor.matrix('h0')
    c = tensor.matrix('cells')
    h, c1 = lstm.apply(
        inputs=x,
        states=h0,
        cells=c)
    # lstm.apply(states=h0,cells=cells,inputs=gate_inputs)
    f = theano.function([x, h0, c], h)
    print("a")
    print(
        f(np.ones((3, 1, 3), dtype=theano.config.floatX),
          np.ones((1, 3), dtype=theano.config.floatX),
          np.ones((1, 3), dtype=theano.config.floatX)))
def __init__(self, input1_size, input2_size, lookup1_dim=200, lookup2_dim=200, hidden_size=512):
    """Build a two-input (durations + syllables) LSTM model predicting
    pitches through a softmax cross-entropy cost.
    """
    self.hidden_size = hidden_size
    self.input1_size = input1_size
    self.input2_size = input2_size
    self.lookup1_dim = lookup1_dim
    self.lookup2_dim = lookup2_dim

    # Symbolic integer matrices for the two input streams and the targets.
    x1 = tensor.lmatrix('durations')
    x2 = tensor.lmatrix('syllables')
    y = tensor.lmatrix('pitches')

    # One embedding table per input stream.
    lookup1 = LookupTable(dim=self.lookup1_dim, length=self.input1_size,
                          name='lookup1',
                          weights_init=initialization.Uniform(width=0.01),
                          biases_init=Constant(0))
    lookup1.initialize()
    lookup2 = LookupTable(dim=self.lookup2_dim, length=self.input2_size,
                          name='lookup2',
                          weights_init=initialization.Uniform(width=0.01),
                          biases_init=Constant(0))
    lookup2.initialize()
    # Merge fuses the two embeddings into a single hidden_size-wide input.
    merge = Merge(['lookup1', 'lookup2'],
                  [self.lookup1_dim, self.lookup2_dim],
                  self.hidden_size,
                  weights_init=initialization.Uniform(width=0.01),
                  biases_init=Constant(0))
    merge.initialize()
    # NOTE(review): the Blocks LSTM brick expects gate inputs of width
    # 4 * hidden_size, but `merge` outputs hidden_size, and LSTM.apply
    # returns (states, cells) rather than a single tensor -- confirm this
    # graph actually builds as intended.
    recurrent_block = LSTM(dim=self.hidden_size, activation=Tanh(),
                           weights_init=initialization.Uniform(width=0.01))
    #RecurrentStack([LSTM(dim=self.hidden_size, activation=Tanh())] * 3)
    recurrent_block.initialize()
    linear = Linear(input_dim=self.hidden_size, output_dim=self.input1_size,
                    weights_init=initialization.Uniform(width=0.01),
                    biases_init=Constant(0))
    linear.initialize()
    softmax = NDimensionalSoftmax()

    l1 = lookup1.apply(x1)
    l2 = lookup2.apply(x2)
    m = merge.apply(l1, l2)
    h = recurrent_block.apply(m)
    a = linear.apply(h)
    y_hat = softmax.apply(a, extra_ndim=1)
    # ValueError: x must be 1-d or 2-d tensor of floats. Got TensorType(float64, 3D)
    self.Cost = softmax.categorical_cross_entropy(y, a, extra_ndim=1).mean()
    self.ComputationGraph = ComputationGraph(self.Cost)
    self.Model = Model(y_hat)
class seqDecoder:
    """Two-stage recurrent decoder: recurrent block A runs over a constant
    "padding" input from initial state A, its states are projected into the
    gate inputs of block B, and two ReLU fully-connected layers produce the
    final features.

    NOTE(review): despite the GRU_A / GRU_B names, both blocks are LSTM
    bricks.
    """

    def __init__(self, feature_dim, memory_dim, fc1_dim, fc2_dim):
        # Projects A's states into the 4 * memory_dim gate inputs of B.
        self.W = Linear(input_dim=feature_dim,
                        output_dim=memory_dim * 4,
                        weights_init=IsotropicGaussian(0.01),
                        biases_init=Constant(0),
                        use_bias=False,
                        name='seqDecoder_W')
        self.GRU_A = LSTM(feature_dim,
                          name='seqDecoder_A',
                          weights_init=IsotropicGaussian(0.01),
                          biases_init=Constant(0))
        self.GRU_B = LSTM(memory_dim,
                          name='seqDecoder_B',
                          weights_init=IsotropicGaussian(0.01),
                          biases_init=Constant(0))
        self.W.initialize()
        self.GRU_A.initialize()
        self.GRU_B.initialize()
        self.fc1 = Linear(input_dim=memory_dim,
                          output_dim=fc1_dim,
                          weights_init=IsotropicGaussian(0.01),
                          biases_init=Constant(0),
                          name='fc1')
        self.fc2 = Linear(input_dim=fc1_dim,
                          output_dim=fc2_dim,
                          weights_init=IsotropicGaussian(0.01),
                          biases_init=Constant(0),
                          name='fc2')
        self.fc1.initialize()
        self.fc2.initialize()

    # A: the encoding of GRU_A,
    # B: the encoding of GRU_B
    # padding: the tensor constant
    def apply(self, output_length, A, B, padding):
        # NOTE(review): output_length is unused -- the sequence length is
        # taken from `padding`'s leading axis.
        A_, garbage = self.GRU_A.apply(padding, states=A)
        WA_ = self.W.apply(A_)
        # output_length x batch_size x output_dim
        B_, garbage = self.GRU_B.apply(WA_, states=B)
        # batch_size x output_length x output_dim
        B_ = B_.swapaxes(0, 1)
        fc1_r = relu(self.fc1.apply(B_))
        fc2_r = relu(self.fc2.apply(fc1_r))
        return fc2_r
def example4():
    """LSTM demo (originally: crashes during LSTM initialisation).

    First drives a Blocks LSTM through a Linear gate projection and prints
    its output on a few constant inputs, then drives the same LSTM from
    explicit initial state/cell inputs.
    """
    seq = tensor.tensor3('x')
    dim = 3
    # Linear brick producing the 4 * dim gate pre-activations the LSTM wants.
    to_gates = Linear(input_dim=dim, output_dim=dim * 4, name="linear",
                      weights_init=initialization.Identity(),
                      biases_init=Constant(2))
    lstm = LSTM(dim=dim, activation=Tanh(),
                weights_init=IsotropicGaussian(),
                biases_init=Constant(0))
    to_gates.initialize()
    gate_vals = to_gates.apply(seq)
    lstm.initialize()
    hidden, cell_states = lstm.apply(gate_vals)
    print(lstm.parameters)

    run = theano.function([seq], hidden)
    ones_input = np.ones((dim, 1, dim), dtype=theano.config.floatX)
    print(run(ones_input))
    print(run(ones_input))
    print(run(4 * ones_input))
    print("Good Job!")

    # Second pass: supply the initial hidden state and cells explicitly.
    state0 = tensor.matrix('h0')
    cells0 = tensor.matrix('cells')
    hidden2, _cells2 = lstm.apply(inputs=seq, states=state0, cells=cells0)
    run2 = theano.function([seq, state0, cells0], hidden2)
    print("a")
    print(run2(np.ones((3, 1, 3), dtype=theano.config.floatX),
               np.ones((1, 3), dtype=theano.config.floatX),
               np.ones((1, 3), dtype=theano.config.floatX)))
def build_theano_functions(self) :
    """Compile training/evaluation functions for a one-layer LSTM whose
    scalar output is scored under a Gaussian mixture likelihood.

    Returns (gradf, f): gradf performs one fixed-learning-rate SGD update
    and returns the cost; f maps an input sequence to the output h.
    """
    #import pdb ; pdb.set_trace()
    x = T.fmatrix('x')
    s = T.fvector('s')    # per-mixture standard deviations
    mu = T.fvector('mu')  # per-mixture means
    mu = T.reshape(mu,(self.number_of_mix,1))
    pi = T.fvector('pi')  # mixture weights

    # NOTE(review): self.input_dim/4 is integer division on Python 2 but
    # float division on Python 3 -- confirm the intended interpreter.
    lstm = LSTM(
        dim=self.input_dim/4,
        weights_init=IsotropicGaussian(0.5),
        biases_init=Constant(1))
    lstm.initialize()
    h, c = lstm.apply(x)
    # NOTE(review): this triple indexing assumes h is 3D; the result is a
    # single element of the state tensor -- confirm this is intended.
    h = h[0][0][-1]

    # Gaussian mixture likelihood of h under (pi, mu, s).
    LL = T.sum(pi*(1./(T.sqrt(2.*np.pi)*s))*T.exp(
        -0.5*(h-mu)**2/T.reshape(s,(self.number_of_mix,1))**2.).sum(axis=1))
    cost = -T.log(LL)

    #cg = ComputationGraph(cost)
    #self.cg = cg
    #parameters = cg.parameters
    model = Model(cost)
    self.model = model
    parameters = model.parameters

    # Hand-rolled SGD with fixed learning rate self.lr.
    grads = T.grad(cost, parameters)
    updates = []
    for i in range(len(grads)) :
        updates.append(tuple([parameters[i], parameters[i] - self.lr*grads[i]]))

    gradf = theano.function([x,s,mu,pi],[cost],updates=updates)
    f = theano.function([x],[h])
    return gradf, f
def build_theano_functions(self):
    """Compile training/evaluation functions for a one-layer LSTM whose
    scalar output is scored under a Gaussian mixture likelihood.

    Returns (gradf, f): gradf performs one fixed-learning-rate SGD update
    and returns the cost; f maps an input sequence to the output h.
    """
    #import pdb ; pdb.set_trace()
    x = T.fmatrix('x')
    s = T.fvector('s')    # per-mixture standard deviations
    mu = T.fvector('mu')  # per-mixture means
    mu = T.reshape(mu, (self.number_of_mix, 1))
    pi = T.fvector('pi')  # mixture weights

    # NOTE(review): self.input_dim / 4 is integer division on Python 2 but
    # float division on Python 3 -- confirm the intended interpreter.
    lstm = LSTM(dim=self.input_dim / 4,
                weights_init=IsotropicGaussian(0.5),
                biases_init=Constant(1))
    lstm.initialize()
    h, c = lstm.apply(x)
    # NOTE(review): this triple indexing assumes h is 3D; the result is a
    # single element of the state tensor -- confirm this is intended.
    h = h[0][0][-1]

    # Gaussian mixture likelihood of h under (pi, mu, s).
    LL = T.sum(pi*(1./(T.sqrt(2.*np.pi)*s))*T.exp(\
        -0.5*(h-mu)**2/T.reshape(s,(self.number_of_mix,1))**2.).sum(axis=1))
    cost = -T.log(LL)

    #cg = ComputationGraph(cost)
    #self.cg = cg
    #parameters = cg.parameters
    model = Model(cost)
    self.model = model
    parameters = model.parameters

    # Hand-rolled SGD with fixed learning rate self.lr.
    grads = T.grad(cost, parameters)
    updates = []
    for i in range(len(grads)):
        updates.append(
            tuple([parameters[i], parameters[i] - self.lr * grads[i]]))

    gradf = theano.function([x, s, mu, pi], [cost], updates=updates)
    f = theano.function([x], [h])
    return gradf, f
def build_theano_functions(self, data_mean, data_std) :
    """Build a one-layer LSTM GMM graph (debugging version).

    NOTE(review): this function returns `test_expo` early, so everything
    after that return -- the likelihood, the SGD updates and the final
    `return gradf, f` -- is unreachable dead code (debugging leftover).
    """
    x = T.ftensor3('x') # shape of input : batch X time X value
    y = T.ftensor3('y')

    # before the cell, input, forget and output gates, x needs to
    # be transformed
    linear_transforms = []
    for transform in ['c','i','f','o'] :
        linear_transforms.append(
            Linear(self.input_dim,
                   self.lstm_dim,
                   weights_init=Uniform(mean=data_mean, std=data_std),
                   #weights_init=IsotropicGaussian(mean=1.,std=1),
                   biases_init=Constant(data_mean),
                   name=transform+"_transform"))

    for transform in linear_transforms :
        transform.initialize()

    linear_applications = []
    for transform in linear_transforms :
        linear_applications.append(transform.apply(x))

    # concatenated per-gate projections, 4 * lstm_dim wide on the last axis
    lstm_input = T.concatenate(linear_applications, axis=2)

    # the lstm wants batch X time X value
    lstm = LSTM(
        dim=self.lstm_dim,
        weights_init=IsotropicGaussian(mean=0.5,std=1),
        biases_init=Constant(1))
    lstm.initialize()
    h, _dummy = lstm.apply(lstm_input)

    # this is where Alex Graves' paper starts
    output_transform = Linear(self.lstm_dim,
                              self.output_dim,
                              #weights_init=Uniform(mean=data_mean, std=data_std),
                              weights_init=IsotropicGaussian(mean=0., std=1),
                              biases_init=Constant(1),
                              name="output_transform")
    output_transform.initialize()
    y_hat = output_transform.apply(h)

    # transforms to find each gmm params (mu, pi, sig)
    #pis = NDimensionalSoftmax.apply(y_hat[:,:,0:self.gmm_dim])
    # small hack to softmax a 3D tensor
    pis = T.reshape(
        T.nnet.softmax(
            T.reshape(y_hat[:,:,0:self.gmm_dim], (self.time_dim*self.batch_dim, self.gmm_dim))),
        (self.batch_dim, self.time_dim, self.gmm_dim))
    #sig = T.exp(y_hat[:,:,self.gmm_dim:self.gmm_dim*2])
    sig = T.nnet.relu(y_hat[:,:,self.gmm_dim:self.gmm_dim*2])+0.1
    mus = y_hat[:,:,self.gmm_dim*2:]

    # add trailing/leading axes so mixtures broadcast against target time
    pis = pis[:,:,:,np.newaxis]
    mus = mus[:,:,:,np.newaxis]
    sig = sig[:,:,:,np.newaxis]
    y = y[:,:,np.newaxis,:]

    #sig=theano.printing.Print()(sig)
    # sum likelihood with targets
    # sum inside log across mixtures, sum outside log across time
    #LL = -T.log((pis*(1./(T.sqrt(2.*np.pi)*sig))*T.exp(-0.5*((y-mus)**2)/sig**2)).sum(axis=2)).sum()

    expo = T.exp(-0.5*((y-mus)**2)/sig**2)
    test_expo = theano.function([x,y],[expo, mus, sig])
    return test_expo  # NOTE(review): early return -- code below never runs

    coeff = pis*(1./(T.sqrt(2.*np.pi)*sig))
    inside_log = (coeff*expo).sum(axis=2)
    LL = -(T.log(inside_log)).sum()

    model = Model(LL)
    self.model = model
    parameters = model.parameters
    grads = T.grad(LL, parameters)
    updates = []
    for i in range(len(grads)) :
        updates.append(tuple([parameters[i], parameters[i] - self.lr*grads[i]]))

    #gradf = theano.function([x, y],[LL],updates=updates, mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))
    gradf = theano.function([x, y],[LL],updates=updates)
    f = theano.function([x],[pis, sig, mus])
    return gradf, f
def create_model(self):
    """Build the cost graph: embedding lookup -> LSTM -> sigmoid MLP, then a
    class-prior-weighted cost over the batch.

    Returns the symbolic cost (named "cost").
    """
    input_dim = self.input_dim
    x = self.x
    y = self.y
    p = self.p          # prior probability of the positive class
    mask = self.mask
    hidden_dim = self.hidden_dim
    embedding_dim = self.embedding_dim
    lookup = LookupTable(self.dict_size, embedding_dim,
                         weights_init=IsotropicGaussian(0.001),
                         name='LookupTable')
    # The LSTM brick consumes gate inputs of width 4 * hidden_dim.
    x_to_h = Linear(embedding_dim, hidden_dim * 4, name='x_to_h',
                    weights_init=IsotropicGaussian(0.001),
                    biases_init=Constant(0.0))
    lstm = LSTM(hidden_dim, name='lstm',
                weights_init=IsotropicGaussian(0.001),
                biases_init=Constant(0.0))
    h_to_o = MLP([Logistic()], [hidden_dim, 1],
                 weights_init=IsotropicGaussian(0.001),
                 biases_init=Constant(0),
                 name='h_to_o')
    lookup.initialize()
    x_to_h.initialize()
    lstm.initialize()
    h_to_o.initialize()
    embed = lookup.apply(x).reshape(
        (x.shape[0], x.shape[1], self.embedding_dim))
    embed.name = "embed_vec"
    # Transpose to time x batch x features, the layout the LSTM expects.
    x_transform = x_to_h.apply(embed.transpose(1, 0, 2))
    x_transform.name = "Transformed X"
    self.lookup = lookup
    self.x_to_h = x_to_h
    self.lstm = lstm
    self.h_to_o = h_to_o
    # Mask handling currently disabled:
    #if mask is None:
    h, c = lstm.apply(x_transform)
    #else:
    #h, c = lstm.apply(x_transform, mask=mask)
    h.name = "hidden_state"
    c.name = "cell state"
    # only values of hidden units of the last timeframe are used for
    # the classification
    indices = T.sum(mask, axis=0) - 1  # per-example index of last valid step
    rel_hid = h[indices, T.arange(h.shape[1])]
    out = self.h_to_o.apply(rel_hid)
    probs = 1 - out
    probs.name = "probability"
    y = y.dimshuffle(0, 'x')
    # Create the if-else cost function: positives weighted by 1/p,
    # negatives by 1/(1-p).
    pos_ex = (y * probs) / p
    neg_ex = (1 - y) * (1 - probs) / np.float32(1 - p)
    reward = pos_ex + neg_ex
    # NOTE(review): the original comment called this the "negative of
    # reward", but no negation is applied -- confirm the intended sign.
    cost = reward
    cost.name = "cost"
    return cost
def build_theano_functions(self) :
    """Build a stacked-LSTM Gaussian model with hand-rolled SGD updates.

    Returns (gradf, f): gradf takes (x, y, lr) and performs one SGD step at
    the caller-supplied learning rate; f maps inputs to (sig, mus).
    """
    x = T.ftensor3('x') # shape of input : batch X time X value
    y = T.ftensor3('y')
    # NOTE(review): z is declared but only used in the commented-out zLL
    # experiment below.
    z = T.ftensor3('z')

    layers_input = [x]
    dims = np.array([self.input_dim])
    for dim in self.lstm_layers_dim :
        dims = np.append(dims, dim)
    print "Dimensions =", dims

    # layer is just an index of the layer
    for layer in range(len(self.lstm_layers_dim)) :

        # before the cell, input, forget and output gates, x needs to
        # be transformed
        linear = Linear(dims[layer], dims[layer+1]*4,
                        #weights_init=Uniform(mean=data_mean, std=1),
                        weights_init=IsotropicGaussian(mean=1.,std=1),
                        biases_init=Constant(0),
                        name="linear"+str(layer))
        linear.initialize()
        lstm_input = linear.apply(layers_input[layer])

        # the lstm wants batch X time X value
        lstm = LSTM(
            dim=dims[layer+1],
            weights_init=IsotropicGaussian(mean=0.,std=0.5),
            biases_init=Constant(1),
            name="lstm"+str(layer))
        lstm.initialize()
        # hack to use Orthogonal on lstm w_state
        lstm.W_state.set_value(Orthogonal().generate(np.random, lstm.W_state.get_value().shape))
        h, _dummy = lstm.apply(lstm_input)
        layers_input.append(h)

    # the idea is to have one gaussian parametrize every frequency bin
    print "Last linear transform dim :", dims[1:].sum()
    output_transform = Linear(dims[1:].sum(),
                              self.output_dim,
                              weights_init=IsotropicGaussian(mean=0., std=1),
                              biases_init=Constant(0),
                              #use_bias=False,
                              name="output_transform")
    output_transform.initialize()
    if len(self.lstm_layers_dim) == 1 :
        print "hallo there, only one layer speaking"
        y_hat = output_transform.apply(layers_input[-1])
    else :
        # skip connections: every LSTM layer feeds the output transform
        y_hat = output_transform.apply(T.concatenate(layers_input[1:], axis=2))

    # first half of the output: std deviations, second half: means
    sig = T.nnet.relu(y_hat[:,:,:self.output_dim/2])+0.05
    mus = y_hat[:,:,self.output_dim/2:]

    # sum likelihood with targets
    # sum inside log across mixtures, sum outside log across time
    inside_expo = -0.5*((y-mus)**2)/sig**2
    expo = T.exp(inside_expo)
    coeff = 1./(T.sqrt(2.*np.pi)*sig)
    inside_log = T.log(coeff*expo)
    # log-sum-exp trick across the mixture axis for numerical stability
    inside_log_max = T.max(inside_log, axis=2, keepdims=True)
    LL = -(inside_log_max +
           T.log(T.sum(T.exp(inside_log - inside_log_max), axis=2, keepdims=True))).sum()

    #zinside_expo = -0.5*((z-mus)**2)/sig**2
    #zexpo = T.exp(zinside_expo)
    #zcoeff = pis*(1./(T.sqrt(2.*np.pi)*sig))
    #zinside_log = (zcoeff*zexpo).sum(axis=2)
    #zLL = -(T.log(zinside_log)).sum()

    model = Model(LL)
    self.model = model
    parameters = model.parameters

    # Hand-rolled SGD; lr is a symbolic input so it can be annealed.
    grads = T.grad(LL, parameters)
    updates = []
    lr = T.scalar('lr')
    for i in range(len(grads)) :
        #updates.append(tuple([parameters[i], parameters[i] - self.lr*grads[i]]))
        updates.append(tuple([parameters[i], parameters[i] - lr*grads[i]]))

    #gradf = theano.function([x, y],[LL],updates=updates, mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))
    if self.debug :
        gradf = theano.function([x, y, lr],[LL, mus, sig],updates=updates)
    else :
        #gradf = theano.function([x, y, z],[zLL],updates=updates)
        gradf = theano.function([x, y, lr],[LL],updates=updates)
    f = theano.function([x],[sig, mus])
    return gradf, f
def main(max_seq_length, lstm_dim, batch_size, num_batches, num_epochs):
    """Train a one-layer LSTM binary classifier on generated sequences.

    Builds a Linear -> LSTM -> Linear -> Logistic graph, trains it with
    Adam while monitoring the cost on a held-out stream of 100 batches,
    prints the learned parameters, and returns the Blocks MainLoop.
    """
    stream_train = DataStream(dataset=IterableDataset(
        generate_data(max_seq_length, batch_size, num_batches)))
    stream_test = DataStream(dataset=IterableDataset(
        generate_data(max_seq_length, batch_size, 100)))

    inputs = T.tensor3('x')
    targets = T.matrix('y')

    # The LSTM brick expects pre-computed gate inputs of size 4 * lstm_dim;
    # see the Blocks LSTM documentation for the explanation.
    in_to_gates = Linear(1, lstm_dim * 4, name='x_to_h',
                         weights_init=IsotropicGaussian(),
                         biases_init=Constant(0.0))
    recurrent = LSTM(lstm_dim, name='lstm',
                     weights_init=IsotropicGaussian(),
                     biases_init=Constant(0.0))
    readout = Linear(lstm_dim, 1, name='h_to_o',
                     weights_init=IsotropicGaussian(),
                     biases_init=Constant(0.0))

    states, _cells = recurrent.apply(in_to_gates.apply(inputs))
    # Only the hidden state of the final time step feeds the classifier.
    prediction = Logistic().apply(readout.apply(states[-1]))
    cost = BinaryCrossEntropy().apply(targets, prediction)
    cost.name = 'cost'

    for brick in (recurrent, in_to_gates, readout):
        brick.initialize()

    graph = ComputationGraph(cost)
    trainer = GradientDescent(cost=cost, parameters=graph.parameters,
                              step_rule=Adam())
    extensions = [
        DataStreamMonitoring(variables=[cost], data_stream=stream_test,
                             prefix="test"),
        TrainingDataMonitoring(variables=[cost], prefix="train",
                               after_epoch=True),
        FinishAfter(after_n_epochs=num_epochs),
        Printing(),
        ProgressBar(),
    ]
    loop = MainLoop(trainer, stream_train, extensions=extensions)
    loop.run()

    print('Learned weights:')
    for brick in (in_to_gates, recurrent, readout):
        print("Layer '%s':" % brick.name)
        for param in brick.parameters:
            print(param.name, ': ', param.get_value())
        print()
    return loop
def build_theano_functions(self, data_mean, data_std):
    """Build a one-layer LSTM GMM graph (debugging version).

    NOTE(review): this function returns `test_expo` early, so everything
    after that return -- the likelihood, the SGD updates and the final
    `return gradf, f` -- is unreachable dead code (debugging leftover).
    """
    x = T.ftensor3('x')  # shape of input : batch X time X value
    y = T.ftensor3('y')

    # before the cell, input, forget and output gates, x needs to
    # be transformed
    linear_transforms = []
    for transform in ['c', 'i', 'f', 'o']:
        linear_transforms.append(
            Linear(
                self.input_dim,
                self.lstm_dim,
                weights_init=Uniform(mean=data_mean, std=data_std),
                #weights_init=IsotropicGaussian(mean=1.,std=1),
                biases_init=Constant(data_mean),
                name=transform + "_transform"))

    for transform in linear_transforms:
        transform.initialize()

    linear_applications = []
    for transform in linear_transforms:
        linear_applications.append(transform.apply(x))

    # concatenated per-gate projections, 4 * lstm_dim wide on the last axis
    lstm_input = T.concatenate(linear_applications, axis=2)

    # the lstm wants batch X time X value
    lstm = LSTM(dim=self.lstm_dim,
                weights_init=IsotropicGaussian(mean=0.5, std=1),
                biases_init=Constant(1))
    lstm.initialize()
    h, _dummy = lstm.apply(lstm_input)

    # this is where Alex Graves' paper starts
    output_transform = Linear(
        self.lstm_dim,
        self.output_dim,
        #weights_init=Uniform(mean=data_mean, std=data_std),
        weights_init=IsotropicGaussian(mean=0., std=1),
        biases_init=Constant(1),
        name="output_transform")
    output_transform.initialize()
    y_hat = output_transform.apply(h)

    # transforms to find each gmm params (mu, pi, sig)
    #pis = NDimensionalSoftmax.apply(y_hat[:,:,0:self.gmm_dim])
    # small hack to softmax a 3D tensor
    pis = T.reshape(
        T.nnet.softmax(
            T.reshape(y_hat[:, :, 0:self.gmm_dim],
                      (self.time_dim * self.batch_dim, self.gmm_dim))),
        (self.batch_dim, self.time_dim, self.gmm_dim))
    #sig = T.exp(y_hat[:,:,self.gmm_dim:self.gmm_dim*2])
    sig = T.nnet.relu(y_hat[:, :, self.gmm_dim:self.gmm_dim * 2]) + 0.1
    mus = y_hat[:, :, self.gmm_dim * 2:]

    # add trailing/leading axes so mixtures broadcast against target time
    pis = pis[:, :, :, np.newaxis]
    mus = mus[:, :, :, np.newaxis]
    sig = sig[:, :, :, np.newaxis]
    y = y[:, :, np.newaxis, :]

    #sig=theano.printing.Print()(sig)
    # sum likelihood with targets
    # sum inside log across mixtures, sum outside log across time
    #LL = -T.log((pis*(1./(T.sqrt(2.*np.pi)*sig))*T.exp(-0.5*((y-mus)**2)/sig**2)).sum(axis=2)).sum()

    expo = T.exp(-0.5 * ((y - mus)**2) / sig**2)
    test_expo = theano.function([x, y], [expo, mus, sig])
    return test_expo  # NOTE(review): early return -- code below never runs

    coeff = pis * (1. / (T.sqrt(2. * np.pi) * sig))
    inside_log = (coeff * expo).sum(axis=2)
    LL = -(T.log(inside_log)).sum()

    model = Model(LL)
    self.model = model
    parameters = model.parameters
    grads = T.grad(LL, parameters)
    updates = []
    for i in range(len(grads)):
        updates.append(
            tuple([parameters[i], parameters[i] - self.lr * grads[i]]))

    #gradf = theano.function([x, y],[LL],updates=updates, mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))
    gradf = theano.function([x, y], [LL], updates=updates)
    f = theano.function([x], [pis, sig, mus])
    return gradf, f
class iwLayer:
    """Attention-based "re-reader" layer: a scan over video frames updates a
    recurrent attention state over the query words, then an LSTM re-encodes
    the resulting state sequence.
    """

    def __init__(self, feature_dim, hidden_dim, output_dim):
        # Per-frame and per-word projections into the attention space.
        self.image_embed = Linear(input_dim=feature_dim,
                                  output_dim=hidden_dim,
                                  weights_init=IsotropicGaussian(0.01),
                                  biases_init=Constant(0),
                                  use_bias=False,
                                  name='iw_image_embed')
        self.word_embed = Linear(input_dim=feature_dim,
                                 output_dim=hidden_dim,
                                 weights_init=IsotropicGaussian(0.01),
                                 biases_init=Constant(0),
                                 use_bias=False,
                                 name='iw_word_embed')
        self.r_embed = Linear(input_dim=feature_dim,
                              output_dim=hidden_dim,
                              weights_init=IsotropicGaussian(0.01),
                              biases_init=Constant(0),
                              use_bias=False,
                              name='iw_r_embed')
        # Collapses the attention space to one score per query word.
        self.m_to_s = Linear(input_dim=hidden_dim,
                             output_dim=1,
                             weights_init=IsotropicGaussian(0.01),
                             biases_init=Constant(0),
                             use_bias=False,
                             name='iw_m_to_s')
        # NOTE(review): 'iw_attetion' looks like a typo for 'iw_attention',
        # but it is a runtime brick name -- left untouched.
        self.attention_dist = Softmax(name='iw_attetion')
        self.r_to_r = Linear(input_dim=feature_dim,
                             output_dim=feature_dim,
                             weights_init=IsotropicGaussian(0.01),
                             biases_init=Constant(0),
                             use_bias=False,
                             name='iw_r_to_r')
        # self.r_to_g = Linear(input_dim=feature_dim,
        #                      output_dim=output_dim,
        #                      weights_init=IsotropicGaussian(0.01),
        #                      biases_init=Constant(0),
        #                      use_bias=False,
        #                      name='iw_r_to_g')
        self.image_embed.initialize()
        self.word_embed.initialize()
        self.r_embed.initialize()
        self.m_to_s.initialize()
        self.r_to_r.initialize()
        # self.r_to_g.initialize()
        # LSTM that re-reads the sequence of attention states r.
        # NOTE(review): self.seq has dim=feature_dim while seq_embed emits
        # output_dim * 4 gate inputs; the Blocks LSTM expects 4 * dim-wide
        # inputs, so these only match when output_dim == feature_dim --
        # confirm against callers.
        self.seq = LSTM(feature_dim,
                        name='rereader_seq',
                        weights_init=IsotropicGaussian(0.01),
                        biases_init=Constant(0))
        self.seq_embed = Linear(feature_dim, output_dim * 4,
                                name='rereader_seq_embed',
                                weights_init=IsotropicGaussian(0.01),
                                biases_init=Constant(0),
                                use_bias=False)
        self.seq.initialize()
        self.seq_embed.initialize()

    # video: batch_size x video_length x feature_dim
    # query: batch_size x q x feature_dim
    # mask: this mask is different from other masks
    #       batch_size x q
    # eg.
    # -10000 == -np.Inf
    # 1: 0, 0, 0, 0, 0, -10000, -10000, -10000
    # 2: 0, 0, 0, 0, -10000, -10000, -10000
    # 3: 0, 0, 0, 0, 0, 0, 0, -10000
    def apply(self, video, query, mask, batch_size):
        # batch_size x q x hidden_dim
        att1 = self.word_embed.apply(query)

        def one_step(y_d_i, r_1, y_q, y_q_m):
            """One scan step: attend over query words given frame y_d_i and
            previous attention state r_1, and emit the new state."""
            # batch_size x hidden_dim
            att2 = self.r_embed.apply(r_1)
            att3 = self.image_embed.apply(y_d_i)
            att = y_q_m + att2.dimshuffle(0, 'x', 1) + att3.dimshuffle(0, 'x', 1)
            # batch_size x q x hidden_dim
            m = T.tanh(att)
            # batch_size x q
            s = self.m_to_s.apply(m)
            s = s.reshape((s.shape[0], s.shape[1]))
            # ignore the question padding 0s (additive -10000 mask)
            s = s + mask
            s = self.attention_dist.apply(s)
            y_q_s = y_q.swapaxes(1, 2)
            # attention-weighted query sum plus the recurrent update of r
            return T.batched_dot(y_q_s, s) + T.tanh(self.r_to_r.apply(r_1))

        # r: video_length x batch_size x feature_dim
        r, updates = theano.scan(fn=one_step,
                                 sequences=[video.swapaxes(0, 1)],
                                 outputs_info=T.zeros_like(video[:, 0, :]),
                                 non_sequences=[query, att1],
                                 n_steps=video.shape[1],
                                 name='iw layer')
        # video_length x batch_size x output_dim
        Wr = self.seq_embed.apply(r)
        seq_r, garbage = self.seq.apply(Wr)
        # batch_size x feature_dim
        r_V = r[-1, :, :]
        # batch_size x output_dim
        seq_r_V = seq_r[-1, :, :]
        return r_V, seq_r_V
def __init__(self, input1_size, input2_size, lookup1_dim=200, lookup2_dim=200, hidden_size=512):
    """Build a two-input (durations + syllables) LSTM model predicting
    pitches through a softmax cross-entropy cost.
    """
    self.hidden_size = hidden_size
    self.input1_size = input1_size
    self.input2_size = input2_size
    self.lookup1_dim = lookup1_dim
    self.lookup2_dim = lookup2_dim

    # Symbolic integer matrices for the two input streams and the targets.
    x1 = tensor.lmatrix('durations')
    x2 = tensor.lmatrix('syllables')
    y = tensor.lmatrix('pitches')

    # One embedding table per input stream.
    lookup1 = LookupTable(dim=self.lookup1_dim, length=self.input1_size,
                          name='lookup1',
                          weights_init=initialization.Uniform(width=0.01),
                          biases_init=Constant(0))
    lookup1.initialize()
    lookup2 = LookupTable(dim=self.lookup2_dim, length=self.input2_size,
                          name='lookup2',
                          weights_init=initialization.Uniform(width=0.01),
                          biases_init=Constant(0))
    lookup2.initialize()
    # Merge fuses the two embeddings into a single hidden_size-wide input.
    merge = Merge(['lookup1', 'lookup2'],
                  [self.lookup1_dim, self.lookup2_dim],
                  self.hidden_size,
                  weights_init=initialization.Uniform(width=0.01),
                  biases_init=Constant(0))
    merge.initialize()
    # NOTE(review): the Blocks LSTM brick expects gate inputs of width
    # 4 * hidden_size, but `merge` outputs hidden_size, and LSTM.apply
    # returns (states, cells) rather than a single tensor -- confirm this
    # graph actually builds as intended.
    recurrent_block = LSTM(
        dim=self.hidden_size,
        activation=Tanh(),
        weights_init=initialization.Uniform(width=0.01)
    )  #RecurrentStack([LSTM(dim=self.hidden_size, activation=Tanh())] * 3)
    recurrent_block.initialize()
    linear = Linear(input_dim=self.hidden_size,
                    output_dim=self.input1_size,
                    weights_init=initialization.Uniform(width=0.01),
                    biases_init=Constant(0))
    linear.initialize()
    softmax = NDimensionalSoftmax()

    l1 = lookup1.apply(x1)
    l2 = lookup2.apply(x2)
    m = merge.apply(l1, l2)
    h = recurrent_block.apply(m)
    a = linear.apply(h)
    y_hat = softmax.apply(a, extra_ndim=1)
    # ValueError: x must be 1-d or 2-d tensor of floats. Got TensorType(float64, 3D)
    self.Cost = softmax.categorical_cross_entropy(y, a, extra_ndim=1).mean()
    self.ComputationGraph = ComputationGraph(self.Cost)
    self.Model = Model(y_hat)
def build_theano_functions(self):
    """Build a stacked-LSTM GMM model over time sequences, optionally
    concatenating convolutional spectrogram features to the input.

    Returns (algorithm, f): a GradientDescent/Adam training algorithm over
    the summed likelihood, and a function mapping inputs to (pis, sig, mus).
    """
    x = T.fmatrix('time_sequence')
    x = x.reshape((self.batch_dim, self.sequence_dim, self.time_dim))

    # Next-step prediction: y is x shifted one step left in the sequence.
    y = x[:, 1:self.sequence_dim, :]
    x = x[:, :self.sequence_dim - 1, :]

    # if we try to include the spectrogram features
    spec_dims = 0
    if self.image_size is not None:
        print "Convolution activated"
        self.init_conv()
        spec = T.ftensor4('spectrogram')
        spec_features, spec_dims = self.conv.build_conv_layers(spec)
        print "Conv final dims =", spec_dims
        spec_dims = np.prod(spec_dims)
        spec_features = spec_features.reshape(
            (self.batch_dim, self.sequence_dim - 1, spec_dims))
        x = T.concatenate([x, spec_features], axis=2)

    layers_input = [x]
    dims = np.array([self.time_dim + spec_dims])
    for dim in self.lstm_layers_dim:
        dims = np.append(dims, dim)
    print "Dimensions =", dims

    # layer is just an index of the layer
    for layer in range(len(self.lstm_layers_dim)):
        # before the cell, input, forget and output gates, x needs to
        # be transformed
        linear = Linear(dims[layer], dims[layer + 1] * 4,
                        weights_init=Orthogonal(self.orth_scale),
                        biases_init=Constant(0),
                        name="linear" + str(layer))
        linear.initialize()
        lstm_input = linear.apply(layers_input[layer])

        # the lstm wants batch X sequence X time
        lstm = LSTM(dim=dims[layer + 1],
                    weights_init=IsotropicGaussian(mean=0., std=0.5),
                    biases_init=Constant(1),
                    name="lstm" + str(layer))
        lstm.initialize()
        # hack to use Orthogonal on lstm w_state
        lstm.W_state.set_value(
            self.orth_scale * Orthogonal().generate(
                np.random, lstm.W_state.get_value().shape))
        h, _dummy = lstm.apply(lstm_input)
        layers_input.append(h)

    # this is where Alex Graves' paper starts
    print "Last linear transform dim :", dims[1:].sum()
    output_transform = Linear(dims[1:].sum(),
                              self.output_dim,
                              weights_init=Orthogonal(self.orth_scale),
                              use_bias=False,
                              name="output_transform")
    output_transform.initialize()
    if len(self.lstm_layers_dim) == 1:
        print "hallo there, only one layer speaking"
        y_hat = output_transform.apply(layers_input[-1])
    else:
        # skip connections: every LSTM layer feeds the output transform
        y_hat = output_transform.apply(
            T.concatenate(layers_input[1:], axis=2))

    # transforms to find each gmm params (mu, pi, sig)
    # small hack to softmax a 3D tensor
    pis = T.reshape(
        T.nnet.softmax(
            T.reshape(
                y_hat[:, :, :self.gmm_dim],
                ((self.sequence_dim - 1) * self.batch_dim, self.gmm_dim))),
        (self.batch_dim, (self.sequence_dim - 1), self.gmm_dim))
    sig = T.exp(y_hat[:, :, self.gmm_dim:self.gmm_dim * 2]) + 1e-6
    mus = y_hat[:, :, self.gmm_dim * 2:]

    # add axes so mixtures broadcast against the target time axis
    pis = pis[:, :, :, np.newaxis]
    mus = mus[:, :, :, np.newaxis]
    sig = sig[:, :, :, np.newaxis]
    y = y[:, :, np.newaxis, :]

    y = T.patternbroadcast(y, (False, False, True, False))
    mus = T.patternbroadcast(mus, (False, False, False, True))
    sig = T.patternbroadcast(sig, (False, False, False, True))

    # sum likelihood with targets
    # see blog for this crazy Pr() = sum log sum prod
    # axes :: (batch, sequence, mixture, time)
    expo_term = -0.5 * ((y - mus)**2) / sig**2
    coeff = T.log(T.maximum(1. / (T.sqrt(2. * np.pi) * sig), EPS))
    #coeff = T.log(1./(T.sqrt(2.*np.pi)*sig))
    sequences = coeff + expo_term
    log_sequences = T.log(pis + EPS) + T.sum(
        sequences, axis=3, keepdims=True)

    # log-sum-exp trick across the mixture axis for numerical stability
    log_sequences_max = T.max(log_sequences, axis=2, keepdims=True)

    LL = -(log_sequences_max + T.log(EPS + T.sum(
        T.exp(log_sequences - log_sequences_max), axis=2, keepdims=True))
           ).mean()
    LL.name = "summed_likelihood"

    model = Model(LL)
    self.model = model
    parameters = model.parameters

    algorithm = GradientDescent(cost=LL,
                                parameters=model.parameters,
                                step_rule=Adam())
    # NOTE(review): f is compiled against the rebound (sliced/concatenated)
    # x rather than the root 'time_sequence' input -- confirm this compiles.
    f = theano.function([x], [pis, sig, mus])
    return algorithm, f
def build_theano_functions(self) :
    """Build manual-SGD training and sampling Theano functions.

    Stacked-LSTM mixture-density network: predicts GMM parameters
    (pis, sig, mus) for each step of the sequence. Returns
    ``(gradf, f)`` where ``gradf`` performs one SGD step (learning
    rate passed at call time) and ``f`` maps the input sequence to
    the mixture parameters.

    NOTE(review): assumes x is (batch, time, value) and y is rank-4
    (batch, sequence, mixture-broadcast, time) — confirm with caller.
    """
    x = T.ftensor3('x') # shape of input : batch X time X value
    y = T.ftensor4('y')

    layers_input = [x]
    dims =np.array([self.time_dim])
    for dim in self.lstm_layers_dim :
        dims = np.append(dims, dim)
    print "Dimensions =", dims

    # layer is just an index of the layer
    for layer in range(len(self.lstm_layers_dim)) :

        # before the cell, input, forget and output gates, x needs to
        # be transformed
        linear = Linear(dims[layer], dims[layer+1]*4,
                        weights_init=Orthogonal(self.orth_scale),
                        #weights_init=IsotropicGaussian(mean=1.,std=1),
                        biases_init=Constant(0),
                        name="linear"+str(layer))
        linear.initialize()
        lstm_input = linear.apply(layers_input[layer])

        # the lstm wants batch X time X value
        lstm = LSTM(
            dim=dims[layer+1],
            weights_init=IsotropicGaussian(mean=0.,std=0.5),
            biases_init=Constant(1),
            name="lstm"+str(layer))
        lstm.initialize()
        # hack to use Orthogonal on lstm w_state
        lstm.W_state.set_value(
            self.orth_scale*Orthogonal().generate(np.random,
                lstm.W_state.get_value().shape))

        h, _dummy = lstm.apply(lstm_input)
        layers_input.append(h)

    # this is where Alex Graves' paper starts: every hidden layer is
    # concatenated into the output transform (skip connections).
    print "Last linear transform dim :", dims[1:].sum()
    output_transform = Linear(dims[1:].sum(), self.output_dim,
                              weights_init=Orthogonal(self.orth_scale),
                              #weights_init=IsotropicGaussian(mean=0., std=1),
                              use_bias=False,
                              name="output_transform")
    output_transform.initialize()
    if len(self.lstm_layers_dim) == 1 :
        print "hallo there, only one layer speaking"
        y_hat = output_transform.apply(layers_input[-1])
    else :
        y_hat = output_transform.apply(T.concatenate(layers_input[1:], axis=2))

    # transforms to find each gmm params (mu, pi, sig)
    # small hack to softmax a 3D tensor
    #pis = T.reshape(
    #   T.nnet.softmax(
    #       T.nnet.sigmoid(
    #           T.reshape(y_hat[:,:,0:self.gmm_dim], (self.time_dim*self.batch_dim, self.gmm_dim)))),
    #   (self.batch_dim, self.time_dim, self.gmm_dim))
    pis = T.reshape(
        T.nnet.softmax(
            T.reshape(y_hat[:,:,:self.gmm_dim],
                      (self.sequence_dim*self.batch_dim, self.gmm_dim))),
        (self.batch_dim, self.sequence_dim, self.gmm_dim))
    # exp keeps sigma positive; +1e-6 guards the division below
    sig = T.exp(y_hat[:,:,self.gmm_dim:self.gmm_dim*2])+1e-6
    #sig = T.nnet.relu(y_hat[:,:,self.gmm_dim:self.gmm_dim*2])+0.1
    #mus = 2.*T.tanh(y_hat[:,:,self.gmm_dim*2:])
    mus = y_hat[:,:,self.gmm_dim*2:]

    # lift everything to rank 4 :: (batch, sequence, mixture, time)
    pis = pis[:,:,:,np.newaxis]
    mus = mus[:,:,:,np.newaxis]
    sig = sig[:,:,:,np.newaxis]
    #y = y[:,:,np.newaxis,:]

    y = T.patternbroadcast(y, (False, False, True, False))
    mus = T.patternbroadcast(mus, (False, False, False, True))
    sig = T.patternbroadcast(sig, (False, False, False, True))

    # sum likelihood with targets
    # see blog for this crazy Pr() = sum log sum prod
    # axes :: (batch, sequence, mixture, time)
    expo_term = -0.5*((y-mus)**2)/sig**2
    coeff = T.log(T.maximum(1./(T.sqrt(2.*np.pi)*sig), EPS))
    #coeff = T.log(1./(T.sqrt(2.*np.pi)*sig))
    sequences = coeff + expo_term
    log_sequences = T.log(pis + EPS) + T.sum(sequences, axis=3, keepdims=True)

    # log-sum-exp over the mixture axis for numerical stability
    log_sequences_max = T.max(log_sequences, axis=2, keepdims=True)

    LL = -(log_sequences_max + T.log(EPS + T.sum(T.exp(log_sequences - log_sequences_max), axis=2, keepdims=True))).mean()

    model = Model(LL)
    self.model = model
    parameters = model.parameters
    grads = T.grad(LL, parameters)
    # Plain SGD updates; the learning rate is a symbolic input so it
    # can be annealed from the training loop.
    updates = []
    lr = T.scalar('lr')
    for i in range(len(grads)) :
        #updates.append(tuple([parameters[i], parameters[i] - self.lr*grads[i]]))
        updates.append(tuple([parameters[i], parameters[i] - lr*grads[i]]))

    #gradf = theano.function([x, y],[LL],updates=updates, mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))
    if self.debug :
        # debug mode also returns the mixture parameters for inspection
        gradf = theano.function([x, y, lr],[LL, pis, mus, sig],updates=updates)
    else :
        #gradf = theano.function([x, y, z],[zLL],updates=updates)
        gradf = theano.function([x, y, lr],[LL],updates=updates)
    f = theano.function([x],[pis, sig, mus])
    return gradf, f
def main(num_epochs=100):
    """Train an LSTM sentiment classifier on IMDB with Blocks.

    Sorts the dataset by sequence length, builds a lookup-embedding ->
    LSTM -> linear -> sigmoid pipeline, trains with Adam + step
    clipping on an 80/10/10 train/valid/test split of the same
    dataset, and monitors cost/misclassification.
    """
    x = tensor.matrix('features')
    m = tensor.matrix('features_mask')
    y = tensor.imatrix('targets')
    x_int = x.astype(dtype='int32').T

    train_dataset = IMDB()
    # Sort all sources by the length of the 'features' sequences so
    # batches contain similarly-sized examples (less padding waste).
    idx_sort = numpy.argsort(
        [len(s) for s in train_dataset.indexables[
            train_dataset.sources.index('features')]]
    )
    n_voc = len(train_dataset.dict.keys())
    for idx in xrange(len(train_dataset.sources)):
        train_dataset.indexables[idx] = train_dataset.indexables[idx][idx_sort]

    n_h = 100
    # Embedding emits 4*n_h so it directly feeds the LSTM's four gates.
    linear_embedding = LookupTable(
        length=n_voc,
        dim=4 * n_h,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    linear_embedding.initialize()
    # NOTE(review): lstm_biases is built (forget-gate slice set to 4)
    # but never applied to the LSTM parameters — confirm whether it
    # should initialize rnn's biases.
    lstm_biases = numpy.zeros(4 * n_h).astype(dtype=theano.config.floatX)
    lstm_biases[n_h:(2 * n_h)] = 4.
    rnn = LSTM(
        dim=n_h,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    rnn.initialize()
    score_layer = Linear(
        input_dim=n_h,
        output_dim=1,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    score_layer.initialize()

    # Mask padded positions before the recurrence.
    embedding = linear_embedding.apply(x_int) * tensor.shape_padright(m.T)
    rnn_out = rnn.apply(embedding)
    # rnn_out[0] are the hidden states; take the last time step.
    rnn_out_mean_pooled = rnn_out[0][-1]

    probs = Sigmoid().apply(
        score_layer.apply(rnn_out_mean_pooled))

    # Binary cross-entropy written out explicitly.
    cost = - (y * tensor.log(probs)
              + (1 - y) * tensor.log(1 - probs)
              ).mean()
    cost.name = 'cost'

    misclassification = (y * (probs < 0.5)
                         + (1 - y) * (probs > 0.5)
                         ).mean()
    misclassification.name = 'misclassification'

    cg = ComputationGraph([cost])
    params = cg.parameters
    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule(
            components=[StepClipping(threshold=10.),
                        Adam()
                        ]
        )
    )

    # 80/10/10 split of the (length-sorted) training set.
    n_train = int(numpy.floor(.8 * train_dataset.num_examples))
    n_valid = int(numpy.floor(.1 * train_dataset.num_examples))
    train_data_stream = Padding(
        data_stream=DataStream(
            dataset=train_dataset,
            iteration_scheme=BatchwiseShuffledScheme(
                examples=range(n_train),
                batch_size=10,
            )
        ),
        mask_sources=('features',)
    )
    valid_data_stream = Padding(
        data_stream=DataStream(
            dataset=train_dataset,
            iteration_scheme=BatchwiseShuffledScheme(
                examples=range(n_train, n_train + n_valid),
                batch_size=10,
            )
        ),
        mask_sources=('features',)
    )
    test_data_stream = Padding(
        data_stream=DataStream(
            dataset=train_dataset,
            iteration_scheme=BatchwiseShuffledScheme(
                examples=range(n_train + n_valid, train_dataset.num_examples),
                batch_size=10,
            )
        ),
        mask_sources=('features',)
    )

    model = Model(cost)

    extensions = []
    extensions.append(Timing())
    extensions.append(FinishAfter(after_n_epochs=num_epochs))
    extensions.append(DataStreamMonitoring(
        [cost, misclassification], test_data_stream, prefix='test'))
    extensions.append(DataStreamMonitoring(
        [cost, misclassification], valid_data_stream, prefix='valid'))
    extensions.append(TrainingDataMonitoring(
        [cost, misclassification], prefix='train', after_epoch=True))

    plotters = []
    plotters.append(Plotter(
        channels=[['train_cost', 'train_misclassification',
                   'valid_cost', 'valid_misclassification']],
        titles=['Costs']))

    extensions.append(PlotManager('IMDB classification example',
                                  plotters=plotters,
                                  after_epoch=True,
                                  after_training=True))
    extensions.append(Printing())
    main_loop = MainLoop(model=model,
                         data_stream=train_data_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    main_loop.run()
def build_theano_functions(self): x = T.ftensor3('x') # shape of input : batch X time X value y = T.ftensor3('y') z = T.ftensor3('z') layers_input = [x] dims = np.array([self.input_dim]) for dim in self.lstm_layers_dim: dims = np.append(dims, dim) print "Dimensions =", dims # layer is just an index of the layer for layer in range(len(self.lstm_layers_dim)): # before the cell, input, forget and output gates, x needs to # be transformed linear = Linear( dims[layer], dims[layer + 1] * 4, #weights_init=Uniform(mean=data_mean, std=1), weights_init=IsotropicGaussian(mean=1., std=1), biases_init=Constant(0), name="linear" + str(layer)) linear.initialize() lstm_input = linear.apply(layers_input[layer]) # the lstm wants batch X time X value lstm = LSTM(dim=dims[layer + 1], weights_init=IsotropicGaussian(mean=0., std=0.5), biases_init=Constant(1), name="lstm" + str(layer)) lstm.initialize() # hack to use Orthogonal on lstm w_state lstm.W_state.set_value(Orthogonal().generate( np.random, lstm.W_state.get_value().shape)) h, _dummy = lstm.apply(lstm_input) layers_input.append(h) # the idea is to have one gaussian parametrize every frequency bin print "Last linear transform dim :", dims[1:].sum() output_transform = Linear( dims[1:].sum(), self.output_dim, weights_init=IsotropicGaussian(mean=0., std=1), biases_init=Constant(0), #use_bias=False, name="output_transform") output_transform.initialize() if len(self.lstm_layers_dim) == 1: print "hallo there, only one layer speaking" y_hat = output_transform.apply(layers_input[-1]) else: y_hat = output_transform.apply( T.concatenate(layers_input[1:], axis=2)) sig = T.nnet.relu(y_hat[:, :, :self.output_dim / 2]) + 0.05 mus = y_hat[:, :, self.output_dim / 2:] # sum likelihood with targets # sum inside log accross mixtures, sum outside log accross time inside_expo = -0.5 * ((y - mus)**2) / sig**2 expo = T.exp(inside_expo) coeff = 1. / (T.sqrt(2. 
* np.pi) * sig) inside_log = T.log(coeff * expo) inside_log_max = T.max(inside_log, axis=2, keepdims=True) LL = -(inside_log_max + T.log( T.sum(T.exp(inside_log - inside_log_max), axis=2, keepdims=True))).sum() #zinside_expo = -0.5*((z-mus)**2)/sig**2 #zexpo = T.exp(zinside_expo) #zcoeff = pis*(1./(T.sqrt(2.*np.pi)*sig)) #zinside_log = (zcoeff*zexpo).sum(axis=2) #zLL = -(T.log(zinside_log)).sum() model = Model(LL) self.model = model parameters = model.parameters grads = T.grad(LL, parameters) updates = [] lr = T.scalar('lr') for i in range(len(grads)): #updates.append(tuple([parameters[i], parameters[i] - self.lr*grads[i]])) updates.append( tuple([parameters[i], parameters[i] - lr * grads[i]])) #gradf = theano.function([x, y],[LL],updates=updates, mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False)) if self.debug: gradf = theano.function([x, y, lr], [LL, mus, sig], updates=updates) else: #gradf = theano.function([x, y, z],[zLL],updates=updates) gradf = theano.function([x, y, lr], [LL], updates=updates) f = theano.function([x], [sig, mus]) return gradf, f
def main():
    """Train a tiny LSTM sentiment classifier on IMDB with Blocks.

    Lookup embedding (emitting 4*n_h gate pre-activations) -> masked
    LSTM -> linear -> sigmoid, trained with Adam + step clipping;
    monitors cost/misclassification on train and test streams.

    Fix vs. previous revision: the test DataStream was scheduled with
    ``examples=n_train`` (the *training* set size); it now iterates
    ``test_dataset.num_examples``.
    """
    x = T.imatrix('features')
    m = T.matrix('features_mask')
    y = T.imatrix('targets')
    #x_int = x.astype(dtype='int32').T
    x_int = x.T

    train_dataset = IMDB('train')
    n_voc = len(train_dataset.dict.keys())
    n_h = 2
    # +2 reserves indices for out-of-vocabulary / padding tokens
    # (presumably — confirm against the IMDB dictionary encoding).
    lookup = LookupTable(
        length=n_voc + 2,
        dim=n_h * 4,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    lookup.initialize()

    #rnn = SimpleRecurrent(
    #dim = n_h,
    #activation=Tanh(),
    #weights_init = Uniform(std=0.01),
    #biases_init = Constant(0.)
    #)
    rnn = LSTM(
        dim=n_h,
        activation=Tanh(),
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    rnn.initialize()
    score_layer = Linear(
        input_dim=n_h,
        output_dim=1,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.))
    score_layer.initialize()

    # zero out embeddings at padded positions
    embedding = lookup.apply(x_int) * T.shape_padright(m.T)
    #embedding = lookup.apply(x_int) + m.T.mean()*0
    #embedding = lookup.apply(x_int) + m.T.mean()*0

    # NOTE(review): Blocks' LSTM.apply returns (states, cells);
    # rnn_states[-1] therefore selects the *cells* sequence, not the
    # last hidden state — confirm this is intended (the commented
    # two-value unpacking suggests otherwise).
    rnn_states = rnn.apply(embedding, mask=m.T)
    #rnn_states, rnn_cells = rnn.apply(embedding)
    rnn_out_mean_pooled = rnn_states[-1]
    #rnn_out_mean_pooled = rnn_states.mean()

    probs = Sigmoid().apply(
        score_layer.apply(rnn_out_mean_pooled))

    # binary cross-entropy written out explicitly
    cost = - (y * T.log(probs) + (1 - y) * T.log(1 - probs)).mean()
    cost.name = 'cost'

    misclassification = (y * (probs < 0.5) + (1 - y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'

    # =================
    cg = ComputationGraph([cost])
    params = cg.parameters
    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            StepClipping(threshold=10),
            Adam(),
            #AdaDelta(),
        ])
    )

    # ========
    test_dataset = IMDB('test')
    batch_size = 64
    n_train = train_dataset.num_examples
    train_stream = DataStream(
        dataset=train_dataset,
        iteration_scheme=ShuffledScheme(
            examples=n_train,
            batch_size=batch_size)
    )
    train_padded = Padding(
        data_stream=train_stream,
        mask_sources=('features',)
        #mask_sources=[]
    )

    test_stream = DataStream(
        dataset=test_dataset,
        # was examples=n_train — the training-set size
        iteration_scheme=ShuffledScheme(
            examples=test_dataset.num_examples,
            batch_size=batch_size)
    )
    test_padded = Padding(
        data_stream=test_stream,
        mask_sources=('features',)
        #mask_sources=[]
    )
    #import ipdb
    #ipdb.set_trace()

    #======
    model = Model(cost)
    extensions = []
    extensions.append(EpochProgress(
        batch_per_epoch=train_dataset.num_examples // batch_size + 1))
    extensions.append(TrainingDataMonitoring(
        [cost, misclassification],
        prefix='train',
        after_epoch=True
    ))
    extensions.append(DataStreamMonitoring(
        [cost, misclassification],
        data_stream=test_padded,
        prefix='test',
        after_epoch=True
    ))
    extensions.append(Timing())
    extensions.append(Printing())

    main_loop = MainLoop(
        model=model,
        data_stream=train_padded,
        algorithm=algorithm,
        extensions=extensions)
    main_loop.run()
class impatientLayer:
    """Impatient-reader style attention over a document, re-reading it
    once per query token, followed by a sequence-to-sequence LSTM over
    the evolving representations r."""

    # both visual and word feature are in the joint space
    # of dim: feature_dim
    # hidden_dim: dim of m
    # output_dim: final joint document query representation dim
    def __init__(self, feature_dim, hidden_dim, output_dim):
        # Embeds each document position into the attention space.
        self.image_embed = Linear(input_dim=feature_dim,
                                  output_dim=hidden_dim,
                                  weights_init=IsotropicGaussian(0.01),
                                  biases_init=Constant(0),
                                  use_bias=False,
                                  name='image_embed')
        # Embeds the current query token into the attention space.
        self.word_embed = Linear(input_dim=feature_dim,
                                 output_dim=hidden_dim,
                                 weights_init=IsotropicGaussian(0.01),
                                 biases_init=Constant(0),
                                 use_bias=False,
                                 name='word_embed')
        # Embeds the previous representation r_{i-1}.
        self.r_embed = Linear(input_dim=feature_dim,
                              output_dim=hidden_dim,
                              weights_init=IsotropicGaussian(0.01),
                              biases_init=Constant(0),
                              use_bias=False,
                              name='r_embed')
        # Scores each document position (attention energy).
        self.m_to_s = Linear(input_dim=hidden_dim,
                             output_dim=1,
                             weights_init=IsotropicGaussian(0.01),
                             biases_init=Constant(0),
                             use_bias=False,
                             name='m_to_s')
        self.attention_dist = Softmax(name='attention_dist_softmax')
        # Recurrent transform of r carried between query tokens.
        self.r_to_r = Linear(input_dim=feature_dim,
                             output_dim=feature_dim,
                             weights_init=IsotropicGaussian(0.01),
                             biases_init=Constant(0),
                             use_bias=False,
                             name='r_to_r')
        # self.r_to_g = Linear(input_dim=feature_dim,
        #                      output_dim=output_dim,
        #                      weights_init=IsotropicGaussian(0.01),
        #                      biases_init=Constant(0),
        #                      use_bias=False,
        #                      name='r_to_g')
        self.image_embed.initialize()
        self.word_embed.initialize()
        self.r_embed.initialize()
        self.m_to_s.initialize()
        self.r_to_r.initialize()
        # self.r_to_g.initialize()

        # the sequence to sequence LSTM
        self.seq = LSTM(output_dim,
                        name='rewatcher_seq',
                        weights_init=IsotropicGaussian(0.01),
                        biases_init=Constant(0))
        self.seq_embed = Linear(feature_dim,
                                output_dim * 4,
                                name='rewatcher_seq_embed',
                                weights_init=IsotropicGaussian(0.01),
                                biases_init=Constant(0),
                                use_bias=False)
        self.seq.initialize()
        self.seq_embed.initialize()

    # doc: row major batch_size x doc_length x feature_dim
    # query: row major batch_size x q x feature_dim
    # mask: mask of query batch_size
    # mask: length of a sentence - 1
    def apply(self, doc, query, mask_, batch_size):
        """Attend over doc once per query token.

        Returns ``(r_q, seq_r_q)``: the final representation r at the
        masked (last valid) query position and the corresponding LSTM
        output, both batch_size x feature/output dim.
        """
        # batch_size x doc_length x hidden_dim
        mask = mask_.flatten()
        # Document embedding is query-independent, so computed once
        # and passed to the scan step as a non-sequence.
        att1 = self.image_embed.apply(doc)

        # y_q_i: the ith token of question
        #        batch_size x feature_dim
        # r_1: r_m_1
        #      batch_size x feature_dim
        # y_d: document
        #      batch_size x doc_length x feature_dim
        # y_d_m: d-to-m
        #        batch_size x doc_length x hidden_dim
        def one_step(y_q_i, r_1, y_d, y_d_m):
            # batch_size x hidden_dim
            att2 = self.r_embed.apply(r_1)
            # batch_size x hidden_dim
            att3 = self.word_embed.apply(y_q_i)
            # Additive attention: broadcast token/r embeddings over
            # every document position.
            att = y_d_m + att2.dimshuffle(0, 'x', 1) + att3.dimshuffle(0, 'x', 1)
            # batch_size x doc_length x hidden_dim
            m = T.tanh(att)
            # batch_size x doc_length x 1
            s = self.m_to_s.apply(m)
            # batch_size x doc_length
            s = s.reshape((s.shape[0], s.shape[1]))
            s = self.attention_dist.apply(s)
            y_d_s = y_d.swapaxes(1, 2)
            # return batch_size x feature_dim: attention-weighted doc
            # summary plus the recurrent carry-over of r.
            return T.batched_dot(y_d_s, s) + T.tanh(self.r_to_r.apply(r_1))

        # query: batch_size x q x feature_dim
        # r: q x batch_size x feature_dim
        r, updates = theano.scan(fn=one_step,
                                 sequences=[query.swapaxes(0, 1)],
                                 outputs_info=T.zeros_like(doc[:, 0, :]),
                                 non_sequences=[doc, att1],
                                 n_steps=query.shape[1],
                                 name='impatient layer')

        # for the sequence encoder
        # q x batch_size x output_dim
        Wr = self.seq_embed.apply(r)
        # q x batch_size x output_dim
        seq_r, garbage = self.seq.apply(Wr)
        # batch_size x feature_dim — pick, per batch item, the r at the
        # position given by mask (the last valid query token).
        r_q = r[mask, T.arange(batch_size), :]
        seq_r_q = seq_r[mask, T.arange(batch_size), :]
        # batch_size x output_dim
        return r_q, seq_r_q
def main(model_path, recurrent_type):
    """Train a character-level tagger on One Billion Word.

    Builds lookup -> (bidirectional) RNN -> linear -> softmax over
    characters, with either an LSTM or a bidirectional SimpleRecurrent
    selected by ``recurrent_type`` ('lstm' or 'simple'); trains with
    Adam and dumps parameters to ``model_path``.
    """
    dataset_options = dict(dictionary=char2code, level="character",
                           preprocess=_lower)
    dataset = OneBillionWord("training", [99], **dataset_options)
    data_stream = dataset.get_example_stream()
    data_stream = Filter(data_stream, _filter_long)
    data_stream = Mapping(data_stream, _make_target,
                          add_sources=('target',))
    data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(100))
    data_stream = Padding(data_stream)
    data_stream = Mapping(data_stream, _transpose)

    features = tensor.lmatrix('features')
    features_mask = tensor.matrix('features_mask')
    target = tensor.lmatrix('target')
    target_mask = tensor.matrix('target_mask')

    dim = 100
    lookup = LookupTable(len(all_chars), dim,
                         weights_init=IsotropicGaussian(0.01),
                         biases_init=Constant(0.))
    if recurrent_type == 'lstm':
        # dim / 4 (Python 2 integer division) keeps the overall state
        # size comparable to the simple RNN branch.
        rnn = LSTM(dim / 4, Tanh(),
                   weights_init=IsotropicGaussian(0.01),
                   biases_init=Constant(0.))
    elif recurrent_type == 'simple':
        rnn = SimpleRecurrent(dim, Tanh())
        rnn = Bidirectional(rnn,
                            weights_init=IsotropicGaussian(0.01),
                            biases_init=Constant(0.))
    else:
        raise ValueError('Not known RNN type')
    rnn.initialize()
    lookup.initialize()
    y_hat = rnn.apply(lookup.apply(features), mask=features_mask)
    print len(all_chars)
    # NOTE(review): input_dim 2*dim matches the bidirectional branch;
    # confirm it is also what the LSTM branch emits.
    linear = Linear(2 * dim, len(all_chars),
                    weights_init=IsotropicGaussian(0.01),
                    biases_init=Constant(0.))
    linear.initialize()
    y_hat = linear.apply(y_hat)

    # Flatten (time, batch) to apply softmax row-wise, then restore.
    seq_lenght = y_hat.shape[0]
    batch_size = y_hat.shape[1]
    y_hat = Softmax().apply(
        y_hat.reshape((seq_lenght * batch_size, -1))).reshape(y_hat.shape)

    # Scale back up because .apply averages over the flattened rows.
    cost = CategoricalCrossEntropy().apply(
        target.flatten(),
        y_hat.reshape((-1, len(all_chars)))) * seq_lenght * batch_size
    cost.name = 'cost'
    cost_per_character = cost / features_mask.sum()
    cost_per_character.name = 'cost_per_character'

    cg = ComputationGraph([cost, cost_per_character])
    model = Model(cost)
    algorithm = GradientDescent(step_rule=Adam(), cost=cost,
                                params=cg.parameters)
    train_monitor = TrainingDataMonitoring(
        [cost, cost_per_character], prefix='train',
        after_batch=True)

    extensions = [train_monitor,
                  Printing(every_n_batches=40),
                  Dump(model_path, every_n_batches=200),
                  #Checkpoint('rnn.pkl', every_n_batches=200)
                  ]
    main_loop = MainLoop(model=model,
                         algorithm=algorithm,
                         data_stream=data_stream,
                         extensions=extensions)
    main_loop.run()
def build_theano_functions(self):
    """Build manual-SGD training and sampling Theano functions for the
    stacked-LSTM mixture-density (GMM) network.

    Returns ``(gradf, f)``: ``gradf`` performs one SGD step with a
    run-time learning rate; ``f`` maps the input sequence to the
    mixture parameters ``(pis, sig, mus)``.

    NOTE(review): assumes x is (batch, time, value) and y is rank-4
    (batch, sequence, mixture-broadcast, time) — confirm with caller.
    """
    x = T.ftensor3('x')  # shape of input : batch X time X value
    y = T.ftensor4('y')

    layers_input = [x]
    dims = np.array([self.time_dim])
    for dim in self.lstm_layers_dim:
        dims = np.append(dims, dim)
    print "Dimensions =", dims

    # layer is just an index of the layer
    for layer in range(len(self.lstm_layers_dim)):

        # before the cell, input, forget and output gates, x needs to
        # be transformed
        linear = Linear(
            dims[layer], dims[layer + 1] * 4,
            weights_init=Orthogonal(self.orth_scale),
            #weights_init=IsotropicGaussian(mean=1.,std=1),
            biases_init=Constant(0),
            name="linear" + str(layer))
        linear.initialize()
        lstm_input = linear.apply(layers_input[layer])

        # the lstm wants batch X time X value
        lstm = LSTM(dim=dims[layer + 1],
                    weights_init=IsotropicGaussian(mean=0., std=0.5),
                    biases_init=Constant(1),
                    name="lstm" + str(layer))
        lstm.initialize()
        # hack to use Orthogonal on lstm w_state
        lstm.W_state.set_value(
            self.orth_scale * Orthogonal().generate(np.random,
                lstm.W_state.get_value().shape))

        h, _dummy = lstm.apply(lstm_input)
        layers_input.append(h)

    # this is where Alex Graves' paper starts: all hidden layers feed
    # the output transform (skip connections), hence dims[1:].sum().
    print "Last linear transform dim :", dims[1:].sum()
    output_transform = Linear(
        dims[1:].sum(), self.output_dim,
        weights_init=Orthogonal(self.orth_scale),
        #weights_init=IsotropicGaussian(mean=0., std=1),
        use_bias=False,
        name="output_transform")
    output_transform.initialize()
    if len(self.lstm_layers_dim) == 1:
        print "hallo there, only one layer speaking"
        y_hat = output_transform.apply(layers_input[-1])
    else:
        y_hat = output_transform.apply(
            T.concatenate(layers_input[1:], axis=2))

    # transforms to find each gmm params (mu, pi, sig)
    # small hack to softmax a 3D tensor
    #pis = T.reshape(
    #   T.nnet.softmax(
    #       T.nnet.sigmoid(
    #           T.reshape(y_hat[:,:,0:self.gmm_dim], (self.time_dim*self.batch_dim, self.gmm_dim)))),
    #   (self.batch_dim, self.time_dim, self.gmm_dim))
    pis = T.reshape(
        T.nnet.softmax(
            T.reshape(y_hat[:, :, :self.gmm_dim],
                      (self.sequence_dim * self.batch_dim, self.gmm_dim))),
        (self.batch_dim, self.sequence_dim, self.gmm_dim))
    # exp keeps sigma positive; +1e-6 guards the division below
    sig = T.exp(y_hat[:, :, self.gmm_dim:self.gmm_dim * 2]) + 1e-6
    #sig = T.nnet.relu(y_hat[:,:,self.gmm_dim:self.gmm_dim*2])+0.1
    #mus = 2.*T.tanh(y_hat[:,:,self.gmm_dim*2:])
    mus = y_hat[:, :, self.gmm_dim * 2:]

    # lift everything to rank 4 :: (batch, sequence, mixture, time)
    pis = pis[:, :, :, np.newaxis]
    mus = mus[:, :, :, np.newaxis]
    sig = sig[:, :, :, np.newaxis]
    #y = y[:,:,np.newaxis,:]

    y = T.patternbroadcast(y, (False, False, True, False))
    mus = T.patternbroadcast(mus, (False, False, False, True))
    sig = T.patternbroadcast(sig, (False, False, False, True))

    # sum likelihood with targets
    # see blog for this crazy Pr() = sum log sum prod
    # axes :: (batch, sequence, mixture, time)
    expo_term = -0.5 * ((y - mus)**2) / sig**2
    coeff = T.log(T.maximum(1. / (T.sqrt(2. * np.pi) * sig), EPS))
    #coeff = T.log(1./(T.sqrt(2.*np.pi)*sig))
    sequences = coeff + expo_term
    log_sequences = T.log(pis + EPS) + T.sum(
        sequences, axis=3, keepdims=True)

    # log-sum-exp over the mixture axis for numerical stability
    log_sequences_max = T.max(log_sequences, axis=2, keepdims=True)

    LL = -(log_sequences_max + T.log(EPS + T.sum(
        T.exp(log_sequences - log_sequences_max), axis=2, keepdims=True))
           ).mean()

    model = Model(LL)
    self.model = model
    parameters = model.parameters
    grads = T.grad(LL, parameters)
    # Plain SGD; lr is a symbolic input so it can be annealed per call.
    updates = []
    lr = T.scalar('lr')
    for i in range(len(grads)):
        #updates.append(tuple([parameters[i], parameters[i] - self.lr*grads[i]]))
        updates.append(
            tuple([parameters[i], parameters[i] - lr * grads[i]]))

    #gradf = theano.function([x, y],[LL],updates=updates, mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))
    if self.debug:
        # debug mode also returns the mixture parameters
        gradf = theano.function([x, y, lr], [LL, pis, mus, sig],
                                updates=updates)
    else:
        #gradf = theano.function([x, y, z],[zLL],updates=updates)
        gradf = theano.function([x, y, lr], [LL], updates=updates)
    f = theano.function([x], [pis, sig, mus])
    return gradf, f
def main(): x = T.tensor3('features') m = T.matrix('features_mask') y = T.imatrix('targets') #rnn = SimpleRecurrent( #dim = 50, #activation=Tanh(), #weights_init = Uniform(std=0.01), #biases_init = Constant(0.) #) #rnn = GatedRecurrent( #dim = 50, #activation=Tanh(), #weights_init = Uniform(std=0.01), #biases_init = Constant(0.) #) embedding_size = 300 #glove_version = "vectors.6B.100d.txt" glove_version = "glove.6B.300d.txt" #fork = Fork(weights_init=IsotropicGaussian(0.02), #biases_init=Constant(0.), #input_dim=embedding_size, #output_dims=[embedding_size]*3, #output_names=['inputs', 'reset_inputs', 'update_inputs'] #) rnn = LSTM( dim = embedding_size, activation=Tanh(), weights_init = IsotropicGaussian(std=0.02), ) rnn.initialize() #fork.initialize() wstd = 0.02 score_layer = Linear( input_dim = 128, output_dim = 1, weights_init = IsotropicGaussian(std=wstd), biases_init = Constant(0.), name="linear2") score_layer.initialize() gloveMapping = Linear( input_dim = embedding_size, output_dim = embedding_size, weights_init = IsotropicGaussian(std=wstd), biases_init = Constant(0.0), name="gloveMapping" ) gloveMapping.initialize() o = gloveMapping.apply(x) o = Rectifier(name="rectivfyglove").apply(o) forget_bias = np.zeros((embedding_size*4), dtype=theano.config.floatX) forget_bias[embedding_size:embedding_size*2] = 4.0 toLSTM = Linear( input_dim = embedding_size, output_dim = embedding_size*4, weights_init = IsotropicGaussian(std=wstd), biases_init = Constant(forget_bias), #biases_init = Constant(0.0), name="ToLSTM" ) toLSTM.initialize() rnn_states, rnn_cells = rnn.apply(toLSTM.apply(o) * T.shape_padright(m), mask=m) #inputs, reset_inputs, update_inputs = fork.apply(x) #rnn_states = rnn.apply(inputs=inputs, reset_inputs=reset_inputs, update_inputs=update_inputs, mask=m) #rnn_out = rnn_states[:, -1, :] rnn_out = (rnn_states * m.dimshuffle(0, 1, 'x')).sum(axis=1) / m.sum(axis=1).dimshuffle(0, 'x') #rnn_out = (rnn_states).mean(axis=1)# / m.sum(axis=1) hidden = Linear( 
input_dim = embedding_size, output_dim = 128, weights_init = Uniform(std=0.01), biases_init = Constant(0.)) hidden.initialize() o = hidden.apply(rnn_out) o = Rectifier().apply(o) hidden = Linear( input_dim = 128, output_dim = 128, weights_init = IsotropicGaussian(std=0.02), biases_init = Constant(0.), name="hiddenmap2") hidden.initialize() o = hidden.apply(o) o = Rectifier(name="rec2").apply(o) o = score_layer.apply(o) probs = Sigmoid().apply(o) cost = - (y * T.log(probs) + (1-y) * T.log(1 - probs)).mean() cost.name = 'cost' misclassification = (y * (probs < 0.5) + (1-y) * (probs > 0.5)).mean() misclassification.name = 'misclassification' #print (rnn_states * m.dimshuffle(0, 1, 'x')).sum(axis=1).shape.eval( #{x : np.ones((45, 111, embedding_size), dtype=theano.config.floatX), #m : np.ones((45, 111), dtype=theano.config.floatX)}) #print (m).sum(axis=1).shape.eval({ #m : np.ones((45, 111), dtype=theano.config.floatX)}) #print (m).shape.eval({ #m : np.ones((45, 111), dtype=theano.config.floatX)}) #raw_input() # ================= cg = ComputationGraph([cost]) params = cg.parameters algorithm = GradientDescent( cost = cost, params=params, step_rule = CompositeRule([ StepClipping(threshold=10), AdaM(), #AdaDelta(), ]) ) # ======== print "setting up data" train_dataset = IMDBText('train') test_dataset = IMDBText('test') batch_size = 16 n_train = train_dataset.num_examples train_stream = DataStream( dataset=train_dataset, iteration_scheme=ShuffledScheme( examples=n_train, batch_size=batch_size) ) glove = GloveTransformer(glove_version, data_stream=train_stream) train_padded = Padding( data_stream=glove, mask_sources=('features',) #mask_sources=[] ) test_stream = DataStream( dataset=test_dataset, iteration_scheme=ShuffledScheme( examples=n_train, batch_size=batch_size) ) glove = GloveTransformer(glove_version, data_stream=test_stream) test_padded = Padding( data_stream=glove, mask_sources=('features',) #mask_sources=[] ) print "setting up model" #import ipdb 
#ipdb.set_trace() lstm_norm = rnn.W_state.norm(2) lstm_norm.name = "lstm_norm" pre_norm= gloveMapping.W.norm(2) pre_norm.name = "pre_norm" #====== model = Model(cost) extensions = [] extensions.append(EpochProgress(batch_per_epoch=train_dataset.num_examples // batch_size + 1)) extensions.append(TrainingDataMonitoring( [cost, misclassification, lstm_norm, pre_norm], prefix='train', after_epoch=True )) extensions.append(DataStreamMonitoring( [cost, misclassification], data_stream=test_padded, prefix='test', after_epoch=True )) extensions.append(Timing()) extensions.append(Printing()) extensions.append(Plot("norms", channels=[['train_lstm_norm', 'train_pre_norm']], after_epoch=True)) extensions.append(Plot("result", channels=[['train_cost', 'train_misclassification']], after_epoch=True)) main_loop = MainLoop( model=model, data_stream=train_padded, algorithm=algorithm, extensions=extensions) main_loop.run()
def rf_lstm_experiment(data_name, exp_network, in_dim, out_dim, num_layers, start_neurons, num_neurons, batch_size, num_epochs): """LSTM Experiment.""" # load dataset train_set = IterableDataset( ds.transform_sequence(data_name, "train", batch_size)) test_set = IterableDataset( ds.transform_sequence(data_name, "test", batch_size)) stream_train = DataStream(dataset=train_set) stream_test = DataStream(dataset=test_set) methods = ['sgd', 'momentum', 'adagrad', 'rmsprop'] for n_layers in xrange(1, num_layers + 1): for n_neurons in xrange(start_neurons, num_neurons + 5, 5): for method in methods: X = T.tensor3("features") y = T.matrix("targets") x_to_h = Linear(in_dim, n_neurons * 4, name='x_to_h', weights_init=IsotropicGaussian(), biases_init=Constant(0.0)) lstm = LSTM(n_neurons, name='lstm', weights_init=IsotropicGaussian(), biases_init=Constant(0.0)) h_to_o = nc.setup_ff_network(n_neurons, out_dim, n_layers - 1, n_neurons) X_trans = x_to_h.apply(X) h, c = lstm.apply(X_trans) y_hat = h_to_o.apply(h[-1]) cost, cg = nc.create_cg_and_cost(y, y_hat, "none") lstm.initialize() x_to_h.initialize() h_to_o.initialize() algorithm = nc.setup_algorithms(cost, cg, method, type="RNN") test_monitor = DataStreamMonitoring(variables=[cost], data_stream=stream_test, prefix="test") train_monitor = TrainingDataMonitoring(variables=[cost], prefix="train", after_epoch=True) main_loop = MainLoop( algorithm=algorithm, data_stream=stream_train, extensions=[ test_monitor, train_monitor, FinishAfter(after_n_epochs=num_epochs), Printing(), ProgressBar() ]) main_loop.run() # Saving results exp_id = ds.create_exp_id(exp_network, n_layers, n_neurons, batch_size, num_epochs, method, "none") # prepare related functions predict = theano.function([X], y_hat) # prepare related data train_features, train_targets = ds.get_iter_data(train_set) test_features, test_targets = ds.get_iter_data(test_set) # Prediction of result train_predicted = gen_prediction(predict, train_features) test_predicted = 
gen_prediction(predict, test_features) # Get cost cost = ds.get_cost_data(test_monitor, train_set.num_examples, num_epochs) # logging ds.save_experiment(train_targets, train_predicted, test_targets, test_predicted, cost, exp_network, n_layers, n_neurons, batch_size, num_epochs, method, "none", exp_id, "../results/")
def build_theano_functions(self) :
    """Build the Blocks training algorithm and a sampling function for
    the per-frequency Gaussian model.

    The flat input is reshaped to (batch, time+1, features); the
    target y is the input shifted one step (next-step prediction).
    Returns ``(algorithm, f)`` with ``algorithm`` a Blocks
    ``GradientDescent`` (AdaGrad) on the negative log-likelihood and
    ``f`` mapping the input to ``(sig, mus)``.
    """
    # shape of theano inpu is time+1 X features
    x = T.fmatrix('frequency_sequence')
    x = x.reshape((self.batch_dim, self.time_dim+1, self.input_dim))
    y = x[:,1:self.time_dim+1,:]
    x = x[:,:self.time_dim,:]

    layers_input = [x]
    dims =np.array([self.input_dim])
    for dim in self.lstm_layers_dim :
        dims = np.append(dims, dim)
    print "Dimensions =", dims

    # layer is just an index of the layer
    for layer in range(len(self.lstm_layers_dim)) :

        # before the cell, input, forget and output gates, x needs to
        # be transformed
        linear = Linear(dims[layer], dims[layer+1]*4,
                        weights_init=Orthogonal(self.orth_scale),
                        #weights_init=IsotropicGaussian(mean=1.,std=1),
                        biases_init=Constant(0),
                        name="linear"+str(layer))
        linear.initialize()
        lstm_input = linear.apply(layers_input[layer])

        # the lstm wants batch X time X value
        lstm = LSTM(
            dim=dims[layer+1],
            weights_init=IsotropicGaussian(mean=0.,std=0.5),
            biases_init=Constant(1),
            name="lstm"+str(layer))
        lstm.initialize()
        # hack to use Orthogonal on lstm w_state
        lstm.W_state.set_value(
            self.orth_scale*Orthogonal().generate(np.random,
                lstm.W_state.get_value().shape))

        h, _dummy = lstm.apply(lstm_input)
        layers_input.append(h)

    # the idea is to have one gaussian parametrize every frequency bin
    print "Last linear transform dim :", dims[1:].sum()
    output_transform = Linear(dims[1:].sum(), self.output_dim,
                              #weights_init=IsotropicGaussian(mean=0., std=1),
                              weights_init=Orthogonal(self.orth_scale),
                              biases_init=Constant(0),
                              #use_bias=False,
                              name="output_transform")
    output_transform.initialize()
    if len(self.lstm_layers_dim) == 1 :
        print "hallo there, only one layer speaking"
        y_hat = output_transform.apply(layers_input[-1])
    else :
        y_hat = output_transform.apply(T.concatenate(layers_input[1:], axis=2))

    # First half of the output is sigma (relu + 0.05 keeps it
    # positive), second half is mu; Python 2 integer division.
    sig = T.nnet.relu(y_hat[:,:,:self.output_dim/2])+0.05
    mus = y_hat[:,:,self.output_dim/2:]

    # sum likelihood with targets
    # sum inside log accross mixtures, sum outside log accross time
    inside_expo = -0.5*((y-mus)**2)/sig**2
    expo = T.exp(inside_expo)
    coeff = 1./(T.sqrt(2.*np.pi)*sig)
    inside_log = T.log(coeff*expo)
    # log-sum-exp over axis 2 for numerical stability
    inside_log_max = T.max(inside_log, axis=2, keepdims=True)
    LL = -(inside_log_max + T.log(T.sum(T.exp(inside_log - inside_log_max),
        axis=2, keepdims=True))).sum()
    LL.name = "summed_likelihood"

    model = Model(LL)
    self.model = model

    algorithm = GradientDescent(
        cost=LL,
        parameters=model.parameters,
        step_rule=AdaGrad())

    f = theano.function([x],[sig, mus])

    return algorithm, f
class videoAttentionLayer:
    """Soft attention of a (projected) query over video/document frames.

    Visual and word features are assumed to live in a joint space of
    width ``feature_dim``; ``hidden_dim`` is the width of the internal
    tanh layer ``m``; ``output_dim`` is the width of the (currently
    unused here) rewatcher LSTM state.
    """

    def __init__(self, feature_dim, hidden_dim, output_dim):
        def bias_free_linear(in_dim, out_dim, name):
            # All projection bricks share the same recipe: no bias,
            # small isotropic-Gaussian weights, initialized immediately.
            brick = Linear(input_dim=in_dim,
                           output_dim=out_dim,
                           weights_init=IsotropicGaussian(0.01),
                           biases_init=Constant(0),
                           use_bias=False,
                           name=name)
            brick.initialize()
            return brick

        self.image_embed = bias_free_linear(feature_dim, hidden_dim, 'image_embed')
        self.word_embed = bias_free_linear(feature_dim, hidden_dim, 'word_embed')
        self.r_embed = bias_free_linear(feature_dim, hidden_dim, 'r_embed')
        self.m_to_s = bias_free_linear(hidden_dim, 1, 'm_to_s')
        self.attention_dist = Softmax(name='attention_dist_softmax')
        self.r_to_r = bias_free_linear(feature_dim, feature_dim, 'r_to_r')

        # the sequence to sequence LSTM (built and initialized but not
        # used by apply() below)
        self.seq = LSTM(output_dim, name='rewatcher_seq',
                        weights_init=IsotropicGaussian(0.01),
                        biases_init=Constant(0))
        self.seq.initialize()
        self.seq_embed = Linear(feature_dim, output_dim * 4,
                                name='rewatcher_seq_embed',
                                weights_init=IsotropicGaussian(0.01),
                                biases_init=Constant(0),
                                use_bias=False)
        self.seq_embed.initialize()

    def apply(self, doc, query, mask_, batch_size):
        """Attend over ``doc`` conditioned on ``query``.

        doc: batch_size x doc_length x feature_dim (row major)
        query: batch_size x feature_dim
        mask_: query mask (flattened below but otherwise unused here)
        Returns the attention-weighted document: batch_size x feature_dim.
        """
        mask = mask_.flatten()  # kept for parity; not used in this graph
        # Project frames and query into the shared hidden space.
        frames_h = self.image_embed.apply(doc)            # B x L x H
        query_h = self.word_embed.apply(query)            # B x H
        # Broadcast the query over the frame axis and squash.
        merged = T.tanh(frames_h + query_h.dimshuffle(0, 'x', 1))
        # One scalar score per frame.
        scores = self.m_to_s.apply(merged)                # B x L x 1
        scores = scores.reshape((scores.shape[0], scores.shape[1]))
        weights = self.attention_dist.apply(scores)       # B x L
        # Weighted sum of the original frames: B x feature_dim.
        return T.batched_dot(doc.swapaxes(1, 2), weights)
class questionAttentionLayer:
    """Soft attention of a (projected) video feature over question tokens.

    Mirrors ``videoAttentionLayer`` with the roles swapped: the word
    projections vary over the sequence axis and the video projection is
    broadcast. A large-negative additive mask removes padded tokens
    before the softmax.
    """

    def __init__(self, feature_dim, hidden_dim, output_dim):
        def bias_free_linear(in_dim, out_dim, name):
            # Shared recipe: bias-free, small-Gaussian init, ready to use.
            brick = Linear(input_dim=in_dim,
                           output_dim=out_dim,
                           weights_init=IsotropicGaussian(0.01),
                           biases_init=Constant(0),
                           use_bias=False,
                           name=name)
            brick.initialize()
            return brick

        self.image_embed = bias_free_linear(feature_dim, hidden_dim, 'iw_image_embed')
        self.word_embed = bias_free_linear(feature_dim, hidden_dim, 'iw_word_embed')
        self.r_embed = bias_free_linear(feature_dim, hidden_dim, 'iw_r_embed')
        self.m_to_s = bias_free_linear(hidden_dim, 1, 'iw_m_to_s')
        self.attention_dist = Softmax(name='iw_attetion')
        self.r_to_r = bias_free_linear(feature_dim, feature_dim, 'iw_r_to_r')

        # Rereader LSTM (constructed and initialized; not used by apply).
        self.seq = LSTM(feature_dim, name='rereader_seq',
                        weights_init=IsotropicGaussian(0.01),
                        biases_init=Constant(0))
        self.seq.initialize()
        self.seq_embed = Linear(feature_dim, output_dim * 4,
                                name='rereader_seq_embed',
                                weights_init=IsotropicGaussian(0.01),
                                biases_init=Constant(0),
                                use_bias=False)
        self.seq_embed.initialize()

    def apply(self, video, query, mask, batch_size):
        """Attend over ``query`` tokens conditioned on ``video``.

        video: batch_size x video_length x feature_dim
        query: batch_size x q x feature_dim
        mask: additive mask, batch_size x q — padded positions hold a
              large negative value (e.g. -10000 standing in for -inf)
              so the softmax assigns them ~zero weight.
        Returns: batch_size x feature_dim.
        """
        tokens_h = self.word_embed.apply(query)           # B x q x H
        video_h = self.image_embed.apply(video)
        # Broadcast the video projection across the token axis.
        merged = T.tanh(tokens_h + video_h.dimshuffle(0, 'x', 1))
        # One scalar score per token.
        scores = self.m_to_s.apply(merged)
        scores = scores.reshape((scores.shape[0], scores.shape[1]))
        # ignore the question padding 0s via the additive mask
        weights = self.attention_dist.apply(scores + mask)
        # Weighted sum of the original token features.
        return T.batched_dot(query.swapaxes(1, 2), weights)
def main(num_epochs=100):
    """Train a character/word-level LSTM language model on
    'inspirational.txt' and periodically print Gumbel-max samples.

    Args:
        num_epochs: number of training epochs before FinishAfter stops
            the main loop.

    NOTE(review): this function mixes graph construction, training setup
    and a hand-rolled theano.scan sampler; statement order matters
    (bricks must be initialized before weights are read by the sampler's
    compiled function).
    """
    x = tensor.matrix('features')
    m = tensor.matrix('features_mask')
    # Transpose to time-major int indices, as Blocks recurrent bricks
    # expect (time, batch) layout.
    x_int = x.astype(dtype='int32').T
    train_dataset = TextFile('inspirational.txt')
    # Sort examples by length so padded batches waste less computation.
    train_dataset.indexables[0] = numpy.array(sorted(
        train_dataset.indexables[0], key=len
    ))
    n_voc = len(train_dataset.dict.keys())
    # Empirical distribution of the first token of each sentence; used
    # to seed the sampler below.
    init_probs = numpy.array(
        [sum(filter(lambda idx:idx == w,
                    [s[0] for s in train_dataset.indexables[
                        train_dataset.sources.index('features')]]
                    )) for w in xrange(n_voc)],
        dtype=theano.config.floatX
    )
    init_probs = init_probs / init_probs.sum()
    n_h = 100
    # LookupTable emits 4*n_h so its output feeds the LSTM's four gates
    # directly, without a separate input-to-gates Linear brick.
    linear_embedding = LookupTable(
        length=n_voc,
        dim=4*n_h,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    linear_embedding.initialize()
    rnn = LSTM(
        dim=n_h,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    rnn.initialize()
    score_layer = Linear(
        input_dim=n_h,
        output_dim=n_voc,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    score_layer.initialize()
    # Zero out embeddings at padded positions (mask is shifted by one
    # step, matching the next-token targets).
    embedding = (linear_embedding.apply(x_int[:-1])
                 * tensor.shape_padright(m.T[1:]))
    rnn_out = rnn.apply(inputs=embedding, mask=m.T[1:])
    # rnn_out[0] are the hidden states; softmax over vocabulary at every
    # (time, batch) position.
    probs = softmax(
        sequence_map(score_layer.apply, rnn_out[0], mask=m.T[1:])[0]
    )
    # Only score positions that are real (unmasked) tokens.
    idx_mask = m.T[1:].nonzero()
    cost = CategoricalCrossEntropy().apply(
        x_int[1:][idx_mask[0], idx_mask[1]],
        probs[idx_mask[0], idx_mask[1]]
    )
    cost.name = 'cost'
    misclassification = MisclassificationRate().apply(
        x_int[1:][idx_mask[0], idx_mask[1]],
        probs[idx_mask[0], idx_mask[1]]
    )
    misclassification.name = 'misclassification'
    cg = ComputationGraph([cost])
    params = cg.parameters
    # NOTE(review): 'params=' is the pre-0.1 Blocks keyword; newer
    # releases use 'parameters=' — confirm against the pinned version.
    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule(
            [StepClipping(10.),
             Adam()])
    )
    train_data_stream = Padding(
        data_stream=DataStream(
            dataset=train_dataset,
            iteration_scheme=BatchwiseShuffledScheme(
                examples=train_dataset.num_examples,
                batch_size=10,
            )
        ),
        mask_sources=('features',)
    )
    model = Model(cost)
    extensions = []
    extensions.append(Timing())
    extensions.append(FinishAfter(after_n_epochs=num_epochs))
    extensions.append(TrainingDataMonitoring(
        [cost, misclassification],
        prefix='train',
        after_epoch=True))
    # --- Gumbel-max sampler: argmax(log p + Gumbel noise) draws from p.
    batch_size = 10
    length = 30
    trng = MRG_RandomStreams(18032015)
    u = trng.uniform(size=(length, batch_size, n_voc))
    gumbel_noise = -tensor.log(-tensor.log(u))
    # First token sampled from the empirical start-of-sentence distribution.
    init_samples = (tensor.log(init_probs).dimshuffle(('x', 0))
                    + gumbel_noise[0]).argmax(axis=-1)
    init_states = rnn.initial_state('states', batch_size)
    init_cells = rnn.initial_state('cells', batch_size)

    def sampling_step(g_noise, states, cells, samples_step):
        # One decode step: embed previous sample, advance the LSTM once
        # (iterate=False), and draw the next token via Gumbel-max.
        embedding_step = linear_embedding.apply(samples_step)
        next_states, next_cells = rnn.apply(inputs=embedding_step,
                                            states=states, cells=cells,
                                            iterate=False)
        probs_step = softmax(score_layer.apply(next_states))
        next_samples = (tensor.log(probs_step)
                        + g_noise).argmax(axis=-1)
        return next_states, next_cells, next_samples

    [_, _, samples], _ = theano.scan(
        fn=sampling_step,
        sequences=[gumbel_noise[1:]],
        outputs_info=[init_samples and init_states or init_states, init_cells, init_samples]
    )
    sampling = theano.function([], samples.owner.inputs[0].T)
    plotters = []
    plotters.append(Plotter(
        channels=[['train_cost', 'train_misclassification']],
        titles=['Costs']))
    extensions.append(PlotManager('Language modelling example',
                                  plotters=plotters,
                                  after_epoch=True,
                                  after_training=True))
    extensions.append(Printing())
    extensions.append(PrintSamples(sampler=sampling,
                                   voc=train_dataset.inv_dict))
    main_loop = MainLoop(model=model,
                         data_stream=train_data_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    main_loop.run()
def build_theano_functions(self):
    """Build the training algorithm and a debug function for the
    GMM-output LSTM stack (Graves-style mixture density network).

    The network predicts, for each sequence step, the parameters of a
    Gaussian mixture over the next time frame: mixture weights ``pis``
    (softmax), std devs ``sig`` (exp + 1e-6) and means ``mus``. The cost
    is the mean negative log-likelihood with a log-sum-exp
    stabilization over the mixture axis.

    Returns:
        (algorithm, f): a GradientDescent/Adam algorithm over the
        likelihood, and a theano function mapping the input to
        ``[pis, sig, mus]``.

    NOTE(review): as in the sibling method, ``theano.function([x], ...)``
    is compiled against the sliced/concatenated ``x`` (a non-root
    variable) — confirm it compiles. Also, when the convolution branch
    is active, ``spec`` is consumed by the graph but not listed in the
    function inputs — confirm callers compile their own functions then.
    """
    x = T.fmatrix('time_sequence')
    x = x.reshape((self.batch_dim, self.sequence_dim, self.time_dim))
    # y is x shifted one sequence step: next-frame targets.
    y = x[:,1:self.sequence_dim,:]
    x = x[:,:self.sequence_dim-1,:]

    # if we try to include the spectrogram features
    spec_dims = 0
    if self.image_size is not None :
        print "Convolution activated"
        self.init_conv()
        spec = T.ftensor4('spectrogram')
        spec_features, spec_dims = self.conv.build_conv_layers(spec)
        print "Conv final dims =", spec_dims
        spec_dims = np.prod(spec_dims)
        spec_features = spec_features.reshape(
            (self.batch_dim, self.sequence_dim-1, spec_dims))
        # Append conv features to every input frame.
        x = T.concatenate([x, spec_features], axis=2)

    # layers_input[k] feeds layer k; starts with the (augmented) input.
    layers_input = [x]
    dims =np.array([self.time_dim + spec_dims])
    for dim in self.lstm_layers_dim :
        dims = np.append(dims, dim)
    print "Dimensions =", dims

    # layer is just an index of the layer
    for layer in range(len(self.lstm_layers_dim)) :
        # before the cell, input, forget and output gates, x needs to
        # be transformed (4x width expected by the Blocks LSTM brick)
        linear = Linear(dims[layer], dims[layer+1]*4,
                weights_init=Orthogonal(self.orth_scale),
                biases_init=Constant(0),
                name="linear"+str(layer))
        linear.initialize()
        lstm_input = linear.apply(layers_input[layer])

        # the lstm wants batch X sequence X time
        lstm = LSTM(
            dim=dims[layer+1],
            weights_init=IsotropicGaussian(mean=0.,std=0.5),
            biases_init=Constant(1),
            name="lstm"+str(layer))
        lstm.initialize()
        # hack to use Orthogonal on lstm w_state: overwrite the freshly
        # initialized recurrent weights with a scaled orthogonal matrix.
        lstm.W_state.set_value(
            self.orth_scale*Orthogonal().generate(np.random, lstm.W_state.get_value().shape))
        # cells output is unused.
        h, _dummy = lstm.apply(lstm_input)

        layers_input.append(h)

    # this is where Alex Graves' paper starts: all hidden layers are
    # concatenated (skip connections) into the mixture-parameter layer.
    print "Last linear transform dim :", dims[1:].sum()
    output_transform = Linear(dims[1:].sum(), self.output_dim,
            weights_init=Orthogonal(self.orth_scale),
            use_bias=False,
            name="output_transform")
    output_transform.initialize()
    if len(self.lstm_layers_dim) == 1 :
        print "hallo there, only one layer speaking"
        y_hat = output_transform.apply(layers_input[-1])
    else :
        y_hat = output_transform.apply(T.concatenate(layers_input[1:], axis=2))

    # transforms to find each gmm params (mu, pi, sig)
    # small hack to softmax a 3D tensor: flatten (batch, seq) together,
    # softmax, then restore the shape.
    pis = T.reshape(
        T.nnet.softmax(
            T.reshape(y_hat[:,:,:self.gmm_dim], ((self.sequence_dim-1)*self.batch_dim, self.gmm_dim))),
        (self.batch_dim, (self.sequence_dim-1), self.gmm_dim))
    # exp keeps std devs positive; 1e-6 floors them away from zero.
    sig = T.exp(y_hat[:,:,self.gmm_dim:self.gmm_dim*2])+1e-6
    mus = y_hat[:,:,self.gmm_dim*2:]

    # Add singleton axes so mixture params and targets broadcast against
    # each other on the (batch, sequence, mixture, time) layout.
    pis = pis[:,:,:,np.newaxis]
    mus = mus[:,:,:,np.newaxis]
    sig = sig[:,:,:,np.newaxis]
    y = y[:,:,np.newaxis,:]

    # Make the broadcast pattern explicit for theano's type system.
    y = T.patternbroadcast(y, (False, False, True, False))
    mus = T.patternbroadcast(mus, (False, False, False, True))
    sig = T.patternbroadcast(sig, (False, False, False, True))

    # sum likelihood with targets
    # see blog for this crazy Pr() = sum log sum prod
    # axes :: (batch, sequence, mixture, time)
    expo_term = -0.5*((y-mus)**2)/sig**2
    # EPS floor protects the log against underflow in the normalizer.
    coeff = T.log(T.maximum(1./(T.sqrt(2.*np.pi)*sig), EPS))
    #coeff = T.log(1./(T.sqrt(2.*np.pi)*sig))
    sequences = coeff + expo_term
    # Per-mixture log-prob: log pi_k + sum over the time axis.
    log_sequences = T.log(pis + EPS) + T.sum(sequences, axis=3, keepdims=True)

    # Log-sum-exp over mixtures, stabilized by subtracting the max.
    log_sequences_max = T.max(log_sequences, axis=2, keepdims=True)

    LL = -(log_sequences_max + T.log(EPS + T.sum(T.exp(log_sequences - log_sequences_max), axis=2, keepdims=True))).mean()
    LL.name = "summed_likelihood"

    model = Model(LL)
    self.model = model
    parameters = model.parameters
    algorithm = GradientDescent(
        cost=LL,
        parameters=model.parameters,
        step_rule=Adam())

    # Debug/inspection function: input sequence -> mixture parameters.
    f = theano.function([x],[pis, sig, mus])

    return algorithm, f
def train(self):
    """Build the LSTM classifier graph over the shared-variable batch,
    set up the Blocks training machinery, and run the main loop.

    Reads inputs from ``self.sharedBatch`` (theano shared variables
    swapped in-place by SwitchSharedReferences), so the compiled
    functions take no explicit inputs.
    """
    x = self.sharedBatch['x']
    x.name = 'x_myinput'
    x_mask = self.sharedBatch['x_mask']
    x_mask.name = 'x_mask_myinput'
    y = self.sharedBatch['y']
    y.name = 'y_myinput'
    if self.usePro:
        # optional per-example proportion weights for the weighted loss
        proportion = self.sharedBatch['pro']
        proportion.name = 'pro'

    # we need to provide data for the LSTM layer of size 4 * ltsm_dim, see
    # LSTM layer documentation for the explanation
    x_to_h = Linear(self.input_dimx1, self.dim * 4, name='x_to_h',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))
    lstm = LSTM(self.dim, name='lstm', weights_init=IsotropicGaussian(),
                biases_init=Constant(0.0))

    h_to_o = Linear(self.dim, 1, name='h_to_o',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))

    x_transform = x_to_h.apply(x)
    h, c = lstm.apply(x_transform, mask=x_mask)

    # only values of hidden units of the last timeframe are used for
    # the classification
    y_hat = h_to_o.apply(h[-1])
    y_hat = Logistic().apply(y_hat)

    if self.usePro:
        cost = BinaryCrossEntropyProp().apply(y, y_hat, proportion)
    else:
        cost = BinaryCrossEntropy().apply(y, y_hat)

    cost.name = 'cost'

    lstm.initialize()
    x_to_h.initialize()
    h_to_o.initialize()

    # Input-free functions: they read the current shared batch directly.
    self.f = theano.function(inputs=[], outputs=y_hat)
    self.lastH = theano.function(inputs=[], outputs=h[-1])
    self.cg = ComputationGraph(cost)
    m = Model(cost)

    # on_unused_sources='ignore' because the stream may carry sources
    # (e.g. 'pro') that the non-weighted graph does not consume.
    algorithm = GradientDescent(cost=cost,
                                parameters=self.cg.parameters,
                                step_rule=RMSProp(learning_rate=0.01),
                                on_unused_sources='ignore')

    valid_monitor = DataStreamMonitoringShared(
        variables=[cost],
        data_stream=self.stream_valid_int,
        prefix="valid",
        sharedBatch=self.sharedBatch,
        sharedData=self.sharedData)

    train_monitor = TrainingDataMonitoring(variables=[cost], prefix="train",
                                           after_epoch=True)

    sharedVarMonitor = SwitchSharedReferences(self.sharedBatch,
                                              self.sharedData)
    # track_best returns [tracker, checkpoint-style extensions].
    tBest = self.track_best('valid_cost', self.cg)
    self.tracker = tBest[0]
    extensions = [sharedVarMonitor, valid_monitor] + tBest

    if self.debug:
        extensions.append(Printing())

    self.algorithm = algorithm
    self.extensions = extensions
    self.model = m
    self.mainloop = MainLoop(self.algorithm, self.stream_train_int,
                             extensions=self.extensions, model=self.model)
    # NOTE(review): self.main_loop (with underscore) is presumably a
    # wrapper method defined elsewhere on this class, distinct from the
    # self.mainloop attribute assigned above — confirm it is not a typo.
    self.main_loop(True)
class TestLSTM(unittest.TestCase):
    """Unit tests for the Blocks LSTM brick: a single step and an
    iterated, masked sequence are checked against a hand-written numpy
    reference implementation (all weights Constant(2), biases zero)."""

    def setUp(self):
        # dim=3 with Constant(2) weights makes the reference math below
        # exactly reproducible.
        self.lstm = LSTM(dim=3, weights_init=Constant(2),
                         biases_init=Constant(0))
        self.lstm.initialize()

    def test_one_step(self):
        """One non-iterated LSTM step matches the numpy reference."""
        h0 = tensor.matrix('h0')
        c0 = tensor.matrix('c0')
        x = tensor.matrix('x')
        h1, c1 = self.lstm.apply(x, h0, c0, iterate=False)
        next_h = theano.function(inputs=[x, h0, c0], outputs=[h1])

        # Batch of 2, state dim 3, gate-stacked input dim 12 (= 4 * 3).
        h0_val = 0.1 * numpy.array([[1, 1, 0], [0, 1, 1]],
                                   dtype=theano.config.floatX)
        c0_val = 0.1 * numpy.array([[1, 1, 0], [0, 1, 1]],
                                   dtype=theano.config.floatX)
        x_val = 0.1 * numpy.array([range(12), range(12, 24)],
                                  dtype=theano.config.floatX)
        W_state_val = 2 * numpy.ones((3, 12), dtype=theano.config.floatX)
        # Peephole weight vectors (diagonal connections cell -> gates).
        W_cell_to_in = 2 * numpy.ones((3,), dtype=theano.config.floatX)
        W_cell_to_out = 2 * numpy.ones((3,), dtype=theano.config.floatX)
        W_cell_to_forget = 2 * numpy.ones((3,), dtype=theano.config.floatX)

        # omitting biases because they are zero
        activation = numpy.dot(h0_val, W_state_val) + x_val

        def sigmoid(x):
            return 1. / (1. + numpy.exp(-x))

        # Gate order in the stacked activation: input, forget, cell, output.
        i_t = sigmoid(activation[:, :3] + c0_val * W_cell_to_in)
        f_t = sigmoid(activation[:, 3:6] + c0_val * W_cell_to_forget)
        next_cells = f_t * c0_val + i_t * numpy.tanh(activation[:, 6:9])
        o_t = sigmoid(activation[:, 9:12] + next_cells * W_cell_to_out)
        h1_val = o_t * numpy.tanh(next_cells)
        assert_allclose(h1_val, next_h(x_val, h0_val, c0_val)[0],
                        rtol=1e-6)

    def test_many_steps(self):
        """24 iterated steps with a mask match the numpy reference, and
        the initial state/cells are exposed as parameters."""
        x = tensor.tensor3('x')
        mask = tensor.matrix('mask')
        h, c = self.lstm.apply(x, mask=mask, iterate=True)
        calc_h = theano.function(inputs=[x, mask], outputs=[h])

        # 24 time steps, batch of 4, input dim 12; distinct rows via
        # permutations so steps are not symmetric.
        x_val = (0.1 * numpy.asarray(
            list(itertools.islice(itertools.permutations(range(12)), 0, 24)),
            dtype=theano.config.floatX))
        x_val = numpy.ones((24, 4, 12),
                           dtype=theano.config.floatX) * x_val[:, None, :]
        # Last batch element is masked out for the second half of the
        # sequence: its state must freeze there.
        mask_val = numpy.ones((24, 4), dtype=theano.config.floatX)
        mask_val[12:24, 3] = 0
        # Index 0 holds the (zero) initial state; results start at 1.
        h_val = numpy.zeros((25, 4, 3), dtype=theano.config.floatX)
        c_val = numpy.zeros((25, 4, 3), dtype=theano.config.floatX)
        W_state_val = 2 * numpy.ones((3, 12), dtype=theano.config.floatX)
        W_cell_to_in = 2 * numpy.ones((3,), dtype=theano.config.floatX)
        W_cell_to_out = 2 * numpy.ones((3,), dtype=theano.config.floatX)
        W_cell_to_forget = 2 * numpy.ones((3,), dtype=theano.config.floatX)

        def sigmoid(x):
            return 1. / (1. + numpy.exp(-x))

        for i in range(1, 25):
            activation = numpy.dot(h_val[i-1], W_state_val) + x_val[i-1]
            i_t = sigmoid(activation[:, :3] + c_val[i-1] * W_cell_to_in)
            f_t = sigmoid(activation[:, 3:6] + c_val[i-1] * W_cell_to_forget)
            c_val[i] = f_t * c_val[i-1] + i_t * numpy.tanh(activation[:, 6:9])
            o_t = sigmoid(activation[:, 9:12] + c_val[i] * W_cell_to_out)
            h_val[i] = o_t * numpy.tanh(c_val[i])
            # Masked positions carry the previous state forward unchanged.
            h_val[i] = (mask_val[i - 1, :, None] * h_val[i] +
                        (1 - mask_val[i - 1, :, None]) * h_val[i - 1])
            c_val[i] = (mask_val[i - 1, :, None] * c_val[i] +
                        (1 - mask_val[i - 1, :, None]) * c_val[i - 1])

        h_val = h_val[1:]
        assert_allclose(h_val, calc_h(x_val, mask_val)[0], rtol=1e-04)

        # Also test that initial state is a parameter
        initial1, initial2 = VariableFilter(roles=[INITIAL_STATE])(
            ComputationGraph(h))
        assert is_shared_variable(initial1)
        assert is_shared_variable(initial2)
        assert {initial1.name, initial2.name} == {
            'initial_state', 'initial_cells'}
def main():
    """Train an LSTM sentiment classifier on the IMDB dataset.

    Builds a LookupTable -> LSTM -> Linear -> Sigmoid graph, trains with
    clipped Adam, and monitors cost/misclassification on the test set
    after every epoch.

    Fixes over the previous revision:
    - ``rnn.apply`` returns the pair ``[states, cells]``; indexing that
      pair with ``[-1]`` selected the *cell* sequence, not the hidden
      state at the last timeframe. The outputs are now unpacked and the
      last hidden state is used, matching the convention used elsewhere
      in this file (``h, c = lstm.apply(...); h_to_o.apply(h[-1])``).
    - The test ``ShuffledScheme`` was sized with the *train* set's
      example count; it is now sized by the test set.
    """
    x = T.imatrix('features')
    m = T.matrix('features_mask')
    y = T.imatrix('targets')
    # Time-major layout for the recurrent brick.
    x_int = x.T

    train_dataset = IMDB('train')
    n_voc = len(train_dataset.dict.keys())
    n_h = 2

    # dim = 4 * n_h so the embeddings feed the LSTM's four gates directly.
    # +2 leaves room for padding/OOV indices beyond the dictionary.
    lookup = LookupTable(length=n_voc + 2, dim=n_h * 4,
                         weights_init=Uniform(std=0.01),
                         biases_init=Constant(0.))
    lookup.initialize()

    rnn = LSTM(dim=n_h, activation=Tanh(),
               weights_init=Uniform(std=0.01),
               biases_init=Constant(0.))
    rnn.initialize()

    score_layer = Linear(input_dim=n_h, output_dim=1,
                         weights_init=Uniform(std=0.01),
                         biases_init=Constant(0.))
    score_layer.initialize()

    # Zero out embeddings at padded positions.
    embedding = lookup.apply(x_int) * T.shape_padright(m.T)

    # Unpack [states, cells]; classify from the last timeframe's hidden
    # state (bug fix — see docstring).
    rnn_states, rnn_cells = rnn.apply(embedding, mask=m.T)
    rnn_out_mean_pooled = rnn_states[-1]

    probs = Sigmoid().apply(score_layer.apply(rnn_out_mean_pooled))

    # Binary cross-entropy written out explicitly.
    cost = -(y * T.log(probs) + (1 - y) * T.log(1 - probs)).mean()
    cost.name = 'cost'
    misclassification = (y * (probs < 0.5) + (1 - y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'

    cg = ComputationGraph([cost])
    params = cg.parameters
    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            StepClipping(threshold=10),
            Adam(),
        ]))

    test_dataset = IMDB('test')
    batch_size = 64
    n_train = train_dataset.num_examples
    train_stream = DataStream(dataset=train_dataset,
                              iteration_scheme=ShuffledScheme(
                                  examples=n_train,
                                  batch_size=batch_size))
    train_padded = Padding(data_stream=train_stream,
                           mask_sources=('features', ))
    # Size the test scheme by the TEST set (bug fix — see docstring).
    test_stream = DataStream(dataset=test_dataset,
                             iteration_scheme=ShuffledScheme(
                                 examples=test_dataset.num_examples,
                                 batch_size=batch_size))
    test_padded = Padding(data_stream=test_stream,
                          mask_sources=('features', ))

    model = Model(cost)
    extensions = []
    extensions.append(
        EpochProgress(
            batch_per_epoch=train_dataset.num_examples // batch_size + 1))
    extensions.append(
        TrainingDataMonitoring([cost, misclassification],
                               prefix='train',
                               after_epoch=True))
    extensions.append(
        DataStreamMonitoring([cost, misclassification],
                             data_stream=test_padded,
                             prefix='test',
                             after_epoch=True))
    extensions.append(Timing())
    extensions.append(Printing())

    main_loop = MainLoop(model=model,
                         data_stream=train_padded,
                         algorithm=algorithm,
                         extensions=extensions)
    main_loop.run()