def __init__(self, vector_size, voc_size, head_type, head_num, controller_type, controller_sizes, memory_size,shift_width = 3, activation = T.tanh): self.controller = controller_type(controller_sizes) embedding = tools.initial_weights(voc_size[0]+1, vector_size) self.embedding = theano.shared(value = embedding, name = 'embedding', borrow=True) input_w = tools.initial_weights(vector_size, controller_sizes[0]) self.input_w = theano.shared(value = input_w, name = 'input_w', borrow=True) input_b = 0.*tools.initial_weights(controller_sizes[0]) self.input_b = theano.shared(value = input_b, name = 'input_b', borrow=True) read_w = tools.initial_weights(memory_size[1], controller_sizes[0]) self.read_w = theano.shared(value = read_w, name = 'read_w', borrow=True) output_w = tools.initial_weights(controller_sizes[-1], voc_size[1]) self.output_w = theano.shared(value=output_w,name='Controller_outw', borrow=True) output_b = 0.*tools.initial_weights(voc_size[1]) self.output_b = theano.shared(value=output_b,name='Controller_outb', borrow=True) memory_init_p = 2*(numpy.random.rand(memory_size[0],memory_size[1])-0.5) weight_init_p = numpy.random.randn((memory_size[0])) self.memory_init = theano.shared(value = memory_init_p, name = 'memory_init', borrow=True) self.weight_init = theano.shared(value = weight_init_p, name = 'weight_init', borrow=True) self.params = self.controller.params+[self.embedding, self.input_w, self.read_w, self.input_b, self.weight_init,self.memory_init,self.output_w, self.output_b] memory_init = self.memory_init weight_init = tools.vector_softmax(self.weight_init) self.heads = [] for i in xrange(head_num): if head_type == Head_neural: self.heads.append(head_type(controller_sizes[-1], memory_size,i)) else: self.heads.append(head_type(controller_sizes[-1], memory_size, shift_width,i)) self.params += self.heads[i].params print self.params def pred_t(input_voc_t, weight_tm1, memory_tm1): rawinput_t = self.embedding[input_voc_t] input_t = T.dot(rawinput_t,self.input_w) read_m = T.dot(weight_tm1, memory_tm1) read_t = T.dot(read_m,self.read_w) controller_input = activation(input_t+read_t+self.input_b) hid = self.controller.getY(controller_input) output = T.nnet.softmax(T.dot(hid, self.output_w)+self.output_b) result = T.switch(T.eq(input_voc_t, 0),T.argmax(output,axis=1), theano.shared(0)) #test = controller_input memory_inter = memory_tm1 weight_inter = weight_tm1 for head in self.heads: weight_inter, erase, add= head.emit_new_weight(hid, weight_inter, memory_inter) #write to memory weight_tdim = weight_inter.dimshuffle((0, 'x')) erase_dim = erase.dimshuffle(('x', 0)) add_dim = add.dimshuffle(('x', 0)) M_erased = memory_inter*(1-(weight_tdim*erase_dim)) memory_inter = M_erased+(weight_tdim*add_dim) #testing = weight_tm1 #testing2 = rawinput_t memory_t = memory_inter weight_t = weight_inter return weight_t, memory_t, output,result input = T.lvector() output = T.lvector() pred, _ = theano.scan(fn = pred_t, sequences = [input], outputs_info = [weight_init, memory_init, None,None]) p_output = -T.log(pred[-2])[output.shape[0]-1:] #output = output.reshape(output.shape[0],1) def cost_step(po, o,cost_tm1): cost = cost_tm1+po[0][o] return cost cost0 = theano.shared(0.) costs,_ = theano.scan(fn = cost_step, sequences = [p_output, output], outputs_info = [cost0] ) l2 = T.sum(0) for param_i in self.params: l2 = l2+(param_i**2).sum() costs += 1e-4*l2 grads = T.grad(costs[-1], self.params) grads_clip = [T.clip(grad,-100,100) for grad in grads] updates = tools.adadelta(self.params, grads_clip, 0.95, 1e-6) self.predict = theano.function(inputs = [input], outputs =[pred[-1]]) self.train = theano.function(inputs= [input, output], outputs = costs[-1], updates = updates) self.test = theano.function(inputs= [input, output], outputs = costs[-1]) self.getweight = theano.function(inputs = [input], outputs = [pred[0]])
def __init__(self, vector_size, head_type,head_num, controller_type, controller_sizes, memory_size, shift_width = 3, activation = T.tanh): self.lr = 0.01 self.controller = controller_type(controller_sizes) input_w = tools.initial_weights(vector_size, controller_sizes[0]) self.input_w = theano.shared(value = input_w, name = 'input_w', borrow=True) input_b = 0.*tools.initial_weights(controller_sizes[0]) self.input_b = theano.shared(value = input_b, name = 'input_b', borrow=True) read_w = tools.initial_weights(memory_size[1], controller_sizes[0]) self.read_w = theano.shared(value = read_w, name = 'read_w', borrow=True) output_w = tools.initial_weights(controller_sizes[-1], vector_size) self.output_w = theano.shared(value=output_w,name='Controller_outw', borrow=True) output_b = 0.*tools.initial_weights(controller_sizes[-1]) self.output_b = theano.shared(value=output_b,name='Controller_outb', borrow=True) memory_init_p = 2*(numpy.random.rand(memory_size[0],memory_size[1])-0.5) weight_init_p = numpy.random.randn((memory_size[0])) self.memory_init = theano.shared(value = memory_init_p, name = 'memory_init', borrow=True) self.weight_init = theano.shared(value = weight_init_p, name = 'weight_init', borrow=True) self.params = self.controller.params+[self.input_w, self.read_w, self.input_b, self.weight_init,self.memory_init, self.output_w, self.output_b] self.heads = [] for i in xrange(head_num): if head_type == Head_neural: self.heads.append(head_type(controller_sizes[-1], memory_size,i)) else: self.heads.append(head_type(controller_sizes[-1], memory_size, shift_width,i)) self.params += self.heads[i].params #memory_init = tools.initial_weights(memory_size) memory_init = self.memory_init #weight_init_s = T.nnet.sigmoid(self.weight_init) weight_init = tools.vector_softmax(self.weight_init) print self.params #def weighting(weight, value): # return weight*value def pred_t(rawinput_t, weight_tm1, memory_tm1): #memory_tm1 = self #predict the current output input_t = T.dot(rawinput_t,self.input_w) read_m = T.dot(weight_tm1, memory_tm1) read_t = T.dot(read_m,self.read_w) controller_input = activation(input_t+read_t+self.input_b) #zero_vec = theano.shared(value=numpy.zeros((vector_size,))) #mask = T.nonzero(T.eq(rawinput_t,0)) hid = self.controller.getY(controller_input) output = T.nnet.sigmoid(T.dot(hid, self.output_w)+self.output_b) #result = T.switch(T.eq(zero_vec,rawinput_t),output,theano.shared(0)) result = output #testing = T.switch(T.eq(zero_vec,rawinput_t),theano.shared(1),theano.shared(0)) #result = theano.shared(value=numpy.zeros((vector_size,))) #result = read_m #emit the weights memory_inter = memory_tm1 weight_inter = weight_tm1 for head in self.heads: weight_inter, erase, add= head.emit_new_weight(hid, weight_inter, memory_inter) #write to memory weight_tdim = weight_inter.dimshuffle((0, 'x')) erase_dim = erase.dimshuffle(('x', 0)) add_dim = add.dimshuffle(('x', 0)) M_erased = memory_inter*(1-(weight_tdim*erase_dim)) memory_inter = M_erased+(weight_tdim*add_dim) #testing = weight_tm1 #testing2 = rawinput_t memory_t = memory_inter weight_t = weight_inter return weight_t, memory_t, result input = T.matrix() output = T.matrix() seqlength = input.shape[0]/2 #tmp = T.dvector() #testinfo = self.controller.getY(input[1]) #testinfo = input.shape pred, _ = theano.scan(fn = pred_t, sequences = [input], outputs_info = [weight_init, memory_init,None ]) entropy = T.sum(T.nnet.binary_crossentropy(5e-6+(1-1e-5)*pred[-1][seqlength+1:], output[seqlength+1:]),axis = 1) #costs = (pred[-1]-output) ** 2 #cost_sq = T.sum(costs) l2 = T.sum(0) for param_i in self.params: l2 = l2+(param_i**2).sum() #norm = l2 cost = T.sum(entropy) +1e-3*l2 grads = [T.grad(cost, param_i) for param_i in self.params] grads_clip = [T.clip(grad,-10,10) for grad in grads] #params_up = [param_i for param_i, grad_i in zip(self.params, grads_clip)] #new_value = [param_i-self.lr*grad_i for param_i, grad_i in zip(self.params, grads_clip)] #SGD #updates = [(param_i, param_i-self.lr*grad_i) for param_i, grad_i in zip(self.params, grads_clip)] #updates = zip(params_up, new_value) #adadelta updates = tools.adadelta(self.params, grads_clip, 0.95, 1e-6) #updates = tools.adadelta_another(self.params,grads_clip) self.test = theano.function(inputs = [], outputs = l2) self.predict = theano.function(inputs = [input], outputs = pred) self.grads = theano.function(inputs = [input, output], outputs = grads) #self.train = theano.function(inputs = [input,output], outputs = [input,output, costs, cost, pred[0],pred[2],pred[-1],grads[5],grads_clip[5], grads[6]], updates = updates) self.train = theano.function(inputs = [input,output], outputs = cost, updates = updates)#,mode=theano.compile.MonitorMode(post_func=tools.detect_nan))