# NOTE: the names nerdir, nerarray, senna_embmtxfile, trainingfile, logger,
# hardtanh, SennaNER and generate_output are assumed to be defined elsewhere
# in this package; only the standard imports this file itself needs are
# listed here.
import os
import time
import pickle
import logging

import numpy as np
import theano
import theano.tensor as T
import matplotlib.pyplot as plt
class Trainer():

    def __init__(self, batch_size=16, seed=1234, nhu=300, width=5,
                 n_out=len(nerarray), activation_f="hardtanh",
                 embeddingfile=senna_embmtxfile, trainingfile=trainingfile,
                 paramfile=None):
        # Create a fresh, consecutively numbered model directory and route
        # all logging for this run into a log file inside it.
        modeldir = os.path.join(nerdir, "models",
                                'model_%i' % len(os.listdir(os.path.join(nerdir, "models"))))
        os.mkdir(modeldir)
        for handler in logging.root.handlers[:]:
            logging.root.removeHandler(handler)
        logging.basicConfig(filename=os.path.join(modeldir, 'log.txt'),
                            level=logging.INFO,
                            format='%(asctime)s : %(levelname)s : %(message)s')
        logger.info("\n" + "\n".join(["\t%s : " % key + str(val)
                                      for key, val in locals().iteritems()
                                      if key != "self"]))
        self.modeldir = modeldir
        self.batch_size = batch_size

        activation = None
        if activation_f == "hardtanh":
            activation = hardtanh
        elif activation_f == "tanh":
            activation = T.tanh

        self.load_data(embeddingfile, trainingfile, batch_size)

        #======================================================================
        # BUILD MODEL
        #======================================================================
        logger.info('... building the model')

        # allocate symbolic variables for the data
        self.index = T.iscalar()   # index to a [mini]batch
        self.x = T.itensor3('x')   # the data is presented as a 3D tensor of integer indices
        self.y = T.ivector('y')    # the labels are presented as a 1D vector of [int] labels
        self.permutation = T.ivector('permutation')

        if paramfile is not None:
            with open(paramfile, "rb") as f:
                params = pickle.load(f)
        else:
            params = None

        # Bug fix: the caps features are created in load_data as an instance
        # attribute, so they must be referenced as self.capsfeatures here.
        self.model = SennaNER(input=self.x, embeddings=self.embeddings,
                              features=self.capsfeatures, n_out=n_out,
                              mini_batch_size=batch_size, nhu=nhu, width=width,
                              activation=activation, seed=seed, params=params)

        self.test_model = theano.function(
            inputs=[self.index],
            outputs=self.model.errors(self.y),
            givens={
                self.x: self.test_set_x[self.index * batch_size:(self.index + 1) * batch_size],
                self.y: self.test_set_y[self.index * batch_size:(self.index + 1) * batch_size]},
            name="test_model")

        self.validation_cost = theano.function(
            inputs=[self.index],
            outputs=self.model.negative_log_likelihood(self.y),
            givens={
                self.x: self.valid_set_x[self.index * batch_size:(self.index + 1) * batch_size],
                self.y: self.valid_set_y[self.index * batch_size:(self.index + 1) * batch_size]},
            name="validation_cost")

        self.predictions = theano.function(
            inputs=[self.index],
            outputs=self.model.predictions,
            givens={
                self.x: self.test_set_x[self.index * batch_size:(self.index + 1) * batch_size]},
            name="predictions")

        self.visualize_hidden = theano.function(
            inputs=[self.index],
            outputs=self.model.HiddenLayer.output,
            givens={
                self.x: self.valid_set_x[self.index * batch_size:(self.index + 1) * batch_size]},
            name="visualize_hidden")
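    # NOTE (added): the expected data layout below is inferred from how the
    # tensors are used in this file, not from an official spec. `dataset` is
    # assumed to unpack into (train, valid, test) splits, each an (X, y)
    # pair: X an int32 3D array of lookup-table indices sliced along its
    # first (example) axis, y a vector of integer NER tag ids.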
    def load_data(self, embeddingsfile, dataset, batch_size):
        logger.info('... loading data')
        self.embeddings = np.load(embeddingsfile)
        rng = np.random.RandomState(1234)
        # Small lookup table for the capitalisation feature
        # (4 capitalisation patterns x 5 dimensions).
        self.capsfeatures = np.asarray(rng.uniform(
            low=-np.sqrt(1. / 1),
            high=np.sqrt(1. / 1),
            # alternative initialisation: +/- np.sqrt(3. / 1)
            size=(4, 5)), dtype=theano.config.floatX)

        train_set, valid_set, test_set = np.load(dataset)

        self.n_train_batches = train_set[0].shape[0] // batch_size
        self.n_valid_batches = valid_set[0].shape[0] // batch_size
        self.n_test_batches = test_set[0].shape[0] // batch_size
        self.train_set_size = train_set[0].shape[0]

        def shared_dataset(data_xy, borrow=True):
            # Keep the data in Theano shared variables so that minibatches
            # can be sliced on the device without repeated host copies.
            data_x, data_y = data_xy
            shared_x = theano.shared(np.asarray(data_x, dtype='int32'),
                                     borrow=borrow)
            shared_y = theano.shared(np.asarray(data_y, dtype='int32'),
                                     borrow=borrow)
            return shared_x, shared_y

        self.test_set_x, self.test_set_y = shared_dataset(test_set)
        self.valid_set_x, self.valid_set_y = shared_dataset(valid_set)
        self.train_set_x, self.train_set_y = shared_dataset(train_set)

    def train_model(self, lr_scheme, initial_learning_rate=0.01,
                    min_lr=0.00001, learning_rate_decay=0.05,
                    constant_steps=None, L1_reg=0.0000, L2_reg=0.0000,
                    lr_global=False, n_epochs=100, momentum_term=0.9):
        logger.info("\n" + "\n".join(
            ["\t%s : " % key + str(locals()[key]) for key in
             ["lr_scheme", "lr_global", "min_lr", "initial_learning_rate",
              "learning_rate_decay", "L1_reg", "L2_reg", "n_epochs"]]))

        cost = self.model.negative_log_likelihood(self.y) \
            + L2_reg * self.model.L2
        #   + L1_reg * self.model.L1   (L1 penalty currently disabled)

        self.learning_rate = theano.shared(np.float32(initial_learning_rate))
        if constant_steps is None:
            self.constant_steps = np.inf
        else:
            self.constant_steps = constant_steps
        self.lr_scheme = lr_scheme

        def gen_updates_sgd():
            # Per-parameter SGD: every non-embedding parameter is updated
            # with the global learning rate scaled down by its fan-in.
            gparams = [theano.grad(cost, param) for param in self.model.params]
            updates = []
            for param_i, grad_i, n_in in zip(self.model.params, gparams,
                                             self.model.n_ins):
                if "embeddings" not in str(param_i):
                    updates.append((param_i,
                                    param_i - self.learning_rate / n_in * grad_i))
                else:
                    updates.append((param_i,
                                    param_i - self.learning_rate * grad_i))
            return updates

        def gen_updates_sgd_global():
            # Plain SGD with a single global learning rate for all parameters.
            gparams = [theano.grad(cost, param) for param in self.model.params]
            updates = []
            for param_i, grad_i in zip(self.model.params, gparams):
                updates.append((param_i,
                                param_i - self.learning_rate * grad_i))
            return updates
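        # Illustrative note (added): with learning_rate = 0.01 and a layer
        # whose fan-in is, say, n_in = 250 (a hypothetical width * embedding
        # dimension), gen_updates_sgd steps that layer's weights with an
        # effective rate of 0.01 / 250 = 4e-5, while the embedding tables
        # keep the full 0.01. This mirrors the fan-in scaling of the learning
        # rate described for SENNA-style training.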
        # def gen_updates_regular_momentum(loss, all_parameters, learning_rate,
        #                                  momentum, weight_decay):
        #     all_grads = [theano.grad(loss, param) for param in all_parameters]
        #     updates = []
        #     for param_i, grad_i in zip(all_parameters, all_grads):
        #         mparam_i = theano.shared(param_i.get_value() * 0.)
        #         v = momentum * mparam_i - weight_decay * learning_rate * param_i \
        #             - learning_rate * grad_i
        #         updates.append((mparam_i, v))
        #         updates.append((param_i, param_i + v))
        #     return updates
        #
        # def gen_updates_own_momentum():
        #     # averaged gradients
        #     agparams = [theano.shared(value=np.zeros(p.get_value().shape,
        #                                              dtype=theano.config.floatX),
        #                               name='ag_' + p.name)
        #                 for p in self.model.params]
        #     gparams = []  # gradients
        #     for pid, param in enumerate(self.model.params):
        #         gparam = T.grad(cost, param)
        #         gparams.append(gparam)
        #     updates = []
        #     for param, gparam, agparam, n_in in zip(self.model.params, gparams,
        #                                             agparams, self.model.n_ins):
        #         updates.append((agparam, np.float32(1 - momentum_term) * agparam
        #                         + np.float32(momentum_term) * gparam))
        #         if lr_global:
        #             updates.append((param, param - self.learning_rate / n_in *
        #                             (np.float32(1 - momentum_term) * agparam
        #                              + np.float32(momentum_term) * gparam)))
        #         else:
        #             updates.append((param, param - self.learning_rate *
        #                             (np.float32(1 - momentum_term) * agparam
        #                              + np.float32(momentum_term) * gparam)))
        #     return updates

        if lr_global:
            updates = gen_updates_sgd_global()
        else:
            updates = gen_updates_sgd()

        train_model = theano.function(
            inputs=[self.index, self.permutation],
            outputs=theano.Out(cost, borrow=True),
            updates=updates,
            givens={
                self.x: self.train_set_x[self.permutation[
                    self.index * self.batch_size:(self.index + 1) * self.batch_size]],
                self.y: self.train_set_y[self.permutation[
                    self.index * self.batch_size:(self.index + 1) * self.batch_size]]},
            name="train_model")

        #======================================================================
        # TRAIN MODEL
        #======================================================================
        theano.printing.pydotprint(train_model)
        logger.info('... training')

        min_valid_cost = np.inf
        best_epoch = 0
        test_score = 0.
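        # Added summary of the loop below: each epoch first records the mean
        # costs of the previous epoch, then checkpoints and visualises every
        # 25 epochs, trains on a fresh random permutation of the training
        # set, and finally adjusts the learning rate according to lr_scheme
        # ("constant", "stepwise" or "continuous").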
        start_time = time.clock()

        epoch = 0
        self.trainingscosts = []
        self.validationcosts = []
        training_costs = [10]
        while epoch <= n_epochs:
            self.trainingscosts.append(np.mean(training_costs))
            validation_costs = [self.validation_cost(i)
                                for i in xrange(self.n_valid_batches)]
            self.validationcosts.append(np.mean(validation_costs))
            self.monitor_update()
            if self.validationcosts[-1] < min_valid_cost:
                min_valid_cost = self.validationcosts[-1]
                best_epoch = epoch
                # Bug fix: keep the returned test error so the final summary
                # reports the performance of the best model.
                test_score = self.test_error(epoch)
            if epoch % 25 == 0:
                with open(os.path.join(self.modeldir, 'model%i.pck' % epoch), 'wb') as f:
                    pickle.dump(self.model, f, protocol=pickle.HIGHEST_PROTOCOL)
                # Visualise hidden-unit saturation on 30 random validation
                # batches: pixels are white where the activation is within
                # 0.001 of the hardtanh saturation points +/-1.
                hidden_values = [self.visualize_hidden(i)
                                 for i in np.random.randint(0, self.n_valid_batches, 30)]
                image = np.vstack(hidden_values)
                binary_image = (image > 0.999) | (image < -0.999)
                plt.imshow(binary_image, cmap=plt.cm.get_cmap('gray'),
                           interpolation='nearest')
                plt.savefig(os.path.join(self.modeldir, 'binary_hidden%i.png' % epoch))
                plt.clf()
                test_predictions = [self.predictions(i)
                                    for i in xrange(self.n_test_batches)]
                np.save(os.path.join(self.modeldir, "predictions.npy"),
                        test_predictions)
                generate_output(self.modeldir, modelnumber=epoch,
                                predictions=np.array(test_predictions))

            training_costs = []
            perm = np.random.permutation(self.train_set_size).astype(np.int32)
            for minibatch_index in xrange(self.n_train_batches):
                training_costs.append(train_model(minibatch_index, perm))

            if epoch > 0:
                if self.lr_scheme != "constant":
                    if self.lr_scheme == "continuous" and epoch > self.constant_steps:
                        # Hyperbolic decay once the constant phase is over.
                        self.learning_rate.set_value(np.float32(
                            initial_learning_rate
                            * (1 + learning_rate_decay * self.constant_steps)
                            / (1 + learning_rate_decay * max(epoch, self.constant_steps))))
                    elif ((self.validationcosts[-1] - self.validationcosts[-2]) > 0
                          and (self.validationcosts[-1] - np.min(self.validationcosts)) > 0.01
                          and np.argmin(self.validationcosts) != (len(self.validationcosts) - 2)) \
                         or ((self.trainingscosts[-1] - self.trainingscosts[-2]) > 0
                             and np.argmin(self.trainingscosts) != (len(self.trainingscosts) - 2)):
                        # Validation or training cost went back up: tighten the rate.
                        if self.lr_scheme == "stepwise":
                            self.learning_rate.set_value(
                                np.float32(self.learning_rate.get_value() / 3))
                        elif self.lr_scheme == "continuous":
                            self.constant_steps = epoch - 1
                            self.learning_rate.set_value(np.float32(
                                initial_learning_rate
                                * (1 + learning_rate_decay * self.constant_steps)
                                / (1 + learning_rate_decay * max(epoch, self.constant_steps))))
                if self.learning_rate.get_value() < min_lr:
                    self.learning_rate.set_value(np.float32(min_lr))
                    # Bug fix: this was `self.lr_scheme == "constant"`, a no-op
                    # comparison; freeze the schedule once the floor rate is hit.
                    self.lr_scheme = "constant"
            epoch = epoch + 1
        end_time = time.clock()
        logger.info(('Optimization complete. Best validation cost of %f '
                     'obtained at epoch %i, with test error %f %%') %
                    (min_valid_cost, best_epoch, test_score * 100.))
        logger.info('The code for file ' + os.path.split(__file__)[1] +
                    ' ran for %.2fm' % ((end_time - start_time) / 60.))
        self.monitor_update()
        test_predictions = [self.predictions(i)
                            for i in xrange(self.n_test_batches)]
        generate_output(self.modeldir, predictions=np.array(test_predictions))
        # np.save(os.path.join(self.modeldir, "predictions.npy"), test_predictions)

    def monitor_update(self):
        # Plot the cost curves so far and log the current epoch's numbers.
        plt.plot(self.trainingscosts, label='training')
        plt.plot(self.validationcosts, label='validation')
        plt.legend()
        plt.savefig(os.path.join(self.modeldir, 'costs.png'))
        plt.clf()
        logger.info('epoch %i, validation cost %f, training cost %f, lr %f' %
                    (len(self.trainingscosts) - 1, self.validationcosts[-1],
                     self.trainingscosts[-1], self.learning_rate.get_value()))

    def test_error(self, epoch):
        test_losses = [self.test_model(i) for i in xrange(self.n_test_batches)]
        test_score = np.mean(test_losses)
        logger.info(' epoch %i, test error of best model %f %%' %
                    (epoch, test_score * 100.))
        # Bug fix: return the score so train_model can track the test
        # performance of the best model.
        return test_score
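# A minimal usage sketch (added; not part of the original training script).
# It assumes the module-level defaults (senna_embmtxfile, trainingfile, an
# existing nerdir/models directory) are in place; the hyperparameter values
# below are illustrative only.
if __name__ == "__main__":
    trainer = Trainer(batch_size=16, nhu=300, width=5, activation_f="hardtanh")
    # "stepwise" divides the learning rate by 3 whenever the validation or
    # training cost stops improving; see train_model above for the details.
    trainer.train_model(lr_scheme="stepwise", initial_learning_rate=0.01,
                        n_epochs=100)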