def minibatches(data, batch_size):
    print('************************', data[0])  # debug: inspect the first example
    x = np.array([d[0] for d in data])
    y = np.array([d[2] for d in data])
    one_hot = np.zeros((y.size, 3))
    one_hot[np.arange(y.size), y] = 1
    return get_minibatches([x, one_hot], batch_size)
def run_epoch(self, sess, config, dataset, train_writer, merged):
    prog = Progbar(target=1 + len(dataset.train_inputs[0]) / config.batch_size)
    for i, (train_x, train_y) in enumerate(
            get_minibatches([dataset.train_inputs, dataset.train_targets],
                            config.batch_size, is_multi_feature_input=True)):
        summary, loss = self.train_on_batch(sess, train_x, train_y, merged)
        prog.update(i + 1, [("train loss", loss)])
        # train_writer.add_summary(summary, global_step=i)
    return summary, loss  # Last batch
def minibatches(data, batch_size):
    '''
    :param data: [([feature vector of length n_feature], [0/1 mask, 1 marks an action that may be taken], true action), ..]
    :param batch_size: ..
    :return: minibatches with the true action one-hot encoded as the label
    '''
    x = np.array([d[0] for d in data])
    y = np.array([d[2] for d in data])
    one_hot = np.zeros((y.size, len(data[0][1])))
    one_hot[np.arange(y.size), y] = 1
    return get_minibatches([x, one_hot], batch_size)
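# A minimal usage sketch of the one-hot conversion above. The two-example
# dataset is hypothetical and only illustrates the (features, mask, action)
# layout described in the docstring.
import numpy as np

example_data = [
    ([0.1, 0.2], [1, 0, 1], 0),  # (features, legal-action mask, true action)
    ([0.3, 0.4], [1, 1, 0], 2),
]
y = np.array([d[2] for d in example_data])
one_hot = np.zeros((y.size, len(example_data[0][1])))
one_hot[np.arange(y.size), y] = 1
# one_hot is now [[1., 0., 0.], [0., 0., 1.]]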
                      weight_decay=l2)
RL_optimizer = optim.Adam(RL_model.parameters(), lr=args.lr_RL,
                          weight_decay=l2)
sentence_reward_noisy = [0 for i in range(args.batchsize)]
noisy_sentences_vec = Variable(torch.FloatTensor(1, dim).fill_(0))
for e in range(args.epochRL):
    print("training epoch ", e)
    # random.shuffle(train_data)
    # batchcnt = (len(train_data) - 1) // args.batchsize + 1
    # for b in range(batchcnt):
    #     # start = time.time()
    #     datas = train_data[b * args.batchsize: (b + 1) * args.batchsize]
    mini_batches = get_minibatches(dev_datasets, args.batchsize)
    batchcnt = len(dev_datasets[0]) // args.batchsize  # len(list(mini_batches))
    for b, data in enumerate(mini_batches):
        if b >= batchcnt:
            break
        sentences, pos_lambda, tags, sentences_words, relation_tags, relation_names = data
        input_tensor, input_length = padding_sequence(
            sentences, pad_token=args.embedding_size)
        pos_tensor, input_length = padding_sequence(pos_lambda, pad_token=0)
        target_tensor, target_length = padding_sequence(
            tags, pad_token=args.entity_tag_size)
        relation_target_tensor = padding_sequence_recurr(relation_tags)
        if torch.cuda.is_available():
            input_tensor = Variable(
def train(datasets, mode):  # optimizer, criterion, args,
    # JointModel.train()
    if args.use_RL:
        mini_batches = get_bags(datasets, relations, args.batchsize)
        noisy_sentences_vec = Variable(
            torch.FloatTensor(1, args.hidden_dim).fill_(0))
        noisy_vec_mean = torch.mean(noisy_sentences_vec, 0, True)
    else:
        mini_batches = get_minibatches(datasets, args.batchsize)
    batchcnt = len(datasets[0]) // args.batchsize  # len(list(mini_batches))
    logger.info("********************%s data*********************" % mode)
    logger.info("number of batches: %s" % batchcnt)
    NER_correct, NER_total = 0., 0.
    RE_correct, RE_total = 0., 0.
    if mode != 'train':
        # NER_target_all, NER_output_all = None, None
        # RE_target_all, RE_output_all = None, None
        NER_target_all2, NER_output_all2 = [], []
        RE_target_all2, RE_output_all2 = [], []
        NER_output_logits, RE_output_logits = [], []
    for b, data in enumerate(mini_batches):
        if b >= batchcnt:
            break
        sentences, pos_lambda, tags, sentences_words, relation_tags, relation_names = data
        input_tensor, input_length = padding_sequence(sentences, pad_token=0)
        pos_tensor, input_length = padding_sequence(pos_lambda, pad_token=0)
        target_tensor, target_length = padding_sequence(
            tags, pad_token=args.entity_tag_size)  # entity tags
        relation_target_tensor = relation_tags  # padding_sequence_recurr(relation_tags)  # relation tag
        if torch.cuda.is_available():
            input_tensor = Variable(
                torch.cuda.LongTensor(input_tensor, device=device)).cuda()
            target_tensor = Variable(
                torch.cuda.LongTensor(target_tensor, device=device)).cuda()
            if args.encoder_model == "BiLSTM":
                mask = torch.cuda.ByteTensor(
                    (1 - (target_tensor == args.entity_tag_size))).to(device)
            else:
                mask = torch.cuda.ByteTensor(
                    (1 - (input_tensor == 0))).to(device)
            pos_tensor = Variable(
                torch.cuda.FloatTensor(pos_tensor, device=device)).cuda()
            relation_target_tensor = Variable(
                torch.cuda.LongTensor(relation_target_tensor, device=device)).cuda()
        else:
            input_tensor = Variable(
                torch.LongTensor(input_tensor, device=device))
            target_tensor = Variable(
                torch.LongTensor(target_tensor, device=device))
            if args.encoder_model == "BiLSTM":
                mask = torch.ByteTensor(
                    (1 - (target_tensor == args.entity_tag_size))).to(device)
            else:
                mask = torch.ByteTensor((1 - (input_tensor == 0))).to(device)
            pos_tensor = Variable(torch.Tensor(pos_tensor, device=device))
            relation_target_tensor = Variable(
                torch.LongTensor(relation_target_tensor, device=device))
        if mode == 'train':
            optimizer.zero_grad()
            NER_active_logits, NER_active_labels, RE_output_tag, NER_output_tag, NER_output, BERT_pooled_output = JointModel(
                input_tensor, pos_tensor, target_tensor, args.batchsize,
                mask)  # , input_length, target_length
            if args.use_RL:
                mask_entity = [
                    list(map(lambda x: 1 if x in [1, 2, 4, 5] else 0, i))
                    for i in target_tensor
                ]
                if torch.cuda.is_available():
                    mask_entity = torch.cuda.ByteTensor(mask_entity).to(device)
                else:
                    mask_entity = torch.ByteTensor(mask_entity).to(device)
                NER_embedding = None
                for i in range(len(mask_entity)):
                    NER_embedding = torch.mean(NER_output[i][mask_entity[i]], 0).view(1, -1) if NER_embedding is None \
                        else torch.cat((NER_embedding, torch.mean(NER_output[i][mask_entity[i]], 0).view(1, -1)), 0)
                RE_rewards, loss_RL, noisy_sentences_vec, noisy_vec_mean = RL_model(
                    BERT_pooled_output, NER_embedding, JointModel.noysy_model,
                    RE_output_tag, relation_target_tensor,
                    noisy_sentences_vec, noisy_vec_mean)
            if not args.use_RL:
                loss_entity = criterion(NER_active_logits, NER_active_labels)
                loss_RE = criterion(RE_output_tag, relation_target_tensor)
                loss = loss_entity + loss_RE
                if args.merge_loss:
                    loss.backward()
                else:
                    loss_entity.backward(retain_graph=True)  # retain_graph=True
                    loss_RE.backward(retain_graph=True)
            if args.use_RL:
                loss = loss_RL
                loss_RL.backward()
            '''
            use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
            if use_teacher_forcing:
                # Teacher forcing: Feed the target as the next input
                for di in range(target_length):
                    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
                    loss += criterion(decoder_output, target_tensor[di])
                    decoder_input = target_tensor[di]  # Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                for di in range(target_length):
                    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
                    topv, topi = decoder_output.topk(1)
                    decoder_input = topi.squeeze().detach()  # detach from history as input
                    loss += criterion(decoder_output, target_tensor[di])
                    if decoder_input.item() == EOS_token:
                        break
            '''
            optimizer.step()
        else:
            NER_active_logits, NER_active_labels, RE_output_tag, NER_output_tag, _, _ = JointModel(
                input_tensor, pos_tensor, target_tensor, args.batchsize, mask,
                True)  # , input_length, target_length
        NER_correct += (torch.argmax(NER_active_logits, -1) == NER_active_labels).sum().item()
        NER_total += len(NER_active_logits)
        # temp = 0.
        # for i in range(len(relation_target_tensor[0])):
        #     target = torch.transpose(relation_target_tensor, 0, 1)[i]
        #     temp += (torch.argmax(RE_output_tag, -1) == target).sum().item()
        RE_correct += (torch.argmax(RE_output_tag, -1) == relation_target_tensor).sum().item()
        RE_total += len(RE_output_tag)
        if mode != 'train':
            NER_target_all2.append(target_tensor.cpu().tolist())  # target_tensor, NER_active_labels  .numpy()
            NER_output_all2.append(
                torch.argmax(NER_output_tag, -1).cpu().tolist())  # NER_output_tag, NER_active_logits
            NER_output_logits.append(NER_output_tag.detach().cpu().tolist())
            RE_output_all2.append(
                torch.argmax(RE_output_tag, -1).cpu().tolist())
            RE_target_all2.append(
                relation_target_tensor.detach().cpu().tolist())
            RE_output_logits.append(RE_output_tag.cpu().tolist())
            if b % args.print_batch == 0:
                logger.info(
                    'seq-seq model: (%d %.2f%%), NER acc: %.4f, RE acc: %.4f' %
                    (b, float(b) / batchcnt * 100, NER_correct / NER_total,
                     RE_correct / RE_total))
            '''if not args.do_train:
                if NER_target_all is None:
                    NER_target_all = NER_active_labels.to('cpu')
                    NER_output_all = NER_active_logits.to('cpu')
                else:
                    NER_target_all = torch.cat((NER_target_all.to('cpu'), NER_active_labels.to('cpu')), dim=0)
                    NER_output_all = torch.cat((NER_output_all.to('cpu'), NER_active_logits.to('cpu')), dim=0)
                if RE_target_all is None:
                    RE_target_all = relation_target_tensor.to('cpu')
                    RE_output_all = RE_output_tag.to('cpu')
                else:
                    RE_target_all = torch.cat((RE_target_all.to('cpu'), relation_target_tensor.to('cpu')), dim=0)
                    RE_output_all = torch.cat((RE_output_all.to('cpu'), RE_output_tag.to('cpu')), dim=0)'''
        if mode == 'train':
            out_losses.append(loss.item())
            if b % args.print_batch == 0:
                logger.info(
                    'seq-seq model: (%d %.2f%%), loss_NER: %.4f, loss_RE: %.4f, NER acc: %.4f, RE acc: %.4f' %
                    (b, float(b) / batchcnt * 100, loss_entity.item(),
                     loss_RE.item(), NER_correct / NER_total,
                     RE_correct / RE_total))
    if mode != 'train':
        cal_F_score(RE_output_all2, RE_target_all2, NER_target_all2,
                    NER_output_all2, args.batchsize)
        if args.do_train:
            if mode == 'test' or (mode == 'dev' and e == args.epochRL - 1):
                with open(args.output_dir + 'predict_%s_epoch_%s.json' % (mode, e),
                          "a+") as fw:
                    json.dump(
                        {
                            "RE_predict": RE_output_all2,
                            "RE_actual": RE_target_all2,
                            "RE_output_logits": RE_output_logits,
                            "NER_predict": NER_output_all2,
                            "NER_actual": NER_target_all2,
                            "NER_output_logits": NER_output_logits
                        }, fw)
        else:
            with open(args.output_dir + 'predict_%s.json' % mode, "a+") as fw:
                json.dump(
                    {
                        "RE_predict": RE_output_all2,
                        "RE_actual": RE_target_all2,
                        "RE_output_logits": RE_output_logits,
                        "NER_predict": NER_output_all2,
                        "NER_actual": NER_target_all2,
                        "NER_output_logits": NER_output_logits
                    }, fw)
        # np.save('pred_res/RE_predict', RE_output_all2)  # RE_output_all.to('cpu').detach().numpy()
        # np.save('pred_res/RE_actual', RE_target_all2)
        # np.save('pred_res/NER_predict', NER_output_all2)
        # np.save('pred_res/NER_actual', NER_target_all2)
        '''NER_pred_res = metrics.classification_report(NER_target_all2, NER_output_all2)
        logger.info('NER Prediction results: \n{}'.format(NER_pred_res))
        RE_pred_res = metrics.classification_report(RE_target_all2, RE_output_all2)
        logger.info('RE Prediction results: \n{}'.format(RE_pred_res))'''
    else:
        np.save(args.output_dir + "loss_train", out_losses)
def train(save_dir='saved_weights', parser_name='parser', num_epochs=5,
          max_iters=-1, print_every_iters=10):
    """
    Trains the model.

    parser_name is the string prefix used for the filename where the parser is
    saved after every epoch
    """
    # load dataset
    load_existing_dump = False
    print('Loading dataset for training')
    dataset = load_datasets(load_existing_dump)
    # HINT: Look in the ModelConfig class for the model's hyperparameters
    config = dataset.model_config

    print('Loading embeddings')
    word_embeddings, pos_embeddings, dep_embeddings = load_embeddings(config)
    # TODO: For Optional Task, add Twitter and Wikipedia embeddings (do this last)

    if False:  # Switch to True if you want to print examples of feature types
        print('words: ', len(dataset.word2idx))
        print('examples: ', [(k, v) for i, (k, v) in enumerate(dataset.word2idx.items()) if i < 30])
        print('\n')
        print('POS-tags: ', len(dataset.pos2idx))
        print(dataset.pos2idx)
        print('\n')
        print('dependencies: ', len(dataset.dep2idx))
        print(dataset.dep2idx)
        print('\n')
        print("some hyperparameters")
        print(vars(config))

    # load parser object (used for Task 2)
    parser = ParserModel(config, word_embeddings, pos_embeddings, dep_embeddings)
    # Uncomment the following parser for Task 3
    # parser = AnotherParserModel(config, word_embeddings, pos_embeddings, dep_embeddings)

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    parser.to(device)

    # set save_dir for model
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # create object for loss function
    loss_fn = F.cross_entropy

    # create object for an optimizer that updates the weights of our parser
    # model. Be sure to set the learning rate based on the parameters!
    optimizer = optim.Adam(parser.parameters(), lr=config.lr)

    for epoch in range(1, num_epochs + 1):

        ###### Training #####

        # load training set in minibatches
        for i, (train_x, train_y) in enumerate(
                get_minibatches([dataset.train_inputs, dataset.train_targets],
                                config.batch_size, is_multi_feature_input=True)):

            word_inputs_batch, pos_inputs_batch, dep_inputs_batch = train_x

            # Convert the numpy data to pytorch's tensor representation. They're
            # numpy objects initially. NOTE: In general, when using Pytorch,
            # you want to send them to the device that will do the computation
            # (either a GPU or CPU). You do this by saying "obj.to(device)"
            # where we've already created the device for you (see above where we
            # did this for the parser). This ensures your data is running on
            # the processor you expect it to!
            word_inputs_batch = torch.from_numpy(np.array(word_inputs_batch)).to(device)
            pos_inputs_batch = torch.from_numpy(np.array(pos_inputs_batch)).to(device)
            dep_inputs_batch = torch.from_numpy(np.array(dep_inputs_batch)).to(device)

            # Convert the labels from 1-hot vectors to a list of which index was
            # 1, which is what Pytorch expects. HINT: look for the "argmax"
            # function in numpy.
            labels = np.argmax(train_y, axis=1)

            # Convert the label to pytorch's tensor
            labels = torch.from_numpy(labels).to(device)

            # This is just a quick hack so you can cut training short to see how
            # things are working. In the final model, make sure to use all the data!
            if max_iters >= 0 and i > max_iters:
                break

            # Some debugging information for you
            if i == 0 and epoch == 1:
                print("size of word inputs: ", word_inputs_batch.size())
                print("size of pos inputs: ", pos_inputs_batch.size())
                print("size of dep inputs: ", dep_inputs_batch.size())
                print("size of labels: ", labels.size())

            # #### Backprop & Update weights ####

            # Before the backward pass, use the optimizer object to zero all of
            # the gradients for the variables
            optimizer.zero_grad()

            # For the current batch of inputs, run a full forward pass through the
            # data and get the outputs for each item's prediction.
            # These are the raw outputs, which represent the activations for
            # prediction over valid transitions.
            outputs = parser.forward(word_inputs_batch, pos_inputs_batch, dep_inputs_batch)

            # Compute the loss for the outputs with the labels. Note that for
            # your particular loss (cross-entropy) it will compute the softmax
            # for you, so you can safely pass in the raw activations.
            loss = loss_fn(outputs, labels)

            # Backward pass: compute gradient of the loss with respect to model parameters
            loss.backward()

            # Perform 1 update using the optimizer
            optimizer.step()

            # Every 10 batches, print out some reporting so we can see convergence
            if i % print_every_iters == 0:
                print('Epoch: %d [%d], loss: %1.3f, acc: %1.3f'
                      % (epoch, i, loss.item(),
                         int((outputs.argmax(1) == labels).sum()) / len(labels)))

        print("End of epoch")

        # save model
        save_file = os.path.join(save_dir, '%s-epoch-%d.mdl' % (parser_name, epoch))
        print('Saving current state of model to %s' % save_file)
        torch.save(parser, save_file)

        ###### Validation #####
        print('Evaluating on validation data after epoch %d' % epoch)

        # Once we're in test/validation time, we need to indicate that we are in
        # "evaluation" mode. This will turn off things like Dropout so that
        # we're not randomly zero-ing out weights when it might hurt performance
        parser.eval()

        # Compute the current model's UAS score on the validation (development)
        # dataset. Note that we can use this held-out data to tune the
        # hyper-parameters of the model but we should never look at the test
        # data until we want to report the very final result.
        compute_dependencies(parser, device, dataset.valid_data, dataset)
        valid_UAS = get_UAS(dataset.valid_data)
        print("- validation UAS: {:.2f}".format(valid_UAS * 100.0))

        # Once we're done with test/validation, we need to indicate that we are back in
        # "train" mode. This will turn back on things like Dropout
        parser.train()

    return parser
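# A minimal, hypothetical entry point for the training routine above; the
# argument values are illustrative only and mirror the function's defaults.
if __name__ == '__main__':
    trained_parser = train(save_dir='saved_weights', parser_name='parser',
                           num_epochs=5, max_iters=-1, print_every_iters=10)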
encoder = torch.load(args.modelPath + "model_encoder_epoch24.pkl", map_location=device)
decoder = torch.load(args.modelPath + "model_decoder_epoch24.pkl", map_location=device)
if torch.cuda.is_available():
    encoder = encoder.cuda()
    decoder = decoder.cuda()
encoder.eval()
decoder.eval()
# encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate, weight_decay=l2)  # SGD
# decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate, weight_decay=l2)
# RE_optimizer = optim.Adam(RE_model.parameters(), lr=learning_rate, weight_decay=l2)

# ********************Train data*********************
if args.test:
    mini_batches = get_minibatches(train_datasets, args.batchsize)
    batchcnt = len(dev_datasets[0]) // args.batchsize  # len(list(mini_batches))
    for b, data in enumerate(mini_batches):
        if b >= batchcnt:
            break
        sentences, tags = data
        input_tensor, input_length = padding_sequence(sentences, pad_token=args.embedding_size)
        target_tensor, target_length = padding_sequence(tags, pad_token=args.entity_tag_size)
        if torch.cuda.is_available():
            input_tensor = Variable(torch.cuda.LongTensor(input_tensor, device=device)).cuda()
            target_tensor = Variable(torch.cuda.LongTensor(target_tensor, device=device)).cuda()
        else:
            input_tensor = Variable(torch.LongTensor(input_tensor, device=device))
            target_tensor = Variable(torch.LongTensor(target_tensor, device=device))
        RE_output = eval_model(encoder, decoder, input_tensor, args.batchsize)
def minibatches(data, batch_size):
    x = np.array([d[0] for d in data])
    y = np.array([d[2] for d in data])
    one_hot = np.zeros((y.size, 3))
    one_hot[np.arange(y.size), y] = 1
    return get_minibatches([x, one_hot], batch_size)
def minibatches(data, batch_size, n_classes):
    x = np.array([d[0] for d in data])
    y = np.array([d[2] for d in data])
    one_hot = np.zeros((y.size, n_classes))
    one_hot[np.arange(y.size), y] = 1
    return get_minibatches([x, one_hot], batch_size)
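# The snippets above all rely on a get_minibatches helper that is not shown in
# this section. The sketch below is an assumption inferred from the call sites
# (a list of equally sized arrays in, shuffled batches out), not the project's
# actual implementation; the name is deliberately different to mark it as such.
import numpy as np

def get_minibatches_sketch(data, minibatch_size, shuffle=True):
    """Yield minibatches from a list of parallel arrays (hypothetical helper)."""
    data_size = len(data[0])
    indices = np.arange(data_size)
    if shuffle:
        np.random.shuffle(indices)
    for start in range(0, data_size, minibatch_size):
        batch_idx = indices[start:start + minibatch_size]
        yield [np.asarray(d)[batch_idx] for d in data]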
def train(save_dir='saved_weights', parser_name='parser', num_epochs=5,
          max_iters=-1, print_every_iters=10):
    """
    Trains the model.

    parser_name is the string prefix used for the filename where the parser is
    saved after every epoch
    """
    # load dataset
    load_existing_dump = False
    print('Loading dataset for training')
    dataset = load_datasets(load_existing_dump)
    config = dataset.model_config

    print('Loading embeddings')
    word_embeddings, pos_embeddings, dep_embeddings = load_embeddings(config)

    if False:  # Switch to True if you want to print examples of feature types
        print('words: ', len(dataset.word2idx))
        print('examples: ', [(k, v) for i, (k, v) in enumerate(dataset.word2idx.items()) if i < 30])
        print('\n')
        print('POS-tags: ', len(dataset.pos2idx))
        print(dataset.pos2idx)
        print('\n')
        print('dependencies: ', len(dataset.dep2idx))
        print(dataset.dep2idx)
        print('\n')
        print("some hyperparameters")
        print(vars(config))

    # load parser object
    parser = ParserModel(config, word_embeddings, pos_embeddings, dep_embeddings)

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    parser.to(device)

    # set save_dir for model
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # create object for loss function
    loss_fn = nn.CrossEntropyLoss()

    # create object for an optimizer that updates the weights of the parser model.
    optimizer = torch.optim.SGD(parser.parameters(), lr=config.lr)

    loss_list = []
    acc_list = []
    uas_list = []

    for epoch in range(1, num_epochs + 1):

        ###### Training #####

        # load training set in minibatches
        for i, (train_x, train_y) in enumerate(
                get_minibatches([dataset.train_inputs, dataset.train_targets],
                                config.batch_size, is_multi_feature_input=True)):

            word_inputs_batch, pos_inputs_batch, dep_inputs_batch = train_x

            # Convert the numpy data to pytorch's tensor representation. They're
            # numpy objects initially.
            word_inputs_batch = torch.tensor(word_inputs_batch).to(device)
            pos_inputs_batch = torch.tensor(pos_inputs_batch).to(device)
            dep_inputs_batch = torch.tensor(dep_inputs_batch).to(device)

            # Convert the labels from 1-hot vectors to a list of which index was
            # 1, which is what Pytorch expects.
            labels = np.argmax(train_y, axis=1)

            # Convert the label to pytorch's tensor (and move it to the same
            # device as the inputs, so the loss can be computed on GPU as well)
            labels = torch.tensor(labels).to(device)

            if max_iters >= 0 and i > max_iters:
                break

            if i == 0 and epoch == 1:
                print("size of word inputs: ", word_inputs_batch.size())
                print("size of pos inputs: ", pos_inputs_batch.size())
                print("size of dep inputs: ", dep_inputs_batch.size())
                print("size of labels: ", labels.size())

            #### Backprop & Update weights ####

            # Before the backward pass, use the optimizer object to zero all of
            # the gradients for the variables
            optimizer.zero_grad()

            # For the current batch of inputs, run a full forward pass through the
            # data and get the outputs for each item's prediction.
            # These are the raw outputs, which represent the activations for
            # prediction over valid transitions.
            outputs = parser(word_inputs_batch, pos_inputs_batch, dep_inputs_batch)

            # Compute the loss for the outputs with the labels.
            loss = loss_fn(outputs, labels)

            # Backward pass: compute gradient of the loss with respect to model parameters
            loss.backward()

            # Perform 1 update using the optimizer
            optimizer.step()

            # Every 10 batches, print out some reporting so I can see convergence
            if i % print_every_iters == 0:
                print('Epoch: %d [%d], loss: %1.3f, acc: %1.3f'
                      % (epoch, i, loss.item(),
                         int((outputs.argmax(1) == labels).sum()) / len(labels)))

        print("End of epoch")

        # save model
        save_file = os.path.join(save_dir, '%s-epoch-%d.mdl' % (parser_name, epoch))
        print('Saving current state of model to %s' % save_file)
        torch.save(parser, save_file)

        ###### Validation #####
        print('Evaluating on validation data after epoch %d' % epoch)

        # Once we're in test/validation time, we need to indicate that we are in
        # "evaluation" mode. This will turn off things like Dropout so that
        # we're not randomly zero-ing out weights when it might hurt performance
        parser.eval()

        # Compute the current model's UAS score on the validation (development)
        # dataset.
        compute_dependencies(parser, device, dataset.valid_data, dataset)
        valid_UAS = get_UAS(dataset.valid_data)
        print("- validation UAS: {:.2f}".format(valid_UAS * 100.0))

        loss_list.append(loss.item())
        acc_list.append(int((outputs.argmax(1) == labels).sum()) / len(labels))
        uas_list.append(valid_UAS * 100.0)

        # Once we're done with test/validation, we need to indicate that we are back in
        # "train" mode. This will turn back on things like Dropout
        parser.train()

    score = pd.DataFrame({'loss': loss_list, 'acc': acc_list, 'uas': uas_list})
    score.to_csv(r"score.csv", index=True, header=True)
    return parser
def minibatches(dataX, dataY, sentLen, mask, batch_size):
    # x = np.array([d[0] for d in data])
    # y = np.array([d[2] for d in data])
    # one_hot = np.zeros((y.size, 3))
    # one_hot[np.arange(y.size), y] = 1
    return get_minibatches(dataX, dataY, sentLen, mask, batch_size)
def trainEpoches(encoder, decoder, criterion, print_every=10, learning_rate=0.001, l2=0.0001):
    start = time.time()
    out_losses = []
    print_loss_total = 0  # Reset every print_every
    # plot_loss_total = 0  # Reset every plot_every
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate, weight_decay=l2)  # SGD
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate, weight_decay=l2)
    # training_pairs = [tensorsFromPair(random.choice(pairs))
    #                   for i in range(n_iters)]
    # for iter in range(1, n_iters + 1):
    #     training_pair = training_pairs[iter - 1]
    # for epoch in range(epoches):
    # i = 0
    mini_batches = get_minibatches(train_datasets, BATCH)
    batches_size = len(train_datasets[0]) // BATCH  # len(list(mini_batches))
    for i, data in enumerate(mini_batches):
        if i == batches_size:
            break
        # for i, data in enumerate(train_dataloader, 1):
        sentences, tags = data
        input_tensor, input_length = padding_sequence(sentences, pad_token=EMBEDDING_SIZE)
        target_tensor, target_length = padding_sequence(tags, pad_token=TAG_SIZE)
        if torch.cuda.is_available():
            input_tensor = Variable(
                torch.cuda.LongTensor(input_tensor, device=device)).cuda()
            target_tensor = Variable(
                torch.cuda.LongTensor(target_tensor, device=device)).cuda()
        else:
            input_tensor = Variable(
                torch.LongTensor(input_tensor, device=device))
            target_tensor = Variable(
                torch.LongTensor(target_tensor, device=device))
        loss = train(input_tensor, target_tensor, encoder, decoder,
                     encoder_optimizer, decoder_optimizer, criterion)  # , input_length, target_length
        out_losses.append(loss)
        print_loss_total += loss
        # plot_loss_total += loss
        if i % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print(' (%d %d%%) %.4f' % (i, float(i) / batches_size * 100, print_loss_avg))
            # print('%s (%d %d%%) %.4f' % (timeSince(start, float(i) / batches_size),
            #                              i, float(i) / batches_size * 100, print_loss_avg))
            # plot_loss_avg = plot_loss_total / plot_every
            # plot_losses.append(plot_loss_avg)
            # plot_loss_total = 0
        # i += 1
    np.save("loss", out_losses)
    if epoch % 10 == 0:
        model_name = "./model/model_encoder_epoch" + str(epoch) + ".pkl"
        torch.save(encoder, model_name)
        model_name = "./model/model_decoder_epoch" + str(epoch) + ".pkl"
        torch.save(decoder, model_name)
        print("Model has been saved")
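# padding_sequence is used throughout these snippets but never defined here.
# A minimal sketch, assuming it right-pads each sequence to the batch maximum
# with pad_token and also returns per-sequence lengths; this is inferred from
# the call sites and is not the project's actual helper.
def padding_sequence_sketch(sequences, pad_token=0):
    """Hypothetical padding helper: pad every sequence to the longest in the batch."""
    lengths = [len(seq) for seq in sequences]
    max_len = max(lengths)
    padded = [list(seq) + [pad_token] * (max_len - len(seq)) for seq in sequences]
    return padded, lengths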
def train(save_dir='saved_weights', parser_name='parser', num_epochs=5,
          max_iters=-1, print_every_iters=10, layer_num=1):
    """
    Trains the model.

    parser_name is the string prefix used for the filename where the parser is
    saved after every epoch
    """
    # load dataset
    load_existing_dump = False
    print('Loading dataset for training')
    dataset = load_datasets(load_existing_dump)
    # HINT: Look in the ModelConfig class for the model's hyperparameters
    config = dataset.model_config

    print('Loading embeddings')
    word_embeddings, pos_embeddings, dep_embeddings = load_embeddings(config)
    # TODO: For Task 3, add Twitter and Wikipedia embeddings (do this last)

    if False:  # Switch to True if you want to print examples of feature types
        print('words: ', len(dataset.word2idx))
        print('examples: ', [(k, v) for i, (k, v) in enumerate(dataset.word2idx.items()) if i < 30])
        print('\n')
        print('POS-tags: ', len(dataset.pos2idx))
        print(dataset.pos2idx)
        print('\n')
        print('dependencies: ', len(dataset.dep2idx))
        print(dataset.dep2idx)
        print('\n')
        print("some hyperparameters")
        print(vars(config))

    # load parser object
    if layer_num <= 1:
        parser = ParserModel(config, word_embeddings, pos_embeddings, dep_embeddings)
    else:
        parser = MultiLayer_ParserModel(config, word_embeddings, pos_embeddings,
                                        dep_embeddings, layer_num)

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    parser.to(device)

    # set save_dir for model
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # create object for loss function
    loss_fn = nn.CrossEntropyLoss()

    # create object for an optimizer that updates the weights of our parser model
    optimizer = torch.optim.Adam(parser.parameters(), lr=config.lr)

    # initialize lists to plot data
    loss_list, acc_list, uas_list = [], [], []

    for epoch in range(1, num_epochs + 1):

        ###### Training #####

        # load training set in minibatches
        for i, (train_x, train_y) in enumerate(
                get_minibatches([dataset.train_inputs, dataset.train_targets],
                                config.batch_size, is_multi_feature_input=True)):

            word_inputs_batch, pos_inputs_batch, dep_inputs_batch = train_x

            # Convert the numpy data to pytorch's tensor representation.
            word_inputs_batch = torch.tensor(word_inputs_batch).to(device)
            pos_inputs_batch = torch.tensor(pos_inputs_batch).to(device)
            dep_inputs_batch = torch.tensor(dep_inputs_batch).to(device)

            # Convert the labels from 1-hot vectors to a list of which index was 1,
            # then to a pytorch tensor
            labels = torch.tensor(np.argmax(train_y, axis=1)).to(device)

            # This is just a quick hack so you can cut training short to see how things are working
            if max_iters >= 0 and i > max_iters:
                break

            # Some debugging information for you
            if i == 0 and epoch == 1:
                print("size of word inputs: ", word_inputs_batch.size())
                print("size of pos inputs: ", pos_inputs_batch.size())
                print("size of dep inputs: ", dep_inputs_batch.size())
                print("size of labels: ", labels.size())

            #### Backprop & Update weights ####

            # Before the backward pass, use the optimizer object to zero all of the
            # gradients for the variables
            optimizer.zero_grad()

            # For the current batch of inputs, run a full forward pass through the data
            # and get the outputs for each item's prediction
            outputs = parser(word_inputs_batch, pos_inputs_batch, dep_inputs_batch)

            # Compute the loss for the outputs with the labels
            loss = loss_fn(outputs, labels)

            # Backward pass: compute gradient of the loss with respect to model parameters
            loss.backward()

            # Perform 1 update using the optimizer
            optimizer.step()

            # Every 10 batches, print out some reporting so we can see convergence
            if i % print_every_iters == 0:
                print('Epoch: %d [%d], loss: %1.3f, acc: %1.3f'
                      % (epoch, i, loss.item(),
                         int((outputs.argmax(1) == labels).sum()) / len(labels)))

        print("End of epoch")

        # save model
        save_file = os.path.join(save_dir, '%s-epoch-%d.mdl' % (parser_name, epoch))
        print('Saving current state of model to %s' % save_file)
        torch.save(parser, save_file)

        ###### Validation #####
        print('Evaluating on validation data after epoch %d' % epoch)

        # Once we're in test/validation time, we need to indicate that we are in "evaluation" mode
        parser.eval()

        # Compute the current model's UAS score on the validation (development) dataset
        compute_dependencies(parser, device, dataset.valid_data, dataset)
        valid_UAS = get_UAS(dataset.valid_data)
        print("- validation UAS: {:.2f}".format(valid_UAS * 100.0))

        # Append the computed values to the plotting lists
        loss_list.append(loss.item())
        acc_list.append(int((outputs.argmax(1) == labels).sum()) / len(labels))
        uas_list.append(valid_UAS * 100.0)

        # Once we're done with test/validation, we need to indicate that we are back in "train" mode
        parser.train()

    # Plot the data!
    epoch_size = np.arange(1, num_epochs + 1)

    loss_plot = {"Epoch": epoch_size, "Loss": np.array(loss_list)}
    seaborn.lineplot(x="Epoch", y="Loss", data=loss_plot)
    plot.xlabel("Epoch")
    plot.ylabel("Loss")
    plot.title("Training Loss vs Time")
    plot.show()

    acc_plot = {"Epoch": epoch_size, "Accuracy": np.array(acc_list)}
    seaborn.lineplot(x="Epoch", y="Accuracy", data=acc_plot)
    plot.xlabel("Epoch")
    plot.ylabel("Accuracy")
    plot.title("Training Accuracy vs Time")
    plot.show()

    uas_plot = {"Epoch": epoch_size, "UAS": np.array(uas_list)}
    seaborn.lineplot(x="Epoch", y="UAS", data=uas_plot)
    plot.xlabel("Epoch")
    plot.ylabel("UAS")
    plot.title("Unlabeled Attachment Score vs Time")
    plot.show()

    return parser
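# A hypothetical invocation of the multi-layer variant above; layer_num > 1
# selects MultiLayer_ParserModel per the branch at the top of train(), and the
# other argument values are illustrative only.
# trained_parser = train(save_dir='saved_weights', parser_name='parser',
#                        num_epochs=5, layer_num=2)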