def train(X_train, y_train, X_dev, y_dev, X_test, y_test):
    num_labels = NUM_EMO
    vocab_size = VOCAB_SIZE
    print('NUM of VOCAB: ' + str(vocab_size))

    train_data = EmotionDataLoader(X_train, y_train, PAD_LEN)
    train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)

    dev_data = EmotionDataLoader(X_dev, y_dev, PAD_LEN)
    dev_loader = DataLoader(dev_data, batch_size=int(BATCH_SIZE / 3) + 2, shuffle=False)

    test_data = EmotionDataLoader(X_test, y_test, PAD_LEN)
    test_loader = DataLoader(test_data, batch_size=int(BATCH_SIZE / 3) + 2, shuffle=False)

    model = AttentionLSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, vocab_size, num_labels,
                                    BATCH_SIZE, att_mode=opt.attention, soft_last=False,
                                    use_glove=USE_GLOVE, add_linear=ADD_LINEAR,
                                    max_pool=MAX_POOLING)
    if USE_GLOVE:
        model.load_embedding(tokenizer.get_embeddings())

    # multi-GPU
    # model = nn.DataParallel(model)
    model.cuda()

    if opt.loss == 'ce':
        loss_criterion = nn.CrossEntropyLoss()
    elif opt.loss == 'focal':
        loss_criterion = FocalLoss(gamma=opt.focal, reduce=True)
        print('Using focal loss, gamma=', opt.focal)
    else:
        raise Exception('loss option not recognised')

    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    es = EarlyStopping(patience=PATIENCE)
    old_model = None

    for epoch in range(1, 300):
        print('Epoch: ' + str(epoch) + '===================================')

        # Training
        train_loss = 0
        model.train()
        for i, (data, seq_len, label) in tqdm(enumerate(train_loader),
                                              total=len(train_data) / BATCH_SIZE):
            optimizer.zero_grad()

            data_text = [tokenizer.decode_ids(x) for x in data]
            with torch.no_grad():
                # ELMo: average of the two output representations
                character_ids = batch_to_ids(data_text).cuda()
                elmo_emb = elmo(character_ids)['elmo_representations']
                elmo_emb = (elmo_emb[0] + elmo_emb[1]) / 2  # avg of two layers

                # DeepMoji sentence encoding
                emoji_tokenized, _, _ = st.tokenize_sentences([' '.join(x) for x in data_text])
                emoji_encoding = emoji_model(torch.LongTensor(emoji_tokenized.astype(np.int32)))

            y_pred = model(data.cuda(), seq_len, elmo_emb, emoji_encoding.cuda())
            loss = loss_criterion(y_pred, label.view(-1).cuda())
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), CLIPS)
            optimizer.step()
            train_loss += loss.data.cpu().numpy() * data.shape[0]
            del y_pred, loss

        # Evaluation on the dev set
        test_loss = 0
        model.eval()
        for _, (_data, _seq_len, _label) in enumerate(dev_loader):
            with torch.no_grad():
                data_text = [tokenizer.decode_ids(x) for x in _data]
                character_ids = batch_to_ids(data_text).cuda()
                elmo_emb = elmo(character_ids)['elmo_representations']
                elmo_emb = (elmo_emb[0] + elmo_emb[1]) / 2  # avg of two layers

                emoji_tokenized, _, _ = st.tokenize_sentences([' '.join(x) for x in data_text])
                emoji_encoding = emoji_model(torch.LongTensor(emoji_tokenized.astype(np.int32)))

                y_pred = model(_data.cuda(), _seq_len, elmo_emb, emoji_encoding.cuda())
                loss = loss_criterion(y_pred, _label.view(-1).cuda())
                test_loss += loss.data.cpu().numpy() * _data.shape[0]
                del y_pred, loss

        print("Train Loss: " + str(train_loss / len(train_data)) +
              " Evaluation: " + str(test_loss / len(dev_data)))

        # Early stopping: keep a copy of the best model, restore it on overfitting
        if es.step(test_loss):  # overfitting
            del model
            print('overfitting, loading best model ...')
            model = old_model
            break
        else:
            if es.is_best():
                if old_model is not None:
                    del old_model
                print('saving best model ...')
                old_model = deepcopy(model)
            else:
                print('not best model, ignoring ...')
                if old_model is None:
                    old_model = deepcopy(model)

    with open(f'lstm_elmo_deepmoji_{opt.dataset}_model.pt', 'wb') as f:
        torch.save(model.state_dict(), f)

    # Prediction on the test set
    pred_list = []
    model.eval()
    for _, (_data, _seq_len, _label) in enumerate(test_loader):
        with torch.no_grad():
            data_text = [tokenizer.decode_ids(x) for x in _data]
            character_ids = batch_to_ids(data_text).cuda()
            elmo_emb = elmo(character_ids)['elmo_representations']
            elmo_emb = (elmo_emb[0] + elmo_emb[1]) / 2  # avg of two layers

            emoji_tokenized, _, _ = st.tokenize_sentences([' '.join(x) for x in data_text])
            emoji_encoding = emoji_model(torch.LongTensor(emoji_tokenized.astype(np.int32)))

            y_pred = model(_data.cuda(), _seq_len, elmo_emb, emoji_encoding.cuda())
            pred_list.append(y_pred.data.cpu().numpy())
            del y_pred

    pred_list = np.argmax(np.concatenate(pred_list, axis=0), axis=1)
    return pred_list
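# --------------------------------------------------------------------------
# NOTE (editor's sketch, not part of the original listing): train() above
# relies on module-level globals `elmo`, `st` and `emoji_model` whose
# construction is not shown. A minimal sketch of how they might be built
# with allennlp ELMo and torchMoji follows; the ELMo options/weights paths
# are placeholders, and using PAD_LEN as the DeepMoji fixed length is an
# assumption.
# --------------------------------------------------------------------------
import json

from allennlp.modules.elmo import Elmo, batch_to_ids
from torchmoji.sentence_tokenizer import SentenceTokenizer
from torchmoji.model_def import torchmoji_feature_encoding
from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH

# placeholder paths to the pretrained ELMo files
elmo_options_file = 'elmo_options.json'
elmo_weight_file = 'elmo_weights.hdf5'

# two output representations, averaged in train() above
elmo = Elmo(elmo_options_file, elmo_weight_file, 2, dropout=0).cuda()

# DeepMoji (torchMoji) sentence tokenizer and feature encoder
with open(VOCAB_PATH, 'r') as f:
    emoji_vocabulary = json.load(f)
st = SentenceTokenizer(emoji_vocabulary, PAD_LEN)  # PAD_LEN assumed as max length
emoji_model = torchmoji_feature_encoding(PRETRAINED_PATH)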
def one_fold(num_fold, train_index, dev_index): print("Training on fold:", num_fold) X_train, X_dev = [X[i] for i in train_index], [X[i] for i in dev_index] y_train, y_dev = y[train_index], y[dev_index] # construct data loader train_data_set = DataSet(X_train, y_train, SENT_PAD_LEN) train_data_loader = DataLoader(train_data_set, batch_size=BATCH_SIZE, shuffle=True) dev_data_set = DataSet(X_dev, y_dev, SENT_PAD_LEN) dev_data_loader = DataLoader(dev_data_set, batch_size=BATCH_SIZE, shuffle=False) gradient_accumulation_steps = 1 num_train_steps = int( len(train_data_set) / BATCH_SIZE / gradient_accumulation_steps * MAX_EPOCH) pred_list_test_best = None final_pred_best = None # This is to prevent model diverge, once happen, retrain while True: is_diverged = False model = BERT_classifer.from_pretrained(BERT_MODEL) model.add_output_layer(BERT_MODEL, NUM_EMO) model = nn.DataParallel(model) model.cuda() # BERT optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay_rate': 0.01 }, { 'params': [ p for n, p in param_optimizer if any(nd in n for nd in no_decay) ], 'weight_decay_rate': 0.0 }] optimizer = BertAdam(optimizer_grouped_parameters, lr=learning_rate, warmup=0.1, t_total=num_train_steps) if opt.w == 1: weight_list = [0.3, 0.3, 0.3, 1.7] weight_list_binary = [2 - weight_list[-1], weight_list[-1]] elif opt.w == 2: weight_list = [ 0.3198680179, 0.246494733, 0.2484349259, 1.74527696 ] weight_list_binary = [2 - weight_list[-1], weight_list[-1]] weight_list = [x**FLAT for x in weight_list] weight_label = torch.Tensor(weight_list).cuda() weight_list_binary = [x**FLAT for x in weight_list_binary] weight_binary = torch.Tensor(weight_list_binary).cuda() print('binary loss reweight = weight_list_binary', weight_list_binary) # loss_criterion_binary = nn.CrossEntropyLoss(weight=weight_list_binary) # if opt.loss == 'focal': loss_criterion = FocalLoss(gamma=opt.focal, reduce=False) loss_criterion_binary = FocalLoss(gamma=opt.focal, reduce=False) # elif opt.loss == 'ce': loss_criterion = nn.CrossEntropyLoss(reduce=False) loss_criterion_binary = nn.CrossEntropyLoss(reduce=False) # loss_criterion_emo_only = nn.MSELoss() # es = EarlyStopping(min_delta=0.005, patience=EARLY_STOP_PATIENCE) es = EarlyStopping(patience=EARLY_STOP_PATIENCE) final_pred_best = None final_pred_list_test = None pred_list_test = None for num_epoch in range(MAX_EPOCH): print('Begin training epoch:', num_epoch) sys.stdout.flush() train_loss = 0 model.train() for i, (tokens, masks, segments, e_c, e_c_binary, e_c_emo) in tqdm(enumerate(train_data_loader), total=len(train_data_set) / BATCH_SIZE): optimizer.zero_grad() if USE_TOKEN_TYPE: pred, pred2, pred3 = model(tokens.cuda(), masks.cuda(), segments.cuda()) else: pred, pred2, pred3 = model(tokens.cuda(), masks.cuda()) loss_label = loss_criterion(pred, e_c.view(-1).cuda()).cuda() loss_label = torch.matmul(torch.gather(weight_label, 0, e_c.view(-1).cuda()), loss_label) / \ e_c.view(-1).shape[0] loss_binary = loss_criterion_binary( pred2, e_c_binary.view(-1).cuda()).cuda() loss_binary = torch.matmul( torch.gather(weight_binary, 0, e_c_binary.view(-1).cuda()), loss_binary) / e_c.view(-1).shape[0] loss_emo = loss_criterion_emo_only(pred3, e_c_emo.cuda()) loss = (loss_label + LAMBDA1 * loss_binary + LAMBDA2 * loss_emo) / float(1 + LAMBDA1 + LAMBDA2) # training trilogy loss.backward() 
torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP) optimizer.step() train_loss += loss.data.cpu().numpy() * tokens.shape[0] del loss, pred # Evaluate model.eval() dev_loss = 0 # pred_list = [] # gold_list = [] for i, (tokens, masks, segments, e_c, e_c_binary, e_c_emo) in enumerate(dev_data_loader): with torch.no_grad(): if USE_TOKEN_TYPE: pred, pred2, pred3 = model(tokens.cuda(), masks.cuda(), segments.cuda()) else: pred, pred2, pred3 = model(tokens.cuda(), masks.cuda()) loss_label = loss_criterion( pred, e_c.view(-1).cuda()).cuda() loss_label = torch.matmul(torch.gather(weight_label, 0, e_c.view(-1).cuda()), loss_label) / \ e_c.view(-1).shape[0] loss_binary = loss_criterion_binary( pred2, e_c_binary.view(-1).cuda()).cuda() loss_binary = torch.matmul( torch.gather(weight_binary, 0, e_c_binary.view(-1).cuda()), loss_binary) / e_c.view(-1).shape[0] loss_emo = loss_criterion_emo_only( pred3, e_c_emo.cuda()) loss = (loss_label + LAMBDA1 * loss_binary + LAMBDA2 * loss_emo) / float(1 + LAMBDA1 + LAMBDA2) dev_loss += loss.data.cpu().numpy() * tokens.shape[0] # pred_list.append(pred.data.cpu().numpy()) # gold_list.append(e_c.numpy()) del pred, loss # pred_list = np.argmax(np.concatenate(pred_list, axis=0), axis=1) # gold_list = np.concatenate(gold_list, axis=0) print('Training loss:', train_loss / len(train_data_set), end='\t') print('Dev loss:', dev_loss / len(dev_data_set)) # print(classification_report(gold_list, pred_list, target_names=EMOS)) # get_metrics(pred_list, gold_list) # checking diverge if dev_loss / len(dev_data_set) > 1.3 and num_epoch > 4: print("Model diverged, retry") is_diverged = True break if es.step(dev_loss): # overfitting print('overfitting, loading best model ...') if num_epoch == 1: is_diverged = True final_pred_best = deepcopy(final_pred_list_test) pred_list_test_best = deepcopy(pred_list_test) break else: if es.is_best(): print('saving best model ...') if final_pred_best is not None: del final_pred_best final_pred_best = deepcopy(final_pred_list_test) if pred_list_test_best is not None: del pred_list_test_best pred_list_test_best = deepcopy(pred_list_test) else: print('not best model, ignoring ...') if final_pred_best is None: final_pred_best = deepcopy(final_pred_list_test) if pred_list_test_best is None: pred_list_test_best = deepcopy(pred_list_test) print('Gold Dev ...') pred_list_test = [] model.eval() for i, (tokens, masks, segments, e_c, e_c_binary, e_c_emo) in enumerate(gold_dev_data_loader): with torch.no_grad(): if USE_TOKEN_TYPE: pred, _, _ = model(tokens.cuda(), masks.cuda(), segments.cuda()) else: pred, _, _ = model(tokens.cuda(), masks.cuda()) pred_list_test.append(pred.data.cpu().numpy()) pred_list_test = np.argmax(np.concatenate(pred_list_test, axis=0), axis=1) # get_metrics(load_dev_labels('data/dev.txt'), pred_list_test) print('Gold Test ...') final_pred_list_test = [] model.eval() for i, (tokens, masks, segments, e_c, e_c_binary, e_c_emo) in enumerate(gold_test_data_loader): with torch.no_grad(): if USE_TOKEN_TYPE: pred, _, _ = model(tokens.cuda(), masks.cuda(), segments.cuda()) else: pred, _, _ = model(tokens.cuda(), masks.cuda()) final_pred_list_test.append(pred.data.cpu().numpy()) final_pred_list_test = np.argmax(np.concatenate( final_pred_list_test, axis=0), axis=1) # get_metrics(load_dev_labels('data/test.txt'), final_pred_list_test) if is_diverged: print("Reinitialize model ...") del model continue all_fold_results.append(pred_list_test_best) real_test_results.append(final_pred_best) del model break
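# --------------------------------------------------------------------------
# NOTE (editor's sketch, not part of the original listing): one_fold() above
# reads the globals X, y, all_fold_results and real_test_results, but the
# cross-validation driver is not shown. The following is a minimal sketch of
# how such a driver might look; NUM_FOLD and the majority-vote ensembling of
# the per-fold test predictions are illustrative assumptions.
# --------------------------------------------------------------------------
import numpy as np
from sklearn.model_selection import StratifiedKFold

NUM_FOLD = 5  # assumed number of folds
all_fold_results = []   # per-fold predictions on the gold dev set
real_test_results = []  # per-fold predictions on the gold test set

skf = StratifiedKFold(n_splits=NUM_FOLD, shuffle=True)
for num_fold, (train_index, dev_index) in enumerate(skf.split(X, y)):
    one_fold(num_fold, train_index, dev_index)

# majority vote over the per-fold test predictions (illustrative ensembling)
stacked = np.asarray(real_test_results)
final_vote = np.apply_along_axis(lambda col: np.bincount(col).argmax(), axis=0, arr=stacked)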
def one_fold(num_fold, train_index, dev_index): print("Training on fold:", num_fold) X_train, X_dev = [X[i] for i in train_index], [X[i] for i in dev_index] y_train, y_dev = y[train_index], y[dev_index] # construct data loader # for one fold, test data comes from k fold split. train_data_set = TrainDataSet(X_train, y_train, EMAI_PAD_LEN, SENT_PAD_LEN, word2id, use_unk=True) dev_data_set = TrainDataSet(X_dev, y_dev, EMAI_PAD_LEN, SENT_PAD_LEN, word2id, use_unk=True) dev_data_loader = DataLoader(dev_data_set, batch_size=BATCH_SIZE, shuffle=False) # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") final_pred_best = None # This is to prevent model diverge, once happen, retrain while True: is_diverged = False # Model is defined in HierarchicalPredictor model = HierarchicalAttPredictor(SENT_EMB_DIM, SENT_HIDDEN_SIZE, CTX_LSTM_DIM, num_of_vocab, SENT_PAD_LEN, id2word, USE_ELMO=True, ADD_LINEAR=False) model.load_embedding(emb) model.deepmoji_model.load_specific_weights( PRETRAINED_PATH, exclude_names=['output_layer']) model.cuda() # model = nn.DataParallel(model) # model.to(device) optimizer = optim.Adam(model.parameters(), lr=learning_rate, amsgrad=True) # # optimizer = optim.SGD(model.parameters(), lr=learning_rate) scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=GAMMA) # loss_criterion_binary = nn.CrossEntropyLoss(weight=weight_list_binary) # if opt.loss == 'focal': loss_criterion = FocalLoss(gamma=opt.focal) elif opt.loss == 'ce': loss_criterion = nn.BCELoss() es = EarlyStopping(patience=EARLY_STOP_PATIENCE) final_pred_list_test = None result_print = {} for num_epoch in range(MAX_EPOCH): # to ensure shuffle at ever epoch train_data_loader = DataLoader(train_data_set, batch_size=BATCH_SIZE, shuffle=True) print('Begin training epoch:', num_epoch, end='...\t') sys.stdout.flush() # stepping scheduler scheduler.step(num_epoch) print('Current learning rate', scheduler.get_lr()) ## Training step train_loss = 0 model.train() for i, (a, a_len, emoji_a, e_c) \ in tqdm(enumerate(train_data_loader), total=len(train_data_set)/BATCH_SIZE): optimizer.zero_grad() e_c = e_c.type(torch.float) pred = model(a.cuda(), a_len, emoji_a.cuda()) loss_label = loss_criterion(pred.squeeze(1), e_c.view(-1).cuda()).cuda() # training trilogy loss_label.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP) optimizer.step() train_loss += loss_label.data.cpu().numpy() * a.shape[0] del pred, loss_label ## Evaluatation step model.eval() dev_loss = 0 # pred_list = [] for i, (a, a_len, emoji_a, e_c) in enumerate(dev_data_loader): with torch.no_grad(): e_c = e_c.type(torch.float) pred = model(a.cuda(), a_len, emoji_a.cuda()) loss_label = loss_criterion( pred.squeeze(1), e_c.view(-1).cuda()).cuda() dev_loss += loss_label.data.cpu().numpy() * a.shape[0] # pred_list.append(pred.data.cpu().numpy()) # gold_list.append(e_c.numpy()) del pred, loss_label print('Training loss:', train_loss / len(train_data_set), end='\t') print('Dev loss:', dev_loss / len(dev_data_set)) # print(classification_report(gold_list, pred_list, target_names=EMOS)) # get_metrics(pred_list, gold_list) # Gold Test testing print('Final test testing...') final_pred_list_test = [] model.eval() for i, (a, a_len, emoji_a) in enumerate(final_test_data_loader): with torch.no_grad(): pred = model(a.cuda(), a_len, emoji_a.cuda()) final_pred_list_test.append(pred.data.cpu().numpy()) del a, pred print("final_pred_list_test", len(final_pred_list_test)) final_pred_list_test = np.concatenate(final_pred_list_test, axis=0) 
final_pred_list_test = np.squeeze(final_pred_list_test, axis=1) print("final_pred_list_test_concat", len(final_pred_list_test)) accuracy, precision, recall, f1 = get_metrics( np.asarray(final_test_target_list), np.asarray(final_pred_list_test)) result_print.update( {num_epoch: [accuracy, precision, recall, f1]}) if dev_loss / len(dev_data_set) > 1.3 and num_epoch > 4: print("Model diverged, retry") is_diverged = True break if es.step(dev_loss): # overfitting print('overfitting, loading best model ...') break else: if es.is_best(): print('saving best model ...') if final_pred_best is not None: del final_pred_best final_pred_best = deepcopy(final_pred_list_test) else: print('not best model, ignoring ...') if final_pred_best is None: final_pred_best = deepcopy(final_pred_list_test) with open(result_path, 'wb') as w: pkl.dump(result_print, w) if is_diverged: print("Reinitialize model ...") del model continue real_test_results.append(np.asarray(final_pred_best)) # saving model for inference torch.save(model.state_dict(), opt.out_path) del model break
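# --------------------------------------------------------------------------
# NOTE (editor's sketch, not part of the original listing): the next
# one_fold() listing calls an elmo_encode() helper that is not shown here.
# A minimal sketch of what it might look like follows, mirroring the inline
# ELMo averaging used in the first train() listing; the globals `elmo` and
# `id2word` (and the optional __id2word override hinted at in a comment
# below) are assumed to exist.
# --------------------------------------------------------------------------
from allennlp.modules.elmo import batch_to_ids


def elmo_encode(ids, __id2word=None):
    """Map padded token ids back to words, run ELMo and average its two output layers."""
    lookup = __id2word if __id2word is not None else id2word
    with torch.no_grad():
        # keep the padded length so the ELMo tensor aligns with the id tensor
        sent_text = [[lookup[int(w)] for w in sent] for sent in ids]
        character_ids = batch_to_ids(sent_text).cuda()
        elmo_emb = elmo(character_ids)['elmo_representations']
        elmo_emb = (elmo_emb[0] + elmo_emb[1]) / 2  # avg of two layers
    return elmo_emb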
def one_fold(num_fold, train_index, dev_index): print("Training on fold:", num_fold) X_train, X_dev = [X[i] for i in train_index], [X[i] for i in dev_index] y_train, y_dev = y[train_index], y[dev_index] # construct data loader train_data_set = TrainDataSet(X_train, y_train, CONV_PAD_LEN, SENT_PAD_LEN, word2id, use_unk=True) dev_data_set = TrainDataSet(X_dev, y_dev, CONV_PAD_LEN, SENT_PAD_LEN, word2id, use_unk=True) dev_data_loader = DataLoader(dev_data_set, batch_size=BATCH_SIZE, shuffle=False) # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") pred_list_test_best = None final_pred_best = None # This is to prevent model diverge, once happen, retrain while True: is_diverged = False # Model is defined in HierarchicalPredictor model = HierarchicalPredictor(SENT_EMB_DIM, SENT_HIDDEN_SIZE, num_of_vocab, USE_ELMO=True, ADD_LINEAR=False) model.load_embedding(emb) model.deepmoji_model.load_specific_weights( PRETRAINED_PATH, exclude_names=['output_layer']) model.cuda() # model = nn.DataParallel(model) # model.to(device) optimizer = optim.Adam(model.parameters(), lr=learning_rate, amsgrad=True) # # optimizer = optim.SGD(model.parameters(), lr=learning_rate) scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=opt.gamma) if opt.w == 1: weight_list = [0.3, 0.3, 0.3, 1.7] weight_list_binary = [2 - weight_list[-1], weight_list[-1]] elif opt.w == 2: weight_list = [ 0.3198680179, 0.246494733, 0.2484349259, 1.74527696 ] weight_list_binary = [2 - weight_list[-1], weight_list[-1]] else: raise ValueError weight_list = [x**FLAT for x in weight_list] weight_label = torch.Tensor(weight_list).cuda() weight_list_binary = [x**FLAT for x in weight_list_binary] weight_binary = torch.Tensor(weight_list_binary).cuda() print('classification reweight: ', weight_list) print('binary loss reweight = weight_list_binary', weight_list_binary) # loss_criterion_binary = nn.CrossEntropyLoss(weight=weight_list_binary) # if opt.loss == 'focal': loss_criterion = FocalLoss(gamma=opt.focal, reduce=False) loss_criterion_binary = FocalLoss(gamma=opt.focal, reduce=False) # elif opt.loss == 'ce': loss_criterion = nn.CrossEntropyLoss(reduce=False) loss_criterion_binary = nn.CrossEntropyLoss(reduce=False) # loss_criterion_emo_only = nn.MSELoss() es = EarlyStopping(patience=EARLY_STOP_PATIENCE) # best_model = None final_pred_list_test = None pred_list_test = None for num_epoch in range(MAX_EPOCH): # to ensure shuffle at ever epoch train_data_loader = DataLoader(train_data_set, batch_size=BATCH_SIZE, shuffle=True) print('Begin training epoch:', num_epoch, end='...\t') sys.stdout.flush() # stepping scheduler scheduler.step(num_epoch) print('Current learning rate', scheduler.get_lr()) train_loss = 0 model.train() for i, (a, a_len, b, b_len, c, c_len, emoji_a, emoji_b, emoji_c, e_c, e_c_binary, e_c_emo) \ in tqdm(enumerate(train_data_loader), total=len(train_data_set)/BATCH_SIZE): optimizer.zero_grad() elmo_a = elmo_encode(a) elmo_b = elmo_encode(b) elmo_c = elmo_encode(c) pred, pred2, pred3 = model(a.cuda(), a_len, b.cuda(), b_len, c.cuda(), c_len, emoji_a.cuda(), emoji_b.cuda(), emoji_c.cuda(), elmo_a, elmo_b, elmo_c) loss_label = loss_criterion(pred, e_c.view(-1).cuda()).cuda() loss_label = torch.matmul(torch.gather(weight_label, 0, e_c.view(-1).cuda()), loss_label) / \ e_c.view(-1).shape[0] loss_binary = loss_criterion_binary( pred2, e_c_binary.view(-1).cuda()).cuda() loss_binary = torch.matmul( torch.gather(weight_binary, 0, e_c_binary.view(-1).cuda()), loss_binary) / e_c.view(-1).shape[0] loss_emo = 
loss_criterion_emo_only(pred3, e_c_emo.cuda()) loss = (loss_label + LAMBDA1 * loss_binary + LAMBDA2 * loss_emo) / float(1 + LAMBDA1 + LAMBDA2) # loss = torch.matmul(torch.gather(weight, 0, trg.view(-1).cuda()), loss) / trg.view(-1).shape[0] # training trilogy loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP) optimizer.step() train_loss += loss.data.cpu().numpy() * a.shape[0] del pred, loss, elmo_a, elmo_b, elmo_c, e_c_emo, loss_binary, loss_label, loss_emo # Evaluate model.eval() dev_loss = 0 # pred_list = [] # gold_list = [] for i, (a, a_len, b, b_len, c, c_len, emoji_a, emoji_b, emoji_c, e_c, e_c_binary, e_c_emo)\ in enumerate(dev_data_loader): with torch.no_grad(): elmo_a = elmo_encode(a) elmo_b = elmo_encode(b) elmo_c = elmo_encode(c) pred, pred2, pred3 = model(a.cuda(), a_len, b.cuda(), b_len, c.cuda(), c_len, emoji_a.cuda(), emoji_b.cuda(), emoji_c.cuda(), elmo_a, elmo_b, elmo_c) loss_label = loss_criterion( pred, e_c.view(-1).cuda()).cuda() loss_label = torch.matmul( torch.gather(weight_label, 0, e_c.view(-1).cuda()), loss_label) / e_c.view(-1).shape[0] loss_binary = loss_criterion_binary( pred2, e_c_binary.view(-1).cuda()).cuda() loss_binary = torch.matmul( torch.gather(weight_binary, 0, e_c_binary.view(-1).cuda()), loss_binary) / e_c.view(-1).shape[0] loss_emo = loss_criterion_emo_only( pred3, e_c_emo.cuda()) loss = (loss_label + LAMBDA1 * loss_binary + LAMBDA2 * loss_emo) / float(1 + LAMBDA1 + LAMBDA2) dev_loss += loss.data.cpu().numpy() * a.shape[0] # pred_list.append(pred.data.cpu().numpy()) # gold_list.append(e_c.numpy()) del pred, loss, elmo_a, elmo_b, elmo_c, e_c_emo, loss_binary, loss_label, loss_emo print('Training loss:', train_loss / len(train_data_set), end='\t') print('Dev loss:', dev_loss / len(dev_data_set)) # print(classification_report(gold_list, pred_list, target_names=EMOS)) # get_metrics(pred_list, gold_list) if dev_loss / len(dev_data_set) > 1.3 and num_epoch > 4: print("Model diverged, retry") is_diverged = True break if es.step(dev_loss): # overfitting print('overfitting, loading best model ...') break else: if es.is_best(): print('saving best model ...') if final_pred_best is not None: del final_pred_best final_pred_best = deepcopy(final_pred_list_test) if pred_list_test_best is not None: del pred_list_test_best pred_list_test_best = deepcopy(pred_list_test) else: print('not best model, ignoring ...') if final_pred_best is None: final_pred_best = deepcopy(final_pred_list_test) if pred_list_test_best is None: pred_list_test_best = deepcopy(pred_list_test) # Gold Dev testing... 
print('Gold Dev testing....') pred_list_test = [] model.eval() for i, (a, a_len, b, b_len, c, c_len, emoji_a, emoji_b, emoji_c) in enumerate(gold_dev_data_loader): with torch.no_grad(): elmo_a = elmo_encode(a) # , __id2word=ex_id2word elmo_b = elmo_encode(b) elmo_c = elmo_encode(c) pred, _, _ = model(a.cuda(), a_len, b.cuda(), b_len, c.cuda(), c_len, emoji_a.cuda(), emoji_b.cuda(), emoji_c.cuda(), elmo_a, elmo_b, elmo_c) pred_list_test.append(pred.data.cpu().numpy()) del elmo_a, elmo_b, elmo_c, a, b, c, pred pred_list_test = np.argmax(np.concatenate(pred_list_test, axis=0), axis=1) # get_metrics(load_dev_labels('data/dev.txt'), pred_list_test) # Testing print('Gold test testing...') final_pred_list_test = [] model.eval() for i, (a, a_len, b, b_len, c, c_len, emoji_a, emoji_b, emoji_c) in enumerate(test_data_loader): with torch.no_grad(): elmo_a = elmo_encode(a) # , __id2word=ex_id2word elmo_b = elmo_encode(b) elmo_c = elmo_encode(c) pred, _, _ = model(a.cuda(), a_len, b.cuda(), b_len, c.cuda(), c_len, emoji_a.cuda(), emoji_b.cuda(), emoji_c.cuda(), elmo_a, elmo_b, elmo_c) final_pred_list_test.append(pred.data.cpu().numpy()) del elmo_a, elmo_b, elmo_c, a, b, c, pred final_pred_list_test = np.argmax(np.concatenate( final_pred_list_test, axis=0), axis=1) # get_metrics(load_dev_labels('data/test.txt'), final_pred_list_test) if is_diverged: print("Reinitialize model ...") del model continue all_fold_results.append(pred_list_test_best) real_test_results.append(final_pred_best) del model break
def train(X_train, y_train, X_dev, y_dev, X_test, y_test):
    num_labels = NUM_EMO
    vocab_size = VOCAB_SIZE
    print('NUM of VOCAB: ' + str(vocab_size))

    train_data = EmotionDataLoader(X_train, y_train, PAD_LEN)
    train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)

    dev_data = EmotionDataLoader(X_dev, y_dev, PAD_LEN)
    dev_loader = DataLoader(dev_data, batch_size=int(BATCH_SIZE / 3) + 2, shuffle=False)

    test_data = EmotionDataLoader(X_test, y_test, PAD_LEN)
    test_loader = DataLoader(test_data, batch_size=int(BATCH_SIZE / 3) + 2, shuffle=False)

    model = AttentionLSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, vocab_size, num_labels,
                                    BATCH_SIZE, att_mode=opt.attention, soft_last=False)
    model.load_embedding(tokenizer.get_embeddings())

    # multi-GPU
    # model = nn.DataParallel(model)
    model.cuda()

    loss_criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    es = EarlyStopping(patience=PATIENCE)
    old_model = None

    for epoch in range(1, 300):
        print('Epoch: ' + str(epoch) + '===================================')

        # Training
        train_loss = 0
        model.train()
        for i, (data, seq_len, label) in tqdm(enumerate(train_loader),
                                              total=len(train_data) / BATCH_SIZE):
            optimizer.zero_grad()
            y_pred = model(data.cuda(), seq_len)
            loss = loss_criterion(y_pred, label.view(-1).cuda())
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), CLIPS)
            optimizer.step()
            train_loss += loss.data.cpu().numpy() * data.shape[0]
            del y_pred, loss

        # Evaluation on the dev set
        test_loss = 0
        model.eval()
        for _, (_data, _seq_len, _label) in enumerate(dev_loader):
            with torch.no_grad():
                y_pred = model(_data.cuda(), _seq_len)
                loss = loss_criterion(y_pred, _label.view(-1).cuda())
                test_loss += loss.data.cpu().numpy() * _data.shape[0]
                del y_pred, loss

        print("Train Loss: " + str(train_loss / len(train_data)) +
              " Evaluation: " + str(test_loss / len(dev_data)))

        # Early stopping: keep a copy of the best model, restore it on overfitting
        if es.step(test_loss):  # overfitting
            del model
            print('overfitting, loading best model ...')
            model = old_model
            break
        else:
            if es.is_best():
                if old_model is not None:
                    del old_model
                print('saving best model ...')
                old_model = deepcopy(model)
            else:
                print('not best model, ignoring ...')
                if old_model is None:
                    old_model = deepcopy(model)

    with open(f'lstm_{opt.dataset}_model.pt', 'wb') as f:
        torch.save(model.state_dict(), f)

    # Prediction on the test set
    pred_list = []
    model.eval()
    for _, (_data, _seq_len, _label) in enumerate(test_loader):
        with torch.no_grad():
            y_pred = model(_data.cuda(), _seq_len)
            pred_list.append(y_pred.data.cpu().numpy())
            del y_pred

    pred_list = np.argmax(np.concatenate(pred_list, axis=0), axis=1)
    return pred_list