def __init__(self, jaccard_thresh=0.5, neg_pos=3, focal=False, device='cpu'):
    super(MultiBoxLoss, self).__init__()
    self.jaccard_thresh = jaccard_thresh  # 0.5: Jaccard threshold used by the match function
    self.negpos_ratio = neg_pos  # 3:1 negative-to-positive ratio for Hard Negative Mining
    self.device = device  # whether to compute on CPU or GPU
    self.floss = focal
    if focal:
        from utils.focalloss import FocalLoss
        self.focal = FocalLoss()
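# `FocalLoss` is imported from elsewhere in these snippets and its body is never
# shown. The sketch below is an assumption, not the source's implementation: a
# standard multi-class focal loss (Lin et al., 2017) shaped to match the call
# sites seen here, e.g. FocalLoss(gamma=2) and FocalLoss(gamma=opt.focal, reduce=False).
import torch
import torch.nn as nn
import torch.nn.functional as F

class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, reduce=True):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.reduce = reduce

    def forward(self, logits, target):
        # per-sample cross entropy, kept unreduced
        ce = F.cross_entropy(logits, target, reduction='none')
        pt = torch.exp(-ce)  # probability assigned to the true class
        focal = (1 - pt) ** self.gamma * ce  # down-weight easy examples
        return focal.mean() if self.reduce else focal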
def train(X_train, y_train, X_dev, y_dev, X_test, y_test):
    num_labels = NUM_EMO
    vocab_size = VOCAB_SIZE
    print('NUM of VOCAB: ' + str(vocab_size))

    train_data = EmotionDataLoader(X_train, y_train, PAD_LEN)
    train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
    dev_data = EmotionDataLoader(X_dev, y_dev, PAD_LEN)
    dev_loader = DataLoader(dev_data, batch_size=int(BATCH_SIZE / 3) + 2, shuffle=False)
    test_data = EmotionDataLoader(X_test, y_test, PAD_LEN)
    test_loader = DataLoader(test_data, batch_size=int(BATCH_SIZE / 3) + 2, shuffle=False)

    model = AttentionLSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, vocab_size, num_labels,
                                    BATCH_SIZE, att_mode=opt.attention, soft_last=False,
                                    use_glove=USE_GLOVE, add_linear=ADD_LINEAR,
                                    max_pool=MAX_POOLING)
    if USE_GLOVE:
        model.load_embedding(tokenizer.get_embeddings())
    # multi-GPU
    # model = nn.DataParallel(model)
    model.cuda()

    if opt.loss == 'ce':
        loss_criterion = nn.CrossEntropyLoss()
        # print('Using ce loss')
    elif opt.loss == 'focal':
        loss_criterion = FocalLoss(gamma=opt.focal, reduce=True)
        print('Using focal loss, gamma=', opt.focal)
    else:
        raise Exception('loss option not recognised')

    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    es = EarlyStopping(patience=PATIENCE)
    old_model = None
    for epoch in range(1, 300):
        print('Epoch: ' + str(epoch) + '===================================')
        train_loss = 0
        model.train()
        for i, (data, seq_len, label) in tqdm(enumerate(train_loader),
                                              total=len(train_data) / BATCH_SIZE):
            optimizer.zero_grad()
            data_text = [tokenizer.decode_ids(x) for x in data]
            with torch.no_grad():
                character_ids = batch_to_ids(data_text).cuda()
                elmo_emb = elmo(character_ids)['elmo_representations']
                elmo_emb = (elmo_emb[0] + elmo_emb[1]) / 2  # average of the two ELMo layers
                emoji_tokenized, _, _ = st.tokenize_sentences([' '.join(x) for x in data_text])
                emoji_encoding = emoji_model(torch.LongTensor(emoji_tokenized.astype(np.int32)))
            y_pred = model(data.cuda(), seq_len, elmo_emb, emoji_encoding.cuda())
            loss = loss_criterion(y_pred, label.view(-1).cuda())
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), CLIPS)
            optimizer.step()
            train_loss += loss.data.cpu().numpy() * data.shape[0]
            del y_pred, loss

        test_loss = 0
        model.eval()
        for _, (_data, _seq_len, _label) in enumerate(dev_loader):
            with torch.no_grad():
                data_text = [tokenizer.decode_ids(x) for x in _data]
                character_ids = batch_to_ids(data_text).cuda()
                elmo_emb = elmo(character_ids)['elmo_representations']
                elmo_emb = (elmo_emb[0] + elmo_emb[1]) / 2  # average of the two ELMo layers
                emoji_tokenized, _, _ = st.tokenize_sentences([' '.join(x) for x in data_text])
                emoji_encoding = emoji_model(torch.LongTensor(emoji_tokenized.astype(np.int32)))
                y_pred = model(_data.cuda(), _seq_len, elmo_emb, emoji_encoding.cuda())
                loss = loss_criterion(y_pred, _label.view(-1).cuda())
                test_loss += loss.data.cpu().numpy() * _data.shape[0]
                del y_pred, loss

        print("Train Loss: " + str(train_loss / len(train_data)) +
              " Evaluation: " + str(test_loss / len(dev_data)))

        if es.step(test_loss):  # overfitting
            del model
            print('overfitting, loading best model ...')
            model = old_model
            break
        else:
            if es.is_best():
                if old_model is not None:
                    del old_model
                print('saving best model ...')
                old_model = deepcopy(model)
            else:
                print('not best model, ignoring ...')
                if old_model is None:
                    old_model = deepcopy(model)

    with open(f'lstm_elmo_deepmoji_{opt.dataset}_model.pt', 'wb') as f:
        torch.save(model.state_dict(), f)

    pred_list = []
    model.eval()
    for _, (_data, _seq_len, _label) in enumerate(test_loader):
        with torch.no_grad():
            data_text = [tokenizer.decode_ids(x) for x in _data]
            character_ids = batch_to_ids(data_text).cuda()
            elmo_emb = elmo(character_ids)['elmo_representations']
            elmo_emb = (elmo_emb[0] + elmo_emb[1]) / 2  # average of the two ELMo layers
            emoji_tokenized, _, _ = st.tokenize_sentences([' '.join(x) for x in data_text])
            emoji_encoding = emoji_model(torch.LongTensor(emoji_tokenized.astype(np.int32)))
            y_pred = model(_data.cuda(), _seq_len, elmo_emb, emoji_encoding.cuda())
            pred_list.append(y_pred.data.cpu().numpy())
            del y_pred
    pred_list = np.argmax(np.concatenate(pred_list, axis=0), axis=1)
    return pred_list
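# `EarlyStopping` (with .step() and .is_best()) is used by every training loop in
# this file but defined elsewhere. A minimal sketch, assuming .step(loss) returns
# True once the loss has failed to improve for `patience` consecutive checks, and
# .is_best() reports whether the most recent step set a new best. The attribute
# names are assumptions.
class EarlyStopping(object):
    def __init__(self, patience=5, min_delta=0.0):
        self.patience = patience
        self.min_delta = min_delta
        self.best = None
        self.num_bad_steps = 0
        self._last_was_best = False

    def step(self, metric):
        if self.best is None or metric < self.best - self.min_delta:
            self.best = metric
            self.num_bad_steps = 0
            self._last_was_best = True
        else:
            self.num_bad_steps += 1
            self._last_was_best = False
        return self.num_bad_steps > self.patience

    def is_best(self):
        return self._last_was_best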
def one_fold(num_fold, train_index, dev_index):
    print("Training on fold:", num_fold)
    X_train, X_dev = [X[i] for i in train_index], [X[i] for i in dev_index]
    y_train, y_dev = y[train_index], y[dev_index]

    # construct data loaders
    train_data_set = DataSet(X_train, y_train, SENT_PAD_LEN)
    train_data_loader = DataLoader(train_data_set, batch_size=BATCH_SIZE, shuffle=True)
    dev_data_set = DataSet(X_dev, y_dev, SENT_PAD_LEN)
    dev_data_loader = DataLoader(dev_data_set, batch_size=BATCH_SIZE, shuffle=False)

    gradient_accumulation_steps = 1
    num_train_steps = int(len(train_data_set) / BATCH_SIZE /
                          gradient_accumulation_steps * MAX_EPOCH)

    pred_list_test_best = None
    final_pred_best = None
    # If the model diverges, reinitialize and retrain.
    while True:
        is_diverged = False
        model = BERT_classifer.from_pretrained(BERT_MODEL)
        model.add_output_layer(BERT_MODEL, NUM_EMO)
        model = nn.DataParallel(model)
        model.cuda()

        # BERT optimizer: no weight decay on bias/LayerNorm parameters
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.0},
        ]
        optimizer = BertAdam(optimizer_grouped_parameters, lr=learning_rate,
                             warmup=0.1, t_total=num_train_steps)

        if opt.w == 1:
            weight_list = [0.3, 0.3, 0.3, 1.7]
            weight_list_binary = [2 - weight_list[-1], weight_list[-1]]
        elif opt.w == 2:
            weight_list = [0.3198680179, 0.246494733, 0.2484349259, 1.74527696]
            weight_list_binary = [2 - weight_list[-1], weight_list[-1]]
        else:
            raise ValueError('unsupported value for opt.w')
        weight_list = [x ** FLAT for x in weight_list]
        weight_label = torch.Tensor(weight_list).cuda()
        weight_list_binary = [x ** FLAT for x in weight_list_binary]
        weight_binary = torch.Tensor(weight_list_binary).cuda()
        print('binary loss reweight =', weight_list_binary)

        # loss_criterion_binary = nn.CrossEntropyLoss(weight=weight_list_binary)
        if opt.loss == 'focal':
            loss_criterion = FocalLoss(gamma=opt.focal, reduce=False)
            loss_criterion_binary = FocalLoss(gamma=opt.focal, reduce=False)
        elif opt.loss == 'ce':
            loss_criterion = nn.CrossEntropyLoss(reduce=False)
            loss_criterion_binary = nn.CrossEntropyLoss(reduce=False)
        loss_criterion_emo_only = nn.MSELoss()

        # es = EarlyStopping(min_delta=0.005, patience=EARLY_STOP_PATIENCE)
        es = EarlyStopping(patience=EARLY_STOP_PATIENCE)

        final_pred_best = None
        final_pred_list_test = None
        pred_list_test = None
        for num_epoch in range(MAX_EPOCH):
            print('Begin training epoch:', num_epoch)
            sys.stdout.flush()
            train_loss = 0
            model.train()
            for i, (tokens, masks, segments, e_c, e_c_binary, e_c_emo) in tqdm(
                    enumerate(train_data_loader),
                    total=len(train_data_set) / BATCH_SIZE):
                optimizer.zero_grad()
                if USE_TOKEN_TYPE:
                    pred, pred2, pred3 = model(tokens.cuda(), masks.cuda(), segments.cuda())
                else:
                    pred, pred2, pred3 = model(tokens.cuda(), masks.cuda())

                # per-sample losses, reweighted by class weight via gather + dot product
                loss_label = loss_criterion(pred, e_c.view(-1).cuda()).cuda()
                loss_label = torch.matmul(torch.gather(weight_label, 0, e_c.view(-1).cuda()),
                                          loss_label) / e_c.view(-1).shape[0]
                loss_binary = loss_criterion_binary(pred2, e_c_binary.view(-1).cuda()).cuda()
                loss_binary = torch.matmul(torch.gather(weight_binary, 0, e_c_binary.view(-1).cuda()),
                                           loss_binary) / e_c.view(-1).shape[0]
                loss_emo = loss_criterion_emo_only(pred3, e_c_emo.cuda())
                # "training trilogy": combine label, binary, and emotion losses
                loss = (loss_label + LAMBDA1 * loss_binary +
                        LAMBDA2 * loss_emo) / float(1 + LAMBDA1 + LAMBDA2)

                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
                optimizer.step()
                train_loss += loss.data.cpu().numpy() * tokens.shape[0]
                del loss, pred

            # Evaluate
            model.eval()
            dev_loss = 0
            for i, (tokens, masks, segments, e_c, e_c_binary, e_c_emo) in enumerate(dev_data_loader):
                with torch.no_grad():
                    if USE_TOKEN_TYPE:
                        pred, pred2, pred3 = model(tokens.cuda(), masks.cuda(), segments.cuda())
                    else:
                        pred, pred2, pred3 = model(tokens.cuda(), masks.cuda())
                    loss_label = loss_criterion(pred, e_c.view(-1).cuda()).cuda()
                    loss_label = torch.matmul(torch.gather(weight_label, 0, e_c.view(-1).cuda()),
                                              loss_label) / e_c.view(-1).shape[0]
                    loss_binary = loss_criterion_binary(pred2, e_c_binary.view(-1).cuda()).cuda()
                    loss_binary = torch.matmul(torch.gather(weight_binary, 0, e_c_binary.view(-1).cuda()),
                                               loss_binary) / e_c.view(-1).shape[0]
                    loss_emo = loss_criterion_emo_only(pred3, e_c_emo.cuda())
                    loss = (loss_label + LAMBDA1 * loss_binary +
                            LAMBDA2 * loss_emo) / float(1 + LAMBDA1 + LAMBDA2)
                    dev_loss += loss.data.cpu().numpy() * tokens.shape[0]
                    del pred, loss

            print('Training loss:', train_loss / len(train_data_set), end='\t')
            print('Dev loss:', dev_loss / len(dev_data_set))

            # check for divergence
            if dev_loss / len(dev_data_set) > 1.3 and num_epoch > 4:
                print("Model diverged, retry")
                is_diverged = True
                break

            if es.step(dev_loss):  # overfitting
                print('overfitting, loading best model ...')
                if num_epoch == 1:
                    is_diverged = True
                final_pred_best = deepcopy(final_pred_list_test)
                pred_list_test_best = deepcopy(pred_list_test)
                break
            else:
                if es.is_best():
                    print('saving best model ...')
                    if final_pred_best is not None:
                        del final_pred_best
                    final_pred_best = deepcopy(final_pred_list_test)
                    if pred_list_test_best is not None:
                        del pred_list_test_best
                    pred_list_test_best = deepcopy(pred_list_test)
                else:
                    print('not best model, ignoring ...')
                    if final_pred_best is None:
                        final_pred_best = deepcopy(final_pred_list_test)
                    if pred_list_test_best is None:
                        pred_list_test_best = deepcopy(pred_list_test)

            print('Gold Dev ...')
            pred_list_test = []
            model.eval()
            for i, (tokens, masks, segments, e_c, e_c_binary, e_c_emo) in enumerate(gold_dev_data_loader):
                with torch.no_grad():
                    if USE_TOKEN_TYPE:
                        pred, _, _ = model(tokens.cuda(), masks.cuda(), segments.cuda())
                    else:
                        pred, _, _ = model(tokens.cuda(), masks.cuda())
                    pred_list_test.append(pred.data.cpu().numpy())
            pred_list_test = np.argmax(np.concatenate(pred_list_test, axis=0), axis=1)
            # get_metrics(load_dev_labels('data/dev.txt'), pred_list_test)

            print('Gold Test ...')
            final_pred_list_test = []
            model.eval()
            for i, (tokens, masks, segments, e_c, e_c_binary, e_c_emo) in enumerate(gold_test_data_loader):
                with torch.no_grad():
                    if USE_TOKEN_TYPE:
                        pred, _, _ = model(tokens.cuda(), masks.cuda(), segments.cuda())
                    else:
                        pred, _, _ = model(tokens.cuda(), masks.cuda())
                    final_pred_list_test.append(pred.data.cpu().numpy())
            final_pred_list_test = np.argmax(np.concatenate(final_pred_list_test, axis=0), axis=1)
            # get_metrics(load_dev_labels('data/test.txt'), final_pred_list_test)

        if is_diverged:
            print("Reinitialize model ...")
            del model
            continue

        all_fold_results.append(pred_list_test_best)
        real_test_results.append(final_pred_best)
        del model
        break
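# The reweighting pattern above recurs in several loops: with reduce=False the
# criterion returns one loss per sample, torch.gather looks up each sample's class
# weight, and the dot product divided by N yields a weighted mean. A standalone
# illustration with made-up values:
import torch
per_sample_loss = torch.tensor([0.9, 0.2, 1.5])      # criterion(..., reduce=False)
targets = torch.tensor([0, 3, 3])                    # class index per sample
class_weights = torch.tensor([0.3, 0.3, 0.3, 1.7])   # e.g. weight_list for opt.w == 1
sample_weights = torch.gather(class_weights, 0, targets)  # -> [0.3, 1.7, 1.7]
weighted_mean = torch.matmul(sample_weights, per_sample_loss) / targets.shape[0]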
def one_fold(num_fold, train_index, dev_index):
    print("Training on fold:", num_fold)
    X_train, X_dev = [X[i] for i in train_index], [X[i] for i in dev_index]
    y_train, y_dev = y[train_index], y[dev_index]

    # construct data loaders
    train_data_set = TrainDataSet(X_train, y_train, CONV_PAD_LEN, SENT_PAD_LEN,
                                  word2id, use_unk=True)
    dev_data_set = TrainDataSet(X_dev, y_dev, CONV_PAD_LEN, SENT_PAD_LEN,
                                word2id, use_unk=True)
    dev_data_loader = DataLoader(dev_data_set, batch_size=BATCH_SIZE, shuffle=False)
    # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    pred_list_test_best = None
    final_pred_best = None
    # If the model diverges, reinitialize and retrain.
    while True:
        is_diverged = False
        # Model is defined in HierarchicalPredictor
        model = HierarchicalPredictor(SENT_EMB_DIM, SENT_HIDDEN_SIZE, num_of_vocab,
                                      USE_ELMO=True, ADD_LINEAR=False)
        model.load_embedding(emb)
        model.deepmoji_model.load_specific_weights(PRETRAINED_PATH,
                                                   exclude_names=['output_layer'])
        model.cuda()
        # model = nn.DataParallel(model)
        # model.to(device)

        optimizer = optim.Adam(model.parameters(), lr=learning_rate, amsgrad=True)
        # optimizer = optim.SGD(model.parameters(), lr=learning_rate)
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=opt.gamma)

        if opt.w == 1:
            weight_list = [0.3, 0.3, 0.3, 1.7]
            weight_list_binary = [2 - weight_list[-1], weight_list[-1]]
        elif opt.w == 2:
            weight_list = [0.3198680179, 0.246494733, 0.2484349259, 1.74527696]
            weight_list_binary = [2 - weight_list[-1], weight_list[-1]]
        else:
            raise ValueError
        weight_list = [x ** FLAT for x in weight_list]
        weight_label = torch.Tensor(weight_list).cuda()
        weight_list_binary = [x ** FLAT for x in weight_list_binary]
        weight_binary = torch.Tensor(weight_list_binary).cuda()
        print('classification reweight:', weight_list)
        print('binary loss reweight =', weight_list_binary)

        # loss_criterion_binary = nn.CrossEntropyLoss(weight=weight_list_binary)
        if opt.loss == 'focal':
            loss_criterion = FocalLoss(gamma=opt.focal, reduce=False)
            loss_criterion_binary = FocalLoss(gamma=opt.focal, reduce=False)
        elif opt.loss == 'ce':
            loss_criterion = nn.CrossEntropyLoss(reduce=False)
            loss_criterion_binary = nn.CrossEntropyLoss(reduce=False)
        loss_criterion_emo_only = nn.MSELoss()

        es = EarlyStopping(patience=EARLY_STOP_PATIENCE)
        # best_model = None
        final_pred_list_test = None
        pred_list_test = None
        for num_epoch in range(MAX_EPOCH):
            # recreate the loader so the data is reshuffled every epoch
            train_data_loader = DataLoader(train_data_set, batch_size=BATCH_SIZE,
                                           shuffle=True)
            print('Begin training epoch:', num_epoch, end='...\t')
            sys.stdout.flush()

            # step the scheduler
            scheduler.step(num_epoch)
            print('Current learning rate', scheduler.get_lr())

            train_loss = 0
            model.train()
            for i, (a, a_len, b, b_len, c, c_len, emoji_a, emoji_b, emoji_c,
                    e_c, e_c_binary, e_c_emo) in tqdm(
                        enumerate(train_data_loader),
                        total=len(train_data_set) / BATCH_SIZE):
                optimizer.zero_grad()
                elmo_a = elmo_encode(a)
                elmo_b = elmo_encode(b)
                elmo_c = elmo_encode(c)
                pred, pred2, pred3 = model(a.cuda(), a_len, b.cuda(), b_len,
                                           c.cuda(), c_len, emoji_a.cuda(),
                                           emoji_b.cuda(), emoji_c.cuda(),
                                           elmo_a, elmo_b, elmo_c)
                loss_label = loss_criterion(pred, e_c.view(-1).cuda()).cuda()
                loss_label = torch.matmul(torch.gather(weight_label, 0, e_c.view(-1).cuda()),
                                          loss_label) / e_c.view(-1).shape[0]
                loss_binary = loss_criterion_binary(pred2, e_c_binary.view(-1).cuda()).cuda()
                loss_binary = torch.matmul(torch.gather(weight_binary, 0, e_c_binary.view(-1).cuda()),
                                           loss_binary) / e_c.view(-1).shape[0]
                loss_emo = loss_criterion_emo_only(pred3, e_c_emo.cuda())
                # "training trilogy": combine the three losses
                loss = (loss_label + LAMBDA1 * loss_binary +
                        LAMBDA2 * loss_emo) / float(1 + LAMBDA1 + LAMBDA2)

                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
                optimizer.step()
                train_loss += loss.data.cpu().numpy() * a.shape[0]
                del pred, loss, elmo_a, elmo_b, elmo_c, e_c_emo, loss_binary, loss_label, loss_emo

            # Evaluate
            model.eval()
            dev_loss = 0
            for i, (a, a_len, b, b_len, c, c_len, emoji_a, emoji_b, emoji_c,
                    e_c, e_c_binary, e_c_emo) in enumerate(dev_data_loader):
                with torch.no_grad():
                    elmo_a = elmo_encode(a)
                    elmo_b = elmo_encode(b)
                    elmo_c = elmo_encode(c)
                    pred, pred2, pred3 = model(a.cuda(), a_len, b.cuda(), b_len,
                                               c.cuda(), c_len, emoji_a.cuda(),
                                               emoji_b.cuda(), emoji_c.cuda(),
                                               elmo_a, elmo_b, elmo_c)
                    loss_label = loss_criterion(pred, e_c.view(-1).cuda()).cuda()
                    loss_label = torch.matmul(torch.gather(weight_label, 0, e_c.view(-1).cuda()),
                                              loss_label) / e_c.view(-1).shape[0]
                    loss_binary = loss_criterion_binary(pred2, e_c_binary.view(-1).cuda()).cuda()
                    loss_binary = torch.matmul(torch.gather(weight_binary, 0, e_c_binary.view(-1).cuda()),
                                               loss_binary) / e_c.view(-1).shape[0]
                    loss_emo = loss_criterion_emo_only(pred3, e_c_emo.cuda())
                    loss = (loss_label + LAMBDA1 * loss_binary +
                            LAMBDA2 * loss_emo) / float(1 + LAMBDA1 + LAMBDA2)
                    dev_loss += loss.data.cpu().numpy() * a.shape[0]
                    del pred, loss, elmo_a, elmo_b, elmo_c, e_c_emo, loss_binary, loss_label, loss_emo

            print('Training loss:', train_loss / len(train_data_set), end='\t')
            print('Dev loss:', dev_loss / len(dev_data_set))

            if dev_loss / len(dev_data_set) > 1.3 and num_epoch > 4:
                print("Model diverged, retry")
                is_diverged = True
                break

            if es.step(dev_loss):  # overfitting
                print('overfitting, loading best model ...')
                break
            else:
                if es.is_best():
                    print('saving best model ...')
                    if final_pred_best is not None:
                        del final_pred_best
                    final_pred_best = deepcopy(final_pred_list_test)
                    if pred_list_test_best is not None:
                        del pred_list_test_best
                    pred_list_test_best = deepcopy(pred_list_test)
                else:
                    print('not best model, ignoring ...')
                    if final_pred_best is None:
                        final_pred_best = deepcopy(final_pred_list_test)
                    if pred_list_test_best is None:
                        pred_list_test_best = deepcopy(pred_list_test)

            # Gold Dev testing
            print('Gold Dev testing ...')
            pred_list_test = []
            model.eval()
            for i, (a, a_len, b, b_len, c, c_len, emoji_a, emoji_b,
                    emoji_c) in enumerate(gold_dev_data_loader):
                with torch.no_grad():
                    elmo_a = elmo_encode(a)  # , __id2word=ex_id2word
                    elmo_b = elmo_encode(b)
                    elmo_c = elmo_encode(c)
                    pred, _, _ = model(a.cuda(), a_len, b.cuda(), b_len,
                                       c.cuda(), c_len, emoji_a.cuda(),
                                       emoji_b.cuda(), emoji_c.cuda(),
                                       elmo_a, elmo_b, elmo_c)
                    pred_list_test.append(pred.data.cpu().numpy())
                    del elmo_a, elmo_b, elmo_c, a, b, c, pred
            pred_list_test = np.argmax(np.concatenate(pred_list_test, axis=0), axis=1)
            # get_metrics(load_dev_labels('data/dev.txt'), pred_list_test)

            # Gold test testing
            print('Gold test testing ...')
            final_pred_list_test = []
            model.eval()
            for i, (a, a_len, b, b_len, c, c_len, emoji_a, emoji_b,
                    emoji_c) in enumerate(test_data_loader):
                with torch.no_grad():
                    elmo_a = elmo_encode(a)  # , __id2word=ex_id2word
                    elmo_b = elmo_encode(b)
                    elmo_c = elmo_encode(c)
                    pred, _, _ = model(a.cuda(), a_len, b.cuda(), b_len,
                                       c.cuda(), c_len, emoji_a.cuda(),
                                       emoji_b.cuda(), emoji_c.cuda(),
                                       elmo_a, elmo_b, elmo_c)
                    final_pred_list_test.append(pred.data.cpu().numpy())
                    del elmo_a, elmo_b, elmo_c, a, b, c, pred
            final_pred_list_test = np.argmax(np.concatenate(final_pred_list_test, axis=0), axis=1)
            # get_metrics(load_dev_labels('data/test.txt'), final_pred_list_test)

        if is_diverged:
            print("Reinitialize model ...")
            del model
            continue

        all_fold_results.append(pred_list_test_best)
        real_test_results.append(final_pred_best)
        del model
        break
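# `elmo_encode` is not shown in the source. A plausible sketch based on the inline
# ELMo usage in the first train() above (allennlp's batch_to_ids plus a pre-built
# `elmo` module, averaging the two output layers); the `tokenizer.decode_ids`
# mapping from id tensors back to words is an assumption carried over from there.
def elmo_encode(ids):
    data_text = [tokenizer.decode_ids(x) for x in ids]  # token-id tensor -> words
    with torch.no_grad():
        character_ids = batch_to_ids(data_text).cuda()
        emb = elmo(character_ids)['elmo_representations']
        emb = (emb[0] + emb[1]) / 2  # average of the two ELMo layers
    return emb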
def one_fold(num_fold, train_index, dev_index):
    print("Training on fold:", num_fold)
    X_train, X_dev = [X[i] for i in train_index], [X[i] for i in dev_index]
    y_train, y_dev = y[train_index], y[dev_index]

    # construct data loaders
    # for one fold, test data comes from the k-fold split
    train_data_set = TrainDataSet(X_train, y_train, EMAI_PAD_LEN, SENT_PAD_LEN,
                                  word2id, use_unk=True)
    dev_data_set = TrainDataSet(X_dev, y_dev, EMAI_PAD_LEN, SENT_PAD_LEN,
                                word2id, use_unk=True)
    dev_data_loader = DataLoader(dev_data_set, batch_size=BATCH_SIZE, shuffle=False)
    # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    final_pred_best = None
    # If the model diverges, reinitialize and retrain.
    while True:
        is_diverged = False
        # Model is defined in HierarchicalAttPredictor
        model = HierarchicalAttPredictor(SENT_EMB_DIM, SENT_HIDDEN_SIZE, CTX_LSTM_DIM,
                                         num_of_vocab, SENT_PAD_LEN, id2word,
                                         USE_ELMO=True, ADD_LINEAR=False)
        model.load_embedding(emb)
        model.deepmoji_model.load_specific_weights(PRETRAINED_PATH,
                                                   exclude_names=['output_layer'])
        model.cuda()
        # model = nn.DataParallel(model)
        # model.to(device)

        optimizer = optim.Adam(model.parameters(), lr=learning_rate, amsgrad=True)
        # optimizer = optim.SGD(model.parameters(), lr=learning_rate)
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=GAMMA)

        # loss_criterion_binary = nn.CrossEntropyLoss(weight=weight_list_binary)
        if opt.loss == 'focal':
            loss_criterion = FocalLoss(gamma=opt.focal)
        elif opt.loss == 'ce':
            loss_criterion = nn.BCELoss()

        es = EarlyStopping(patience=EARLY_STOP_PATIENCE)
        final_pred_list_test = None
        result_print = {}
        for num_epoch in range(MAX_EPOCH):
            # recreate the loader so the data is reshuffled every epoch
            train_data_loader = DataLoader(train_data_set, batch_size=BATCH_SIZE,
                                           shuffle=True)
            print('Begin training epoch:', num_epoch, end='...\t')
            sys.stdout.flush()

            # step the scheduler
            scheduler.step(num_epoch)
            print('Current learning rate', scheduler.get_lr())

            ## Training step
            train_loss = 0
            model.train()
            for i, (a, a_len, emoji_a, e_c) in tqdm(
                    enumerate(train_data_loader),
                    total=len(train_data_set) / BATCH_SIZE):
                optimizer.zero_grad()
                e_c = e_c.type(torch.float)
                pred = model(a.cuda(), a_len, emoji_a.cuda())
                loss_label = loss_criterion(pred.squeeze(1), e_c.view(-1).cuda()).cuda()

                loss_label.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
                optimizer.step()
                train_loss += loss_label.data.cpu().numpy() * a.shape[0]
                del pred, loss_label

            ## Evaluation step
            model.eval()
            dev_loss = 0
            for i, (a, a_len, emoji_a, e_c) in enumerate(dev_data_loader):
                with torch.no_grad():
                    e_c = e_c.type(torch.float)
                    pred = model(a.cuda(), a_len, emoji_a.cuda())
                    loss_label = loss_criterion(pred.squeeze(1), e_c.view(-1).cuda()).cuda()
                    dev_loss += loss_label.data.cpu().numpy() * a.shape[0]
                    del pred, loss_label

            print('Training loss:', train_loss / len(train_data_set), end='\t')
            print('Dev loss:', dev_loss / len(dev_data_set))

            # Gold test evaluation
            print('Final test testing ...')
            final_pred_list_test = []
            model.eval()
            for i, (a, a_len, emoji_a) in enumerate(final_test_data_loader):
                with torch.no_grad():
                    pred = model(a.cuda(), a_len, emoji_a.cuda())
                    final_pred_list_test.append(pred.data.cpu().numpy())
                    del a, pred
            print("final_pred_list_test", len(final_pred_list_test))
            final_pred_list_test = np.concatenate(final_pred_list_test, axis=0)
            final_pred_list_test = np.squeeze(final_pred_list_test, axis=1)
            print("final_pred_list_test_concat", len(final_pred_list_test))
            accuracy, precision, recall, f1 = get_metrics(
                np.asarray(final_test_target_list), np.asarray(final_pred_list_test))
            result_print.update({num_epoch: [accuracy, precision, recall, f1]})

            if dev_loss / len(dev_data_set) > 1.3 and num_epoch > 4:
                print("Model diverged, retry")
                is_diverged = True
                break

            if es.step(dev_loss):  # overfitting
                print('overfitting, loading best model ...')
                break
            else:
                if es.is_best():
                    print('saving best model ...')
                    if final_pred_best is not None:
                        del final_pred_best
                    final_pred_best = deepcopy(final_pred_list_test)
                else:
                    print('not best model, ignoring ...')
                    if final_pred_best is None:
                        final_pred_best = deepcopy(final_pred_list_test)

        with open(result_path, 'wb') as w:
            pkl.dump(result_print, w)

        if is_diverged:
            print("Reinitialize model ...")
            del model
            continue

        real_test_results.append(np.asarray(final_pred_best))
        # save the model for inference
        torch.save(model.state_dict(), opt.out_path)
        del model
        break
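# `get_metrics` returns (accuracy, precision, recall, f1) above but is defined
# elsewhere. A minimal sklearn-based sketch for this binary setting; thresholding
# the sigmoid scores at 0.5 is an assumption, not from the source.
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def get_metrics(gold, pred, threshold=0.5):
    pred_label = (np.asarray(pred) > threshold).astype(int)
    accuracy = accuracy_score(gold, pred_label)
    precision, recall, f1, _ = precision_recall_fscore_support(
        gold, pred_label, average='binary')
    return accuracy, precision, recall, f1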
# training-phase setup
model = PCB(class_num=4768)

# load pretrained parameters (without the classifier)
if use_gpu:
    model = model.cuda()

# set the criteria
triplet_selector = SemihardNegativeTripletSelector(opt.margin)
criterion_tri = OnlineTripletLoss(opt.margin, triplet_selector)
criterion_part = nn.CrossEntropyLoss()
# criterion_part = CrossEntropyLabelSmooth(4768)
criterion_center = CenterLoss(4768)
criterion_focal = FocalLoss(gamma=2)

# parameter update rule: the backbone fc and the part classifiers are
# separated from the pretrained base parameters
ignored_params = list(map(id, model.model.fc.parameters()))
ignored_params += (list(map(id, model.classifier0.parameters())) +
                   list(map(id, model.classifier1.parameters())) +
                   list(map(id, model.classifier2.parameters())) +
                   list(map(id, model.classifier3.parameters())) +
                   list(map(id, model.classifier4.parameters())) +
                   list(map(id, model.classifier5.parameters())) +
                   list(map(id, model.classifier6.parameters()))
                   # + list(map(id, model.classifier7.parameters()))
                   )
base_params = filter(lambda p: id(p) not in ignored_params, model.parameters())
optimizer_ft = optim.SGD(
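    # The source is truncated after "optim.SGD(". The argument list below is an
    # assumed completion: per-group learning rates, smaller for the pretrained
    # backbone than for the new heads, are typical for PCB training. The exact
    # values and hyperparameters are not from the source.
    [{'params': base_params, 'lr': 0.01},
     {'params': model.model.fc.parameters(), 'lr': 0.1},
     {'params': model.classifier0.parameters(), 'lr': 0.1},
     {'params': model.classifier1.parameters(), 'lr': 0.1},
     {'params': model.classifier2.parameters(), 'lr': 0.1},
     {'params': model.classifier3.parameters(), 'lr': 0.1},
     {'params': model.classifier4.parameters(), 'lr': 0.1},
     {'params': model.classifier5.parameters(), 'lr': 0.1},
     {'params': model.classifier6.parameters(), 'lr': 0.1}],
    momentum=0.9, weight_decay=5e-4, nesterov=True)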
def train(args):
    # step 0: parse config
    best_acc = 0
    new_config = {
        "model": args.model,
        "num_workers": args.num_workers,
        "batch_size": args.batch_size,
        "load_model_path": args.load_model_path,
    }
    opt.parse(new_config)

    # step 1: model
    model = getattr(models, opt.model)()

    # step 2: data
    dataset = getattr(datasets, opt.dataset)
    train_data = dataset(opt.train_data_root, train=True)
    val_data = dataset(opt.train_data_root, train=False)
    train_dataloader = DataLoader(train_data, opt.batch_size, pin_memory=True,
                                  shuffle=True, num_workers=opt.num_workers)
    val_dataloader = DataLoader(val_data, opt.batch_size, pin_memory=True,
                                shuffle=False, num_workers=opt.num_workers)

    # step 3: criterion and optimizer
    # criterion = torch.nn.CrossEntropyLoss()
    criterion = FocalLoss(gamma=2.0)
    lr = opt.lr
    optimizer = torch.optim.Adam(model.parameters(), opt.lr,
                                 weight_decay=opt.weight_decay)

    # step 4: meters
    loss_meter = meter.AverageValueMeter()
    acc_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(2)
    previous_loss = 1e10

    if opt.load_model_path is None:
        opt.load_model_path = get_lastest_model(prefix=opt.model)

    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model).cuda()
    else:
        model.to(opt.device)

    if opt.load_model_path:
        model.load_state_dict(torch.load(opt.load_model_path))
        model.eval()
        _, best_acc = val(model, val_dataloader)
        logging.info("Resuming from " + opt.load_model_path +
                     " with acc: " + str(best_acc))

    prefix = 'output/' + opt.model

    # train
    for epoch in range(opt.max_epoch):
        model.train()
        loss_meter.reset()
        acc_meter.reset()
        confusion_matrix.reset()
        nIters = len(train_dataloader)
        pbar = tqdm(train_dataloader)
        start = time.time()
        for iter, (data, label) in enumerate(pbar):
            # train model
            input = data.to(opt.device)
            target = label.to(opt.device)
            optimizer.zero_grad()
            y_pred = model(input)
            loss = criterion(y_pred, target)
            prec1 = accuracy(y_pred.data, target)
            loss.backward()
            optimizer.step()

            # update meters
            loss_meter.add(loss.item())
            acc_meter.add(prec1[0].item())
            confusion_matrix.add(y_pred.detach(), target.detach())

            if sys.stderr.isatty():
                log_str = "{epoch}: Loss:{loss.val:.5f} Acc:{acc.val:.3f}".format(
                    epoch=epoch, loss=loss_meter, acc=acc_meter)
                pbar.set_description(log_str)
            else:
                if iter % opt.print_freq == 0:
                    log_str = "{iter}/{len}: Loss:{loss.val:.5f} Acc:{acc.val:.3f}".format(
                        iter=iter, len=nIters, loss=loss_meter, acc=acc_meter)
                    logging.info(log_str)
        logging.info(log_str)

        # validate and visualize
        end = time.time()
        if not sys.stderr.isatty():
            logging.info(str(epoch) + ": time " + str(end - start) + "s")
        val_cm, val_accuracy = val(model, val_dataloader)
        if val_accuracy > best_acc:
            best_acc = val_accuracy
            # name = time.strftime(prefix + '_%m%d_%H:%M:%S.pth')
            name = prefix + "_best.pth"
            torch.save(model.state_dict(), name)
        torch.save(model.state_dict(), prefix + "_last.pth")
        logging.info("Val {epoch}: Loss: {loss}, Acc: {acc}, lr: {lr}".format(
            epoch=epoch, acc=val_accuracy, loss=loss_meter.value()[0], lr=lr))
        # logging.info("confusion_matrix: {val_cm}".format(val_cm=str(val_cm.value())))

        # decay the learning rate if the training loss stopped decreasing
        if loss_meter.value()[0] > previous_loss:
            if lr > 1e-5:
                lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        previous_loss = loss_meter.value()[0]
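# `accuracy(y_pred.data, target)` above returns a sequence whose first element is
# the batch top-1 precision, but its definition is not in the source. A minimal
# sketch of the usual top-k helper it resembles; the exact behavior is an assumption.
def accuracy(output, target, topk=(1,)):
    maxk = max(topk)
    batch_size = target.size(0)
    _, pred = output.topk(maxk, 1, True, True)  # indices of the top-k classes
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))  # percentage correct at k
    return res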
                                       weight_decay=weigth_decay)
        elif optim_type.lower() == 'sgd':
            optimizer = optim.SGD(model.parameters(), lr=lr,
                                  weight_decay=weigth_decay)
        else:
            raise Exception('Other optimizers are not supported')

        if loss_type.lower() == 'cross_entropy':
            if torch.cuda.is_available():
                criterion = nn.CrossEntropyLoss().cuda()
            else:
                criterion = nn.CrossEntropyLoss()
        elif loss_type.lower() == 'focal':
            if torch.cuda.is_available():
                criterion = FocalLoss(n_label).cuda()
            else:
                criterion = FocalLoss(n_label)
        elif loss_type.lower() == 'binary_cross_entropy':
            if torch.cuda.is_available():
                criterion = nn.BCEWithLogitsLoss().cuda()
            else:
                criterion = nn.BCEWithLogitsLoss()
        else:
            raise Exception('%s loss is not supported' % loss_type.lower())

        print_progress('TRAIN config done')
    else:
        raise Exception('TRAIN should be configured in the config file')
else:
    if 'DECODE' in sessions:
        print_progress('Start DECODE config')
model.backbone.apply(weights_init)
# TODO@LYC: init the head as well
# model.head.apply(weights_init_without_kaiming)  # not very effective
print("model initiated without pretrain")
for p in model.parameters():
    p.requires_grad = True

print("\tLearning Rate:", LEARNING_RATE)
print("\tBatch Size:", batch_size)
print()

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
scheduler = WarmupMultiStepLR(optimizer, milestones=[],
                              warmup_iters=len(train_loader))
criterion = FocalLoss()

USE_NORMAL_LOSS = True  # truncate unusually large losses

for epoch in range(max_epochs):
    if USE_NORMAL_LOSS:
        normal_loss = -1
    for it, images in enumerate(train_loader):
        layout_image = images[0].to(device)
        heat_image = images[1].to(device)
        m = model(heat_image)

        # loss
        a = F.binary_cross_entropy(m, layout_image, reduction="mean")  # BCE value (unused in the visible snippet)
        loss = criterion(m, layout_image)
        if USE_NORMAL_LOSS and epoch > 0:
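            # The source is truncated here. An assumed completion consistent with
            # the surrounding code (normal_loss reset to -1 each epoch, the
            # "truncate unusually large losses" intent): scale down batches whose
            # loss far exceeds a running reference so outlier batches do not
            # dominate the gradients. This logic is not from the source.
            if normal_loss > 0 and loss.item() > 3.0 * normal_loss:
                loss = loss * (3.0 * normal_loss / loss.item())  # scale the outlier down
            normal_loss = loss.item() if normal_loss < 0 \
                else 0.9 * normal_loss + 0.1 * loss.item()       # running average
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()  # WarmupMultiStepLR steps per iteration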