def test(dim, args):
    """Evaluate the saved LSTM classifier for one dimension on the test split.

    Loads the best checkpoint from weights/LSTM/<dim>/best-weights.pth,
    scores the test set in mini-batches, and reports AUC / recall / accuracy
    both to stdout and to <weight_dir>/scores.txt.

    @param dim: name of the dimension (selects the weight directory / dataset)
    @param args: parsed CLI args; only args.hidden_dim is read here
    """
    import torch
    import numpy as np
    from features import ExtractWordEmbeddings
    from preprocess_data import batchify, padBatch
    from models.lstm import LSTMClassifier
    from sklearn.utils import shuffle
    from sklearn.metrics import roc_auc_score, recall_score, accuracy_score

    # hyperparameters
    is_cuda = True
    batch_size = 60
    embedding_dim = 300
    hidden_dim = args.hidden_dim
    weight_dir = 'weights/LSTM/%s' % dim
    weight_file = join(weight_dir, 'best-weights.pth')
    assert os.path.exists(
        weight_file), "The file directory for the saved model doesn't exist"

    # load datasets
    X_t, y_t = loadDatasetForLSTM(dim, 'test')

    # load model and its saved weights
    model = LSTMClassifier(embedding_dim=embedding_dim, hidden_dim=hidden_dim)
    state_dict = torch.load(weight_file)
    model.load_state_dict(state_dict)
    if is_cuda:
        model.cuda()
    em = ExtractWordEmbeddings(emb_type='glove')

    # score the test set; y_scores stays aligned with the (shuffled) y_t
    y_scores = []
    X_t, y_t = shuffle(X_t, y_t)
    val_batches = batchify(X_t, y_t, batch_size)
    model.eval()
    with torch.no_grad():
        for X_b, y_b in val_batches:
            inputs = torch.tensor(
                padBatch([
                    em.obtain_vectors_from_sentence(sent, True)
                    for sent in X_b
                ])).float()
            targets = torch.tensor(y_b, dtype=torch.float32)
            if is_cuda:
                inputs, targets = inputs.cuda(), targets.cuda()
            outputs = model(inputs).tolist()
            y_scores.extend(outputs)

    # threshold the scores at 0.5 for the hard-label metrics
    # (was: np.array(np.array(...) >= 0.5, dtype=int) — same result, clearer)
    y_preds = (np.array(y_scores) >= 0.5).astype(int)
    auc = roc_auc_score(y_true=y_t, y_score=y_scores)
    rec = recall_score(y_true=y_t, y_pred=y_preds)
    acc = accuracy_score(y_true=y_t, y_pred=y_preds)
    print('AUC: ', round(auc, 2))
    print('REC: ', round(rec, 2))
    print('ACC: ', round(acc, 2))
    with open(join(weight_dir, 'scores.txt'), 'w') as f:
        f.write('AUC: %1.2f\n' % auc)
        f.write('REC: %1.2f\n' % rec)
        f.write('ACC: %1.2f\n' % acc)
    return
class LSTMTrainer(Trainer):
    """Trainer for an LSTMClassifier optimized with cross-entropy loss."""

    def __init__(self, opt, emb_matrix=None):
        self.opt = opt
        self.emb_matrix = emb_matrix
        self.model = LSTMClassifier(opt, emb_matrix=emb_matrix)
        self.criterion = nn.CrossEntropyLoss()
        # only parameters that require gradients are handed to the optimizer
        self.parameters = list(
            filter(lambda p: p.requires_grad, self.model.parameters()))
        if opt['cuda']:
            self.model.cuda()
            self.criterion.cuda()
        self.optimizer = torch_utils.get_optimizer(opt['optim'],
                                                   self.parameters, opt['lr'])

    def update(self, batch):
        """Run a single optimization step on one batch; return the loss."""
        inputs, labels = unpack_batch(batch)
        # forward pass
        self.model.train()
        self.optimizer.zero_grad()
        logits = self.model(inputs)
        loss = self.criterion(logits, labels)
        batch_loss = loss.item()
        # backward pass with gradient clipping, then parameter update
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                       self.opt['max_grad_norm'])
        self.optimizer.step()
        return batch_loss

    def predict(self, batch, unsort=True):
        """Evaluate one batch; return (predictions, probs, labels, loss)."""
        inputs, labels = unpack_batch(batch)
        self.model.eval()
        logits = self.model(inputs)
        batch_loss = self.criterion(logits, labels).item()
        probs = F.softmax(logits, 1).data.cpu().numpy().tolist()
        predictions = np.argmax(logits.data.cpu().numpy(), axis=1).tolist()
        gold = labels.data.cpu().numpy().tolist()
        return predictions, probs, gold, batch_loss
def train(dim, args):
    """Train an LSTM classifier for one social dimension.

    Trains on the 'train' split, validates on 'dev' after each epoch, and
    checkpoints the best model (lowest validation loss) to
    weights/LSTM/<dim>/best-weights.pth.  Training stops after
    args.max_epochs epochs, or once the validation loss has failed to
    improve n_decreases times in a row (early stopping).

    @param dim: name of the dimension (selects the dataset / save directory)
    @param args: parsed CLI args; reads hidden_dim, max_epochs and lr
    """
    import torch
    from torch import nn, optim
    import numpy as np
    from features import ExtractWordEmbeddings
    from preprocess_data import batchify, padBatch
    from models.lstm import LSTMClassifier
    from sklearn.utils import shuffle

    # hyperparameters
    embedding_dim = 300  # changes only with different word embeddings
    hidden_dim = args.hidden_dim
    max_epochs = args.max_epochs
    is_cuda = True
    batch_size = 60
    lr = args.lr
    n_decreases = 10  # early-stopping patience
    save_dir = 'weights/LSTM/%s' % dim
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    """
    Loading train / validation datasets
    X_tr: a list of tokenized sentences
    y_tr: a list of 0 and 1
    """
    X_tr, y_tr = loadDatasetForLSTM(dim, 'train')  # a list of tokenized sentences
    X_d, y_d = loadDatasetForLSTM(dim, 'dev')

    # load model and settings for training
    model = LSTMClassifier(embedding_dim=embedding_dim, hidden_dim=hidden_dim)
    if is_cuda:
        model.cuda()
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    flag = True
    old_val = np.inf  # previous validation error
    em = ExtractWordEmbeddings(emb_type='glove')
    loss_fn = nn.BCELoss()

    # train model
    epoch = 0
    cnt_decrease = 0
    while flag:
        tr_loss = 0.0
        epoch += 1
        # BUGFIX: use short-circuit 'or' instead of bitwise '|' on booleans
        if (epoch > max_epochs) or (cnt_decrease > n_decreases):
            break

        # train
        model.train()
        # for each iteration, shuffles X_tr and y_tr and puts them into batches
        X_tr, y_tr = shuffle(X_tr, y_tr)
        tr_batches = batchify(X_tr, y_tr, batch_size)
        for X_b, y_b in tr_batches:
            # X_b is still a list of tokenized sentences (list of list of words)
            optimizer.zero_grad()
            """
            obtain_vectors_from_sentence(sent=list of words, include_unk=True)
              : changes each word into an embedding, and returns a list of embeddings
            padBatch(list of embedding lists, max_seq=None)
              : for each batch, returns a tensor fixed to the max size, applies zero padding
            """
            inputs = torch.tensor(
                padBatch([
                    em.obtain_vectors_from_sentence(sent, True)
                    for sent in X_b
                ])).float()  # here, inputs become a tensor of shape (B * seq_len * dim)
            targets = torch.tensor(y_b, dtype=torch.float32)
            if is_cuda:
                inputs, targets = inputs.cuda(), targets.cuda()
            outputs = model(inputs)
            loss = loss_fn(outputs, targets)
            loss.backward()
            tr_loss += loss.item()
            optimizer.step()
        print("[Epoch %d] train loss: %1.3f" % (epoch, tr_loss))

        # validate
        model.eval()
        current_loss = 0.0
        X_d, y_d = shuffle(X_d, y_d)
        val_batches = batchify(X_d, y_d, batch_size)
        with torch.no_grad():
            for X_b, y_b in val_batches:
                inputs = torch.tensor(
                    padBatch([
                        em.obtain_vectors_from_sentence(sent, True)
                        for sent in X_b
                    ])).float()
                targets = torch.tensor(y_b, dtype=torch.float32)
                if is_cuda:
                    inputs, targets = inputs.cuda(), targets.cuda()
                outputs = model(inputs)
                loss = loss_fn(outputs, targets)
                current_loss += loss.item()
        print("[Epoch %d] validation loss: %1.3f" % (epoch, current_loss))

        if current_loss < old_val:
            # if current round is better than the previous round
            best_state = model.state_dict()  # save this model
            torch.save(best_state, join(save_dir, 'best-weights.pth'))
            print("Updated model")
            old_val = current_loss
            cnt_decrease = 0
        else:
            # if the current round is doing worse
            cnt_decrease += 1

        if cnt_decrease >= n_decreases:
            flag = False
    return
def __init__(self,
             models_dir='./models/lstm_trained_models',
             embeddings_dir='./embeddings',
             is_cuda=False):
    """
    @param models_dir: the directory where the LSTM models are stored
    @param embeddings_dir: the directory where the embeddings are stored.
        The directory must contain the following subdirectories:
            word2vec/GoogleNews-vectors-negative300.wv
            fasttext/wiki-news-300d-1M-subword.wv
            glove/glove.42B.300d.wv
    @param is_cuda: to enable cuda
    """
    self.is_cuda = is_cuda
    self.models_dir = models_dir
    self.embeddings_dir = embeddings_dir

    # load embeddings (one extractor per embedding family)
    self.em_glove = ExtractWordEmbeddings('glove', emb_dir=self.embeddings_dir)
    self.em_word2vec = ExtractWordEmbeddings('word2vec',
                                             emb_dir=self.embeddings_dir)
    self.em_fasttext = ExtractWordEmbeddings('fasttext',
                                             emb_dir=self.embeddings_dir)
    self.dimensions_list = [
        'support', 'knowledge', 'conflict', 'power', 'similarity', 'fun',
        'status', 'trust', 'identity', 'romance'
    ]

    # load models: for each dimension, take the first '-best.lstm' file whose
    # name contains the dimension, and pair it with the matching embeddings
    self.dim2model = {}
    self.dim2embedding = {}
    for dim in self.dimensions_list:
        model = LSTMClassifier(embedding_dim=300, hidden_dim=300)
        if self.is_cuda:
            print(f'Torch version: {torch.__version__}')
            print(f'Torch CUDA available : {torch.cuda.is_available()}')
            if torch.cuda.is_available():
                print(
                    f'Torch current device : {torch.cuda.current_device()}'
                )
                print(f'Torch device count : {torch.cuda.device_count()}')
                print(
                    f'Torch device name : {torch.cuda.get_device_name(0)}')
                model.cuda()
            else:
                print(
                    'Cuda not available. Instantiated the TenDimensionsClassifier with CUDA=False'
                )
                self.is_cuda = False
        model.eval()
        for modelname in os.listdir(self.models_dir):
            # BUGFIX: use logical 'and' (was bitwise '&') on booleans
            if ('-best.lstm' in modelname) and (dim in modelname):
                best_state = torch.load(join(self.models_dir, modelname),
                                        map_location='cpu')
                model.load_state_dict(best_state)
                if 'glove' in modelname:
                    em = self.em_glove
                elif 'word2vec' in modelname:
                    em = self.em_word2vec
                elif 'fasttext' in modelname:
                    em = self.em_fasttext
                else:
                    # BUGFIX: previously 'em' was left unbound (NameError on
                    # the first dim) or kept a stale value from an earlier
                    # dim when the filename named no known embedding type;
                    # fail loudly instead of pairing the wrong embeddings.
                    raise ValueError(
                        'Unknown embedding type in model file: %s' %
                        modelname)
                self.dim2model[dim] = model
                self.dim2embedding[dim] = em
                break