def one_fold(X_train, y_train, X_test, y_test):
    num_labels = NUM_CLASS
    vocab_size = 20000
    pad_len = 30
    batch_size = 100
    embedding_dim = 200
    hidden_dim = 400
    __use_unk = False
    es = EarlyStop(2)

    word2id, id2word = build_vocab(X_train, vocab_size, use_unk=__use_unk)
    train_data = DataSet(X_train, y_train, pad_len, word2id, num_labels, use_unk=__use_unk)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=False)
    test_data = DataSet(X_test, y_test, pad_len, word2id, num_labels, use_unk=__use_unk)
    test_loader = DataLoader(test_data, batch_size=batch_size)

    model = AttentionLSTMClassifier(embedding_dim, hidden_dim, vocab_size, word2id,
                                    num_labels, batch_size, use_att=True)
    model.load_glove_embedding(id2word)
    model.cuda()

    optimizer = optim.Adam(model.parameters())
    loss_criterion = nn.MSELoss()

    for epoch in range(4):
        print('Epoch:', epoch, '===================================')
        train_loss = 0
        for i, (data, seq_len, label) in enumerate(train_loader):
            data, label, seq_len = sort_batch(data, label, seq_len.view(-1))
            y_pred = model(Variable(data).cuda(), seq_len)
            optimizer.zero_grad()
            loss = loss_criterion(y_pred, Variable(label).cuda())
            loss.backward()
            optimizer.step()
            train_loss += loss.data[0]

        # evaluation on the held-out fold
        pred_list = []
        gold_list = []
        test_loss = 0
        for i, (data, seq_len, label) in enumerate(test_loader):
            data, label, seq_len = sort_batch(data, label, seq_len.view(-1))
            y_pred = model(Variable(data, volatile=True).cuda(), seq_len)
            loss = loss_criterion(y_pred, Variable(label, volatile=True).cuda())
            test_loss += loss.data[0]
            pred_list.append(y_pred.data.cpu().numpy())
            gold_list.append(label.numpy())

        print("Train Loss: ", train_loss, " Evaluation: ", test_loss)
        es.new_loss(test_loss)
        if es.if_stop():
            print('Start overfitting')
            break

    return np.concatenate(pred_list, axis=0), np.concatenate(gold_list, axis=0)
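
# All of the one_fold variants below call sort_batch(data, label, seq_len.view(-1)) before
# feeding the LSTM. Its implementation is not shown in this section; the sketch below is a
# minimal guess at what it does, assuming the batch is reordered by descending sequence
# length so the model can use torch.nn.utils.rnn.pack_padded_sequence. The function body
# is an assumption based only on how it is called above.
import torch

def sort_batch(data, label, seq_len):
    """Sort a padded batch (data, label, seq_len) by descending sequence length."""
    sorted_len, sorted_idx = torch.sort(seq_len, dim=0, descending=True)
    return data[sorted_idx], label[sorted_idx], sorted_len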
def one_fold(fold_int, is_nine_folds):
    fold_id = str(fold_int)
    if is_nine_folds:
        fold_path = 'data/Folds_9_Emotions/fold_' + fold_id
        num_labels = 9
    else:
        fold_path = 'data/Folds/fold_' + fold_id
        num_labels = 16

    pad_len = 30
    batch_size = 64
    hidden_dim = 400
    es = EarlyStop(2)

    word2id, id2word = build_vocab(fold_path, fold_id, use_unk=True)
    # bag-of-words embedding: one embedding dimension per vocabulary entry
    embedding_dim = len(word2id)
    vocab_size = len(word2id)

    train_data = DataSet(os.path.join(fold_path, 'train.csv'), pad_len, word2id, num_labels)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    test_data = DataSet(os.path.join(fold_path, 'test.csv'), pad_len, word2id, num_labels)
    test_loader = DataLoader(test_data, batch_size=batch_size)

    model = AttentionLSTMClassifier(embedding_dim, hidden_dim, vocab_size, word2id,
                                    num_labels, batch_size, use_att=True)
    model.load_bog_embedding(word2id)
    model.cuda()

    # only optimize parameters with requires_grad=True (e.g. skip a frozen embedding)
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))
    loss_criterion = nn.BCELoss()

    for epoch in range(4):
        print('Epoch:', epoch, '===================================')
        train_loss = 0
        for i, (data, seq_len, label) in enumerate(train_loader):
            data, label, seq_len = sort_batch(data, label, seq_len.view(-1))
            y_pred = model(Variable(data).cuda(), seq_len)
            optimizer.zero_grad()
            loss = loss_criterion(y_pred, Variable(label).cuda())
            loss.backward()
            optimizer.step()
            train_loss += loss.data[0]

        pred_list = []
        gold_list = []
        test_loss = 0
        for i, (data, seq_len, label) in enumerate(test_loader):
            data, label, seq_len = sort_batch(data, label, seq_len.view(-1))
            y_pred = model(Variable(data, volatile=True).cuda(), seq_len)
            loss = loss_criterion(y_pred, Variable(label, volatile=True).cuda())
            test_loss += loss.data[0]
            pred_list.append(y_pred.data.cpu().numpy())
            gold_list.append(label.numpy())

        print("Train Loss: ", train_loss, " Evaluation: ", test_loss)
        es.new_loss(test_loss)
        if es.if_stop():
            print('Start overfitting')
            break

    return np.concatenate(pred_list, axis=0), np.concatenate(gold_list, axis=0)
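
# Every variant here uses EarlyStop(2): the evaluation loss is reported once per epoch via
# new_loss(), and if_stop() decides when to break out of training. The class itself lives
# elsewhere in the repo; the sketch below is one plausible, patience-based implementation
# consistent with that usage, not the repo's actual code.
class EarlyStop:
    def __init__(self, patience):
        self.patience = patience        # number of non-improving epochs to tolerate
        self.best = float('inf')        # best (lowest) evaluation loss seen so far
        self.bad_epochs = 0

    def new_loss(self, loss):
        if loss < self.best:
            self.best = loss
            self.bad_epochs = 0
        else:
            self.bad_epochs += 1

    def if_stop(self):
        return self.bad_epochs >= self.patience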
def one_fold(X_train, y_train, X_dev, y_dev, class_weight):
    num_labels = NUM_CLASS
    vocab_size = 20000
    pad_len = 40
    batch_size = 64
    embedding_dim = 200
    hidden_dim = 500
    __use_unk = False

    word2id, id2word = build_vocab(X_train, vocab_size)
    train_data = DataSet(X_train, y_train, pad_len, word2id, num_labels, use_unk=__use_unk)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    dev_data = DataSet(X_dev, y_dev, pad_len, word2id, num_labels, use_unk=__use_unk)
    dev_loader = DataLoader(dev_data, batch_size=batch_size, shuffle=False)
    # test_data = TestDataSet(X_test, pad_len, word2id, num_labels, use_unk=__use_unk)
    # test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

    model = AttentionLSTMClassifier(embedding_dim, hidden_dim, vocab_size, word2id,
                                    num_labels, batch_size, use_att=False)
    model.load_glove_embedding(id2word)
    model.cuda()

    es = EarlyStop(2)
    optimizer = optim.Adam(model.parameters())

    for epoch in range(30):
        print('Epoch:', epoch, '===================================')
        train_loss = 0
        for i, (data, seq_len, label) in enumerate(train_loader):
            # re-weight the BCE loss: expand the per-class weights over the batch
            weight = torch.FloatTensor(class_weight)
            weight_expanded = weight.expand(len(data), -1)
            loss_criterion = nn.BCELoss(weight=weight_expanded.cuda())
            # data, label, seq_len = sort_batch(data, label, seq_len.view(-1))
            y_pred = model(Variable(data).cuda(), seq_len)
            # roc_reward = roc_auc_score(label.numpy().argmax(axis=1), y_pred.data.cpu().numpy()[:, 1])
            optimizer.zero_grad()
            loss = loss_criterion(y_pred, Variable(label).cuda())  # * Variable(torch.FloatTensor([roc_reward])).cuda()
            loss.backward()
            optimizer.step()
            train_loss += loss.data[0]

        # evaluation on the dev set
        pred_list = []
        gold_list = []
        test_loss = 0
        for _, (_data, _seq_len, _label) in enumerate(dev_loader):
            data, label, seq_len = sort_batch(_data, _label, _seq_len.view(-1))
            y_pred = model(Variable(data, volatile=True).cuda(), seq_len)
            weight = torch.FloatTensor(class_weight)  # re-weight
            weight_expanded = weight.expand(len(data), -1)
            loss_criterion = nn.BCELoss(weight=weight_expanded.cuda())
            loss = loss_criterion(y_pred, Variable(label, volatile=True).cuda())
            test_loss += loss.data[0]
            pred_list.append(y_pred.data.cpu().numpy())
            gold_list.append(label.numpy())

        pred_list_2 = np.concatenate(pred_list, axis=0)[:, 1]
        pred_list = np.concatenate(pred_list, axis=0).argmax(axis=1)
        gold_list = np.concatenate(gold_list, axis=0).argmax(axis=1)
        roc = roc_auc_score(gold_list, pred_list_2)
        print('roc:', roc)
        a = accuracy_score(gold_list, pred_list)
        p = precision_score(gold_list, pred_list, average='binary')
        r = recall_score(gold_list, pred_list, average='binary')
        f1 = f1_score(gold_list, pred_list, average='binary')
        print('accuracy:', a, 'precision_score:', p, 'recall:', r, 'f1:', f1)
        print("Train Loss: ", train_loss, " Evaluation: ", test_loss)

        es.new_loss(test_loss)
        if es.if_stop():
            print('Start overfitting')
            break

    return gold_list, pred_list
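
# In the variant above, class_weight is expanded to (batch_size, num_labels) and handed to
# nn.BCELoss, so it should be one weight per label. The helper and driver call below are a
# hedged, illustrative sketch of how such weights could be derived from label frequencies;
# inverse_frequency_weights is a hypothetical name, and the weighting scheme is an
# assumption rather than the repo's own method.
import numpy as np

def inverse_frequency_weights(y_train):
    """y_train: (n_samples, num_labels) multi-hot array -> list of per-label weights."""
    counts = np.asarray(y_train).sum(axis=0) + 1.0   # +1 avoids division by zero
    weights = counts.sum() / (len(counts) * counts)  # rarer labels get larger weights
    return weights.tolist()

# Example (hypothetical driver):
# class_weight = inverse_frequency_weights(y_train)
# gold_list, pred_list = one_fold(X_train, y_train, X_dev, y_dev, class_weight)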
def one_fold(fold_path):
    vocab_size = 20000
    pad_len = 30
    batch_size = 64
    embedding_dim = 200
    hidden_dim = 800
    num_labels = NUM_CLASS

    # split the training file into train/dev with a stratified shuffle split
    X, y = cbet_data(os.path.join(fold_path, 'train.csv'))
    train_index, dev_index = stratified_shuffle_split(X, y)
    y = np.asarray(y)
    X_train, X_dev = [X[i] for i in train_index], [X[i] for i in dev_index]
    y_train, y_dev = y[train_index], y[dev_index]

    word2id, id2word = build_vocab(X_train, vocab_size)
    # DataSet(__X, __y, __pad_len, __word2id, __num_labels)
    train_data = DataSet(X_train, y_train, pad_len, word2id, num_labels)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    dev_data = DataSet(X_dev, y_dev, pad_len, word2id, num_labels)
    dev_loader = DataLoader(dev_data, batch_size=batch_size, shuffle=True)

    X_test, y_test = cbet_data(os.path.join(fold_path, 'test.csv'))
    test_data = DataSet(X_test, y_test, pad_len, word2id, num_labels)
    test_loader = DataLoader(test_data, batch_size=batch_size)

    model = AttentionLSTMClassifier(embedding_dim, hidden_dim, vocab_size, word2id,
                                    num_labels, batch_size, use_att=True, soft_last=True)
    model.load_glove_embedding(id2word)
    model.cuda()

    optimizer = optim.Adam(model.parameters())
    loss_criterion = nn.BCELoss()
    es = EarlyStop(2)
    old_model = None

    for epoch in range(10):
        print('Epoch:', epoch, '===================================')
        train_loss = 0
        for i, (data, seq_len, label) in enumerate(train_loader):
            data, label, seq_len = sort_batch(data, label, seq_len.view(-1))
            y_pred = model(Variable(data).cuda(), seq_len)
            optimizer.zero_grad()
            loss = loss_criterion(y_pred, Variable(label).cuda())
            loss.backward()
            optimizer.step()
            train_loss += loss.data[0]

        pred_list = []
        gold_list = []
        test_loss = 0
        # evaluation on the dev set
        for i, (data, seq_len, label) in enumerate(dev_loader):
            data, label, seq_len = sort_batch(data, label, seq_len.view(-1))
            y_pred = model(Variable(data, volatile=True).cuda(), seq_len)
            loss = loss_criterion(y_pred, Variable(label, volatile=True).cuda())
            test_loss += loss.data[0]
            pred_list.append(y_pred.data.cpu().numpy())
            gold_list.append(label.numpy())

        # snapshot the model after this epoch so it can be restored on early stop
        if old_model is not None:
            del old_model
        old_model = copy.deepcopy(model)

        print("Train Loss: ", train_loss, " Evaluation: ", test_loss)
        es.new_loss(test_loss)
        if es.if_stop():
            print('Start overfitting')
            del model
            model = old_model
            break

    # testing
    pred_list = []
    gold_list = []
    test_loss = 0
    for i, (data, seq_len, label) in enumerate(test_loader):
        data, label, seq_len = sort_batch(data, label, seq_len.view(-1))
        y_pred = model(Variable(data, volatile=True).cuda(), seq_len)
        loss = loss_criterion(y_pred, Variable(label, volatile=True).cuda())
        test_loss += loss.data[0]
        pred_list.append(y_pred.data.cpu().numpy())
        gold_list.append(label.numpy())

    return np.concatenate(pred_list, axis=0), np.concatenate(gold_list, axis=0)
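
# The variant above relies on stratified_shuffle_split(X, y) to carve a dev set out of the
# fold's training file. A minimal sketch of one way to implement it with scikit-learn is
# shown below, stratifying on each example's dominant label; the 10% dev fraction, the
# fixed seed, and the argmax stratification are assumptions, and the repo's own helper may
# differ.
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

def stratified_shuffle_split(X, y, dev_size=0.1, seed=0):
    """Return (train_index, dev_index) for a single stratified shuffle split."""
    y = np.asarray(y)
    strata = y.argmax(axis=1) if y.ndim > 1 else y   # collapse multi-hot labels to one id
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=dev_size, random_state=seed)
    train_index, dev_index = next(splitter.split(np.zeros((len(strata), 1)), strata))
    return train_index, dev_index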
def one_fold(fold_int, is_nine_folds):
    fold_id = str(fold_int)
    if is_nine_folds:
        fold_path = 'data/Folds_9_Emotions/fold_' + fold_id
        num_labels = 9
    else:
        fold_path = 'data/Folds/fold_' + fold_id
        num_labels = 16

    vocab_size = 5000
    pad_len = 30
    batch_size = 64
    embedding_dim = 200
    hidden_dim = 600
    es = EarlyStop(2)

    word2id, id2word = build_vocab(fold_path, vocab_size, use_unk=True)
    train_data = DataSet(os.path.join(fold_path, 'train.csv'), pad_len, word2id, num_labels)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    test_data = DataSet(os.path.join(fold_path, 'test.csv'), pad_len, word2id, num_labels)
    test_loader = DataLoader(test_data, batch_size=batch_size)

    model = AttentionLSTMClassifier(embedding_dim, hidden_dim, vocab_size, word2id,
                                    num_labels, batch_size)
    model.load_glove_embedding(id2word)
    model.cuda()

    optimizer = optim.Adam(model.parameters())
    loss_criterion = nn.MSELoss()

    for epoch in range(4):
        print('Epoch:', epoch, '===================================')
        train_loss = 0
        for i, (data, seq_len, label) in enumerate(train_loader):
            data, label, seq_len = sort_batch(data, label, seq_len.view(-1))
            y_pred = model(Variable(data).cuda(), seq_len)
            optimizer.zero_grad()
            loss = loss_criterion(y_pred, Variable(label).cuda())
            loss.backward()
            optimizer.step()
            train_loss += loss.data[0]

        pred_list = []
        gold_list = []
        test_loss = 0
        for i, (data, seq_len, label) in enumerate(test_loader):
            data, label, seq_len = sort_batch(data, label, seq_len.view(-1))
            y_pred = model(Variable(data, volatile=True).cuda(), seq_len)
            loss = loss_criterion(y_pred, Variable(label, volatile=True).cuda())
            test_loss += loss.data[0]
            pred_list.append(y_pred.data.cpu().numpy())
            gold_list.append(label.numpy())

        print("Train Loss: ", train_loss, " Evaluation: ", test_loss)
        es.new_loss(test_loss)
        if es.if_stop():
            print('Start overfitting')
            break

    # sweep decision thresholds and collect macro/micro F-measure at each
    f_ma = []
    f_mi = []
    for threshold in range(0, 100, 5):
        threshold /= 100
        tmp = CalculateFM(np.concatenate(pred_list, axis=0),
                          np.concatenate(gold_list, axis=0),
                          threshold=threshold)
        f_ma.append(tmp['MacroFM'])
        f_mi.append(tmp['MicroFM'])
    return f_ma, f_mi
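
# CalculateFM() above scores multi-label predictions at a given decision threshold and
# returns a dict with 'MacroFM' and 'MicroFM'. The sketch below reproduces compatible
# behaviour with scikit-learn; it is an assumption about the helper's internals, included
# only for illustration.
import numpy as np
from sklearn.metrics import f1_score

def CalculateFM(pred, gold, threshold=0.5):
    """Binarize prediction scores at `threshold` and return macro/micro F-measure."""
    pred_bin = (np.asarray(pred) >= threshold).astype(int)
    gold_bin = (np.asarray(gold) >= 0.5).astype(int)
    return {'MacroFM': f1_score(gold_bin, pred_bin, average='macro'),
            'MicroFM': f1_score(gold_bin, pred_bin, average='micro')}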
def one_fold(X_train, y_train, X_dev, y_dev):
    num_labels = NUM_CLASS
    vocab_size = 30000
    pad_len = 40
    batch_size = 64
    embedding_dim = 200
    hidden_dim = 600
    __use_unk = False

    word2id, id2word = build_vocab(X_train, vocab_size)
    train_data = DataSet(X_train, y_train, pad_len, word2id, num_labels, use_unk=__use_unk)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    dev_data = DataSet(X_dev, y_dev, pad_len, word2id, num_labels, use_unk=__use_unk)
    dev_loader = DataLoader(dev_data, batch_size=batch_size, shuffle=False)
    # test_data = TestDataSet(X_test, pad_len, word2id, num_labels, use_unk=__use_unk)
    # test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

    model = AttentionLSTMClassifier(embedding_dim, hidden_dim, vocab_size, word2id,
                                    num_labels, batch_size, use_att=True, soft_last=False)
    model.load_glove_embedding(id2word)
    model.cuda()

    es = EarlyStop(2)
    optimizer = optim.Adam(model.parameters(), lr=1e-5)
    loss_criterion = nn.MSELoss()
    old_model = None       # snapshot of the model, restored on early stop
    old_pred_list = None

    for epoch in range(20):
        print('Epoch:', epoch, '===================================')
        train_loss = 0
        model.train()
        for i, (data, seq_len, label) in enumerate(train_loader):
            data, label, seq_len = sort_batch(data, label, seq_len.view(-1))
            y_pred = model(Variable(data).cuda(), seq_len)
            # roc_reward = roc_auc_score(label.numpy().argmax(axis=1), y_pred.data.cpu().numpy()[:, 1])
            optimizer.zero_grad()
            loss = loss_criterion(y_pred, Variable(label).cuda())  # * Variable(torch.FloatTensor([roc_reward])).cuda()
            loss.backward()
            optimizer.step()
            train_loss += loss.data[0] * batch_size

        # evaluation on the dev set
        pred_list = []
        gold_list = []
        test_loss = 0
        model.eval()
        for _, (_data, _seq_len, _label) in enumerate(dev_loader):
            data, label, seq_len = sort_batch(_data, _label, _seq_len.view(-1))
            y_pred = model(Variable(data, volatile=True).cuda(), seq_len)
            loss = loss_criterion(y_pred, Variable(label).cuda())
            test_loss += loss.data[0] * batch_size
            y_pred = y_pred.data.cpu().numpy()
            pred_list.append(y_pred)
            gold_list.append(label.numpy())

        # pred_list_2 = np.concatenate(pred_list, axis=0)[:, 1]
        pred_list = np.concatenate(pred_list, axis=0)
        gold_list = np.concatenate(gold_list, axis=0)
        # roc = roc_auc_score(gold_list, pred_list_2)
        # print('roc:', roc)
        # a = accuracy_score(gold_list, pred_list)
        # p = precision_score(gold_list, pred_list, average='binary')
        # r = recall_score(gold_list, pred_list, average='binary')
        # f1 = f1_score(gold_list, pred_list, average='binary')
        # print('accuracy:', a, 'precision_score:', p, 'recall:', r, 'f1:', f1)

        print("Train Loss: ", train_loss / len(train_data),
              " Evaluation: ", test_loss / len(dev_data))

        es.new_loss(test_loss)

        # snapshot the model and predictions after this epoch
        if old_model is not None:
            del old_model, old_pred_list
        old_model = copy.deepcopy(model)
        old_pred_list = copy.deepcopy(pred_list)

        if es.if_stop():
            print('Start overfitting')
            del model
            model = old_model
            pred_list = old_pred_list
            torch.save(model.state_dict(),
                       open(os.path.join('checkpoint', 'cbet.model'), 'wb'))
            with open('checkpoint/some_data.pkl', 'wb') as f:
                pickle.dump([word2id, id2word], f)
            break

    return gold_list, pred_list, model, pad_len, word2id, num_labels
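
# The variant above checkpoints the restored model's state_dict to checkpoint/cbet.model
# and pickles [word2id, id2word] next to it. Below is a hedged sketch of how those files
# might be loaded back for inference; load_cbet_checkpoint is a hypothetical helper, and
# the constructor arguments simply mirror the hyperparameters hard-coded in the function
# above, so they must be adjusted if those change.
import os
import pickle
import torch

def load_cbet_checkpoint(num_labels, batch_size, embedding_dim=200, hidden_dim=600,
                         vocab_size=30000):
    """Rebuild the classifier from the saved vocabulary and state_dict."""
    with open('checkpoint/some_data.pkl', 'rb') as f:
        word2id, id2word = pickle.load(f)
    model = AttentionLSTMClassifier(embedding_dim, hidden_dim, vocab_size, word2id,
                                    num_labels, batch_size, use_att=True, soft_last=False)
    model.load_state_dict(torch.load(os.path.join('checkpoint', 'cbet.model')))
    model.cuda()
    model.eval()
    return model, word2id, id2word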