Example #1
def train(X_train, y_train, X_dev, y_dev, X_test, y_test):
    num_labels = NUM_EMO

    vocab_size = VOCAB_SIZE

    print('NUM of VOCAB: ' + str(vocab_size))
    train_data = EmotionDataLoader(X_train, y_train, PAD_LEN)
    train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)

    dev_data = EmotionDataLoader(X_dev, y_dev, PAD_LEN)
    dev_loader = DataLoader(dev_data,
                            batch_size=int(BATCH_SIZE / 3) + 2,
                            shuffle=False)

    test_data = EmotionDataLoader(X_test, y_test, PAD_LEN)
    test_loader = DataLoader(test_data,
                             batch_size=int(BATCH_SIZE / 3) + 2,
                             shuffle=False)

    model = AttentionLSTMClassifier(EMBEDDING_DIM,
                                    HIDDEN_DIM,
                                    vocab_size,
                                    num_labels,
                                    BATCH_SIZE,
                                    att_mode=opt.attention,
                                    soft_last=False,
                                    use_glove=USE_GLOVE,
                                    add_linear=ADD_LINEAR,
                                    max_pool=MAX_POOLING)

    if USE_GLOVE:
        model.load_embedding(tokenizer.get_embeddings())
    # multi-GPU
    # model = nn.DataParallel(model)
    model.cuda()

    if opt.loss == 'ce':
        loss_criterion = nn.CrossEntropyLoss()  #
        print('Using ce loss')
    elif opt.loss == 'focal':
        loss_criterion = FocalLoss(gamma=opt.focal, reduce=True)
        print('Using focal loss, gamma=', opt.focal)
    else:
        raise Exception('loss option not recognised')

    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    es = EarlyStopping(patience=PATIENCE)

    old_model = None
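    # Train for up to 300 epochs; early stopping on dev loss picks the best model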
    for epoch in range(1, 300):
        print('Epoch: ' + str(epoch) + '===================================')
        train_loss = 0
        model.train()
        for i, (data, seq_len,
                label) in tqdm(enumerate(train_loader),
                               total=len(train_loader)):
            optimizer.zero_grad()

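            # Decode ids back to text and compute frozen ELMo and DeepMoji features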
            data_text = [tokenizer.decode_ids(x) for x in data]
            with torch.no_grad():
                character_ids = batch_to_ids(data_text).cuda()
                elmo_emb = elmo(character_ids)['elmo_representations']
                elmo_emb = (elmo_emb[0] + elmo_emb[1]) / 2  # avg of two layers

                emoji_tokenized, _, _ = st.tokenize_sentences(
                    [' '.join(x) for x in data_text])
                emoji_encoding = emoji_model(
                    torch.LongTensor(emoji_tokenized.astype(np.int32)))

            y_pred = model(data.cuda(), seq_len, elmo_emb,
                           emoji_encoding.cuda())
            loss = loss_criterion(y_pred, label.view(-1).cuda())
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), CLIPS)
            optimizer.step()
            train_loss += loss.data.cpu().numpy() * data.shape[0]
            del y_pred, loss

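        # Evaluate on the dev set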
        test_loss = 0
        model.eval()
        for _, (_data, _seq_len, _label) in enumerate(dev_loader):
            with torch.no_grad():

                data_text = [tokenizer.decode_ids(x) for x in _data]
                character_ids = batch_to_ids(data_text).cuda()
                elmo_emb = elmo(character_ids)['elmo_representations']
                elmo_emb = (elmo_emb[0] + elmo_emb[1]) / 2  # avg of two layers

                emoji_tokenized, _, _ = st.tokenize_sentences(
                    [' '.join(x) for x in data_text])
                emoji_encoding = emoji_model(
                    torch.LongTensor(emoji_tokenized.astype(np.int32)))

                y_pred = model(_data.cuda(), _seq_len, elmo_emb,
                               emoji_encoding.cuda())
                loss = loss_criterion(y_pred, _label.view(-1).cuda())
                test_loss += loss.data.cpu().numpy() * _data.shape[0]
                del y_pred, loss

        print("Train Loss: " + str(train_loss / len(train_data)) + \
              " Evaluation: " + str(test_loss / len(dev_data)))

        if es.step(test_loss):  # overfitting
            del model
            print('overfitting, loading best model ...')
            model = old_model
            break
        else:
            if es.is_best():
                if old_model is not None:
                    del old_model
                print('saving best model ...')
                old_model = deepcopy(model)
            else:
                print('not best model, ignoring ...')
                if old_model is None:
                    old_model = deepcopy(model)

    with open(f'lstm_elmo_deepmoji_{opt.dataset}_model.pt', 'wb') as f:
        torch.save(model.state_dict(), f)

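    # Predict on the test set with the early-stopped model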
    pred_list = []
    model.eval()
    for _, (_data, _seq_len, _label) in enumerate(test_loader):
        with torch.no_grad():
            data_text = [tokenizer.decode_ids(x) for x in _data]
            character_ids = batch_to_ids(data_text).cuda()
            elmo_emb = elmo(character_ids)['elmo_representations']
            elmo_emb = (elmo_emb[0] + elmo_emb[1]) / 2  # avg of two layers

            emoji_tokenized, _, _ = st.tokenize_sentences(
                [' '.join(x) for x in data_text])
            emoji_encoding = emoji_model(
                torch.LongTensor(emoji_tokenized.astype(np.int32)))

            y_pred = model(_data.cuda(), _seq_len, elmo_emb,
                           emoji_encoding.cuda())
            pred_list.append(
                y_pred.data.cpu().numpy())  # x[np.where( x > 3.0 )]
            del y_pred

    pred_list = np.argmax(np.concatenate(pred_list, axis=0), axis=1)

    return pred_list
Example #2
    def one_fold(num_fold, train_index, dev_index):
        print("Training on fold:", num_fold)
        X_train, X_dev = [X[i] for i in train_index], [X[i] for i in dev_index]
        y_train, y_dev = y[train_index], y[dev_index]

        # construct data loader
        train_data_set = DataSet(X_train, y_train, SENT_PAD_LEN)
        train_data_loader = DataLoader(train_data_set,
                                       batch_size=BATCH_SIZE,
                                       shuffle=True)

        dev_data_set = DataSet(X_dev, y_dev, SENT_PAD_LEN)
        dev_data_loader = DataLoader(dev_data_set,
                                     batch_size=BATCH_SIZE,
                                     shuffle=False)
        gradient_accumulation_steps = 1
        num_train_steps = int(
            len(train_data_set) / BATCH_SIZE / gradient_accumulation_steps *
            MAX_EPOCH)

        pred_list_test_best = None
        final_pred_best = None
        # Retrain from scratch if the model diverges
        while True:
            is_diverged = False
            model = BERT_classifer.from_pretrained(BERT_MODEL)
            model.add_output_layer(BERT_MODEL, NUM_EMO)
            model = nn.DataParallel(model)
            model.cuda()

            # BERT optimizer
            param_optimizer = list(model.named_parameters())
            no_decay = ['bias', 'gamma', 'beta']
            optimizer_grouped_parameters = [{
                'params': [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay_rate':
                0.01
            }, {
                'params': [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay_rate':
                0.0
            }]

            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=learning_rate,
                                 warmup=0.1,
                                 t_total=num_train_steps)

            if opt.w == 1:
                weight_list = [0.3, 0.3, 0.3, 1.7]
                weight_list_binary = [2 - weight_list[-1], weight_list[-1]]
            elif opt.w == 2:
                weight_list = [
                    0.3198680179, 0.246494733, 0.2484349259, 1.74527696
                ]
                weight_list_binary = [2 - weight_list[-1], weight_list[-1]]
            else:
                raise ValueError

            weight_list = [x**FLAT for x in weight_list]
            weight_label = torch.Tensor(weight_list).cuda()

            weight_list_binary = [x**FLAT for x in weight_list_binary]
            weight_binary = torch.Tensor(weight_list_binary).cuda()
            print('binary loss reweight =', weight_list_binary)
            # loss_criterion_binary = nn.CrossEntropyLoss(weight=weight_list_binary)  #
            if opt.loss == 'focal':
                loss_criterion = FocalLoss(gamma=opt.focal, reduce=False)
                loss_criterion_binary = FocalLoss(gamma=opt.focal,
                                                  reduce=False)  #
            elif opt.loss == 'ce':
                loss_criterion = nn.CrossEntropyLoss(reduce=False)
                loss_criterion_binary = nn.CrossEntropyLoss(reduce=False)  #
            else:
                raise Exception('loss option not recognised')

            loss_criterion_emo_only = nn.MSELoss()

            # es = EarlyStopping(min_delta=0.005, patience=EARLY_STOP_PATIENCE)
            es = EarlyStopping(patience=EARLY_STOP_PATIENCE)
            final_pred_best = None
            final_pred_list_test = None
            pred_list_test = None
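            # Train with early stopping; gold dev/test predictions are refreshed each epoch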
            for num_epoch in range(MAX_EPOCH):
                print('Begin training epoch:', num_epoch)
                sys.stdout.flush()
                train_loss = 0
                model.train()
                for i, (tokens, masks, segments, e_c, e_c_binary,
                        e_c_emo) in tqdm(enumerate(train_data_loader),
                                         total=len(train_data_loader)):
                    optimizer.zero_grad()

                    if USE_TOKEN_TYPE:
                        pred, pred2, pred3 = model(tokens.cuda(), masks.cuda(),
                                                   segments.cuda())
                    else:
                        pred, pred2, pred3 = model(tokens.cuda(), masks.cuda())

                    loss_label = loss_criterion(pred,
                                                e_c.view(-1).cuda()).cuda()
                    loss_label = torch.matmul(torch.gather(weight_label, 0, e_c.view(-1).cuda()), loss_label) / \
                                 e_c.view(-1).shape[0]

                    loss_binary = loss_criterion_binary(
                        pred2,
                        e_c_binary.view(-1).cuda()).cuda()
                    loss_binary = torch.matmul(
                        torch.gather(weight_binary, 0,
                                     e_c_binary.view(-1).cuda()),
                        loss_binary) / e_c.view(-1).shape[0]

                    loss_emo = loss_criterion_emo_only(pred3, e_c_emo.cuda())

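                    # Combine label, binary, and emotion-regression losses with LAMBDA weights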
                    loss = (loss_label + LAMBDA1 * loss_binary +
                            LAMBDA2 * loss_emo) / float(1 + LAMBDA1 + LAMBDA2)

                    # training trilogy: backward pass, gradient clipping, optimizer step
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
                    optimizer.step()

                    train_loss += loss.data.cpu().numpy() * tokens.shape[0]

                    del loss, pred

                # Evaluate
                model.eval()
                dev_loss = 0
                # pred_list = []
                # gold_list = []
                for i, (tokens, masks, segments, e_c, e_c_binary,
                        e_c_emo) in enumerate(dev_data_loader):
                    with torch.no_grad():
                        if USE_TOKEN_TYPE:
                            pred, pred2, pred3 = model(tokens.cuda(),
                                                       masks.cuda(),
                                                       segments.cuda())
                        else:
                            pred, pred2, pred3 = model(tokens.cuda(),
                                                       masks.cuda())

                        loss_label = loss_criterion(
                            pred,
                            e_c.view(-1).cuda()).cuda()
                        loss_label = torch.matmul(torch.gather(weight_label, 0, e_c.view(-1).cuda()), loss_label) / \
                                     e_c.view(-1).shape[0]

                        loss_binary = loss_criterion_binary(
                            pred2,
                            e_c_binary.view(-1).cuda()).cuda()
                        loss_binary = torch.matmul(
                            torch.gather(weight_binary, 0,
                                         e_c_binary.view(-1).cuda()),
                            loss_binary) / e_c.view(-1).shape[0]

                        loss_emo = loss_criterion_emo_only(
                            pred3, e_c_emo.cuda())

                        loss = (loss_label + LAMBDA1 * loss_binary + LAMBDA2 *
                                loss_emo) / float(1 + LAMBDA1 + LAMBDA2)

                        dev_loss += loss.data.cpu().numpy() * tokens.shape[0]

                        # pred_list.append(pred.data.cpu().numpy())
                        # gold_list.append(e_c.numpy())
                        del pred, loss

                # pred_list = np.argmax(np.concatenate(pred_list, axis=0), axis=1)
                # gold_list = np.concatenate(gold_list, axis=0)
                print('Training loss:',
                      train_loss / len(train_data_set),
                      end='\t')
                print('Dev loss:', dev_loss / len(dev_data_set))
                # print(classification_report(gold_list, pred_list, target_names=EMOS))
                # get_metrics(pred_list, gold_list)
                # check for divergence
                if dev_loss / len(dev_data_set) > 1.3 and num_epoch > 4:
                    print("Model diverged, retry")
                    is_diverged = True
                    break

                if es.step(dev_loss):  # overfitting
                    print('overfitting, loading best model ...')
                    if num_epoch == 1:
                        is_diverged = True
                        final_pred_best = deepcopy(final_pred_list_test)
                        pred_list_test_best = deepcopy(pred_list_test)
                    break
                else:
                    if es.is_best():
                        print('saving best model ...')
                        if final_pred_best is not None:
                            del final_pred_best
                        final_pred_best = deepcopy(final_pred_list_test)
                        if pred_list_test_best is not None:
                            del pred_list_test_best
                        pred_list_test_best = deepcopy(pred_list_test)
                    else:
                        print('not best model, ignoring ...')
                        if final_pred_best is None:
                            final_pred_best = deepcopy(final_pred_list_test)
                        if pred_list_test_best is None:
                            pred_list_test_best = deepcopy(pred_list_test)

                print('Gold Dev ...')
                pred_list_test = []
                model.eval()
                for i, (tokens, masks, segments, e_c, e_c_binary,
                        e_c_emo) in enumerate(gold_dev_data_loader):
                    with torch.no_grad():
                        if USE_TOKEN_TYPE:
                            pred, _, _ = model(tokens.cuda(), masks.cuda(),
                                               segments.cuda())
                        else:
                            pred, _, _ = model(tokens.cuda(), masks.cuda())
                        pred_list_test.append(pred.data.cpu().numpy())

                pred_list_test = np.argmax(np.concatenate(pred_list_test,
                                                          axis=0),
                                           axis=1)
                # get_metrics(load_dev_labels('data/dev.txt'), pred_list_test)

                print('Gold Test ...')
                final_pred_list_test = []
                model.eval()
                for i, (tokens, masks, segments, e_c, e_c_binary,
                        e_c_emo) in enumerate(gold_test_data_loader):
                    with torch.no_grad():
                        if USE_TOKEN_TYPE:
                            pred, _, _ = model(tokens.cuda(), masks.cuda(),
                                               segments.cuda())
                        else:
                            pred, _, _ = model(tokens.cuda(), masks.cuda())
                        final_pred_list_test.append(pred.data.cpu().numpy())

                final_pred_list_test = np.argmax(np.concatenate(
                    final_pred_list_test, axis=0),
                                                 axis=1)
                # get_metrics(load_dev_labels('data/test.txt'), final_pred_list_test)

            if is_diverged:
                print("Reinitialize model ...")
                del model
                continue
            all_fold_results.append(pred_list_test_best)
            real_test_results.append(final_pred_best)

            del model
            break
Example #3
    def one_fold(num_fold, train_index, dev_index):
        print("Training on fold:", num_fold)
        X_train, X_dev = [X[i] for i in train_index], [X[i] for i in dev_index]
        y_train, y_dev = y[train_index], y[dev_index]

        # construct data loader
        # for each fold, the held-out dev data comes from the k-fold split
        train_data_set = TrainDataSet(X_train,
                                      y_train,
                                      EMAI_PAD_LEN,
                                      SENT_PAD_LEN,
                                      word2id,
                                      use_unk=True)

        dev_data_set = TrainDataSet(X_dev,
                                    y_dev,
                                    EMAI_PAD_LEN,
                                    SENT_PAD_LEN,
                                    word2id,
                                    use_unk=True)
        dev_data_loader = DataLoader(dev_data_set,
                                     batch_size=BATCH_SIZE,
                                     shuffle=False)
        # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        final_pred_best = None

        # Retrain from scratch if the model diverges
        while True:
            is_diverged = False
            # Model is defined in HierarchicalPredictor
            model = HierarchicalAttPredictor(SENT_EMB_DIM,
                                             SENT_HIDDEN_SIZE,
                                             CTX_LSTM_DIM,
                                             num_of_vocab,
                                             SENT_PAD_LEN,
                                             id2word,
                                             USE_ELMO=True,
                                             ADD_LINEAR=False)
            model.load_embedding(emb)
            model.deepmoji_model.load_specific_weights(
                PRETRAINED_PATH, exclude_names=['output_layer'])
            model.cuda()

            # model = nn.DataParallel(model)
            # model.to(device)

            optimizer = optim.Adam(model.parameters(),
                                   lr=learning_rate,
                                   amsgrad=True)  #
            # optimizer = optim.SGD(model.parameters(), lr=learning_rate)
            scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,
                                                               gamma=GAMMA)

            # loss_criterion_binary = nn.CrossEntropyLoss(weight=weight_list_binary)  #
            if opt.loss == 'focal':
                loss_criterion = FocalLoss(gamma=opt.focal)

            elif opt.loss == 'ce':
                loss_criterion = nn.BCELoss()
            else:
                raise Exception('loss option not recognised')

            es = EarlyStopping(patience=EARLY_STOP_PATIENCE)
            final_pred_list_test = None

            result_print = {}

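            # Train with early stopping; the final test set is scored after every epoch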
            for num_epoch in range(MAX_EPOCH):

                # re-create the loader so the data is reshuffled every epoch
                train_data_loader = DataLoader(train_data_set,
                                               batch_size=BATCH_SIZE,
                                               shuffle=True)

                print('Begin training epoch:', num_epoch, end='...\t')
                sys.stdout.flush()

                # stepping scheduler
                scheduler.step(num_epoch)
                print('Current learning rate', scheduler.get_lr())

                ## Training step
                train_loss = 0
                model.train()

                for i, (a, a_len, emoji_a, e_c) \
                        in tqdm(enumerate(train_data_loader), total=len(train_data_loader)):

                    optimizer.zero_grad()
                    e_c = e_c.type(torch.float)
                    pred = model(a.cuda(), a_len, emoji_a.cuda())
                    loss_label = loss_criterion(pred.squeeze(1),
                                                e_c.view(-1).cuda()).cuda()

                    # training trilogy: backward pass, gradient clipping, optimizer step
                    loss_label.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
                    optimizer.step()

                    train_loss += loss_label.data.cpu().numpy() * a.shape[0]
                    del pred, loss_label

                ## Evaluation step
                model.eval()
                dev_loss = 0
                # pred_list = []
                for i, (a, a_len, emoji_a, e_c) in enumerate(dev_data_loader):

                    with torch.no_grad():
                        e_c = e_c.type(torch.float)
                        pred = model(a.cuda(), a_len, emoji_a.cuda())

                        loss_label = loss_criterion(
                            pred.squeeze(1),
                            e_c.view(-1).cuda()).cuda()

                        dev_loss += loss_label.data.cpu().numpy() * a.shape[0]

                        # pred_list.append(pred.data.cpu().numpy())
                        # gold_list.append(e_c.numpy())
                        del pred, loss_label

                print('Training loss:',
                      train_loss / len(train_data_set),
                      end='\t')
                print('Dev loss:', dev_loss / len(dev_data_set))

                # print(classification_report(gold_list, pred_list, target_names=EMOS))
                # get_metrics(pred_list, gold_list)

                # Gold Test testing
                print('Final test testing...')
                final_pred_list_test = []
                model.eval()

                for i, (a, a_len,
                        emoji_a) in enumerate(final_test_data_loader):

                    with torch.no_grad():

                        pred = model(a.cuda(), a_len, emoji_a.cuda())

                        final_pred_list_test.append(pred.data.cpu().numpy())
                    del a, pred
                print("final_pred_list_test", len(final_pred_list_test))
                final_pred_list_test = np.concatenate(final_pred_list_test,
                                                      axis=0)
                final_pred_list_test = np.squeeze(final_pred_list_test, axis=1)
                print("final_pred_list_test_concat", len(final_pred_list_test))

                accuracy, precision, recall, f1 = get_metrics(
                    np.asarray(final_test_target_list),
                    np.asarray(final_pred_list_test))

                result_print.update(
                    {num_epoch: [accuracy, precision, recall, f1]})

                if dev_loss / len(dev_data_set) > 1.3 and num_epoch > 4:
                    print("Model diverged, retry")
                    is_diverged = True
                    break

                if es.step(dev_loss):  # overfitting
                    print('overfitting, loading best model ...')
                    break
                else:
                    if es.is_best():
                        print('saving best model ...')
                        if final_pred_best is not None:
                            del final_pred_best
                        final_pred_best = deepcopy(final_pred_list_test)

                    else:
                        print('not best model, ignoring ...')
                        if final_pred_best is None:
                            final_pred_best = deepcopy(final_pred_list_test)

            with open(result_path, 'wb') as w:
                pkl.dump(result_print, w)

            if is_diverged:
                print("Reinitialize model ...")
                del model

                continue

            real_test_results.append(np.asarray(final_pred_best))
            # saving model for inference
            torch.save(model.state_dict(), opt.out_path)
            del model
            break
Example #4
    def one_fold(num_fold, train_index, dev_index):
        print("Training on fold:", num_fold)
        X_train, X_dev = [X[i] for i in train_index], [X[i] for i in dev_index]
        y_train, y_dev = y[train_index], y[dev_index]

        # construct data loader
        train_data_set = TrainDataSet(X_train,
                                      y_train,
                                      CONV_PAD_LEN,
                                      SENT_PAD_LEN,
                                      word2id,
                                      use_unk=True)

        dev_data_set = TrainDataSet(X_dev,
                                    y_dev,
                                    CONV_PAD_LEN,
                                    SENT_PAD_LEN,
                                    word2id,
                                    use_unk=True)
        dev_data_loader = DataLoader(dev_data_set,
                                     batch_size=BATCH_SIZE,
                                     shuffle=False)
        # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        pred_list_test_best = None
        final_pred_best = None
        # Retrain from scratch if the model diverges
        while True:
            is_diverged = False
            # Model is defined in HierarchicalPredictor
            model = HierarchicalPredictor(SENT_EMB_DIM,
                                          SENT_HIDDEN_SIZE,
                                          num_of_vocab,
                                          USE_ELMO=True,
                                          ADD_LINEAR=False)
            model.load_embedding(emb)
            model.deepmoji_model.load_specific_weights(
                PRETRAINED_PATH, exclude_names=['output_layer'])
            model.cuda()
            # model = nn.DataParallel(model)
            # model.to(device)
            optimizer = optim.Adam(model.parameters(),
                                   lr=learning_rate,
                                   amsgrad=True)  #
            # optimizer = optim.SGD(model.parameters(), lr=learning_rate)
            scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,
                                                               gamma=opt.gamma)

            if opt.w == 1:
                weight_list = [0.3, 0.3, 0.3, 1.7]
                weight_list_binary = [2 - weight_list[-1], weight_list[-1]]
            elif opt.w == 2:
                weight_list = [
                    0.3198680179, 0.246494733, 0.2484349259, 1.74527696
                ]
                weight_list_binary = [2 - weight_list[-1], weight_list[-1]]
            else:
                raise ValueError

            weight_list = [x**FLAT for x in weight_list]
            weight_label = torch.Tensor(weight_list).cuda()

            weight_list_binary = [x**FLAT for x in weight_list_binary]
            weight_binary = torch.Tensor(weight_list_binary).cuda()
            print('classification reweight: ', weight_list)
            print('binary loss reweight =', weight_list_binary)
            # loss_criterion_binary = nn.CrossEntropyLoss(weight=weight_list_binary)  #
            if opt.loss == 'focal':
                loss_criterion = FocalLoss(gamma=opt.focal, reduce=False)
                loss_criterion_binary = FocalLoss(gamma=opt.focal,
                                                  reduce=False)  #
            elif opt.loss == 'ce':
                loss_criterion = nn.CrossEntropyLoss(reduce=False)
                loss_criterion_binary = nn.CrossEntropyLoss(reduce=False)  #
            else:
                raise Exception('loss option not recognised')

            loss_criterion_emo_only = nn.MSELoss()

            es = EarlyStopping(patience=EARLY_STOP_PATIENCE)
            # best_model = None
            final_pred_list_test = None
            pred_list_test = None
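            # Train with early stopping; gold dev/test predictions are refreshed each epoch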
            for num_epoch in range(MAX_EPOCH):
                # re-create the loader so the data is reshuffled every epoch
                train_data_loader = DataLoader(train_data_set,
                                               batch_size=BATCH_SIZE,
                                               shuffle=True)

                print('Begin training epoch:', num_epoch, end='...\t')
                sys.stdout.flush()

                # stepping scheduler
                scheduler.step(num_epoch)
                print('Current learning rate', scheduler.get_lr())

                train_loss = 0
                model.train()
                for i, (a, a_len, b, b_len, c, c_len, emoji_a, emoji_b, emoji_c, e_c, e_c_binary, e_c_emo) \
                        in tqdm(enumerate(train_data_loader), total=len(train_data_loader)):
                    optimizer.zero_grad()
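                    # ELMo-encode the three input turns (a, b, c)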
                    elmo_a = elmo_encode(a)
                    elmo_b = elmo_encode(b)
                    elmo_c = elmo_encode(c)

                    pred, pred2, pred3 = model(a.cuda(), a_len, b.cuda(),
                                               b_len, c.cuda(), c_len,
                                               emoji_a.cuda(), emoji_b.cuda(),
                                               emoji_c.cuda(), elmo_a, elmo_b,
                                               elmo_c)

                    loss_label = loss_criterion(pred,
                                                e_c.view(-1).cuda()).cuda()
                    loss_label = torch.matmul(torch.gather(weight_label, 0, e_c.view(-1).cuda()), loss_label) / \
                                 e_c.view(-1).shape[0]

                    loss_binary = loss_criterion_binary(
                        pred2,
                        e_c_binary.view(-1).cuda()).cuda()
                    loss_binary = torch.matmul(
                        torch.gather(weight_binary, 0,
                                     e_c_binary.view(-1).cuda()),
                        loss_binary) / e_c.view(-1).shape[0]

                    loss_emo = loss_criterion_emo_only(pred3, e_c_emo.cuda())

                    loss = (loss_label + LAMBDA1 * loss_binary +
                            LAMBDA2 * loss_emo) / float(1 + LAMBDA1 + LAMBDA2)

                    # loss = torch.matmul(torch.gather(weight, 0, trg.view(-1).cuda()), loss) / trg.view(-1).shape[0]

                    # training trilogy: backward pass, gradient clipping, optimizer step
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
                    optimizer.step()

                    train_loss += loss.data.cpu().numpy() * a.shape[0]
                    del pred, loss, elmo_a, elmo_b, elmo_c, e_c_emo, loss_binary, loss_label, loss_emo

                # Evaluate
                model.eval()
                dev_loss = 0
                # pred_list = []
                # gold_list = []
                for i, (a, a_len, b, b_len, c, c_len, emoji_a, emoji_b, emoji_c, e_c, e_c_binary, e_c_emo)\
                        in enumerate(dev_data_loader):
                    with torch.no_grad():

                        elmo_a = elmo_encode(a)
                        elmo_b = elmo_encode(b)
                        elmo_c = elmo_encode(c)

                        pred, pred2, pred3 = model(a.cuda(), a_len, b.cuda(),
                                                   b_len, c.cuda(), c_len,
                                                   emoji_a.cuda(),
                                                   emoji_b.cuda(),
                                                   emoji_c.cuda(), elmo_a,
                                                   elmo_b, elmo_c)

                        loss_label = loss_criterion(
                            pred,
                            e_c.view(-1).cuda()).cuda()
                        loss_label = torch.matmul(
                            torch.gather(weight_label, 0,
                                         e_c.view(-1).cuda()),
                            loss_label) / e_c.view(-1).shape[0]

                        loss_binary = loss_criterion_binary(
                            pred2,
                            e_c_binary.view(-1).cuda()).cuda()
                        loss_binary = torch.matmul(
                            torch.gather(weight_binary, 0,
                                         e_c_binary.view(-1).cuda()),
                            loss_binary) / e_c.view(-1).shape[0]

                        loss_emo = loss_criterion_emo_only(
                            pred3, e_c_emo.cuda())

                        loss = (loss_label + LAMBDA1 * loss_binary + LAMBDA2 *
                                loss_emo) / float(1 + LAMBDA1 + LAMBDA2)

                        dev_loss += loss.data.cpu().numpy() * a.shape[0]

                        # pred_list.append(pred.data.cpu().numpy())
                        # gold_list.append(e_c.numpy())
                        del pred, loss, elmo_a, elmo_b, elmo_c, e_c_emo, loss_binary, loss_label, loss_emo

                print('Training loss:',
                      train_loss / len(train_data_set),
                      end='\t')
                print('Dev loss:', dev_loss / len(dev_data_set))
                # print(classification_report(gold_list, pred_list, target_names=EMOS))
                # get_metrics(pred_list, gold_list)
                if dev_loss / len(dev_data_set) > 1.3 and num_epoch > 4:
                    print("Model diverged, retry")
                    is_diverged = True
                    break

                if es.step(dev_loss):  # overfitting
                    print('overfitting, loading best model ...')
                    break
                else:
                    if es.is_best():
                        print('saving best model ...')
                        if final_pred_best is not None:
                            del final_pred_best
                        final_pred_best = deepcopy(final_pred_list_test)
                        if pred_list_test_best is not None:
                            del pred_list_test_best
                        pred_list_test_best = deepcopy(pred_list_test)
                    else:
                        print('not best model, ignoring ...')
                        if final_pred_best is None:
                            final_pred_best = deepcopy(final_pred_list_test)
                        if pred_list_test_best is None:
                            pred_list_test_best = deepcopy(pred_list_test)

                # Gold Dev testing...
                print('Gold Dev testing....')
                pred_list_test = []
                model.eval()
                for i, (a, a_len, b, b_len, c, c_len, emoji_a, emoji_b,
                        emoji_c) in enumerate(gold_dev_data_loader):
                    with torch.no_grad():
                        elmo_a = elmo_encode(a)  # , __id2word=ex_id2word
                        elmo_b = elmo_encode(b)
                        elmo_c = elmo_encode(c)

                        pred, _, _ = model(a.cuda(), a_len, b.cuda(), b_len,
                                           c.cuda(), c_len, emoji_a.cuda(),
                                           emoji_b.cuda(), emoji_c.cuda(),
                                           elmo_a, elmo_b, elmo_c)

                        pred_list_test.append(pred.data.cpu().numpy())
                    del elmo_a, elmo_b, elmo_c, a, b, c, pred
                pred_list_test = np.argmax(np.concatenate(pred_list_test,
                                                          axis=0),
                                           axis=1)
                # get_metrics(load_dev_labels('data/dev.txt'), pred_list_test)

                # Testing
                print('Gold test testing...')
                final_pred_list_test = []
                model.eval()
                for i, (a, a_len, b, b_len, c, c_len, emoji_a, emoji_b,
                        emoji_c) in enumerate(test_data_loader):
                    with torch.no_grad():
                        elmo_a = elmo_encode(a)  # , __id2word=ex_id2word
                        elmo_b = elmo_encode(b)
                        elmo_c = elmo_encode(c)

                        pred, _, _ = model(a.cuda(), a_len, b.cuda(), b_len,
                                           c.cuda(), c_len, emoji_a.cuda(),
                                           emoji_b.cuda(), emoji_c.cuda(),
                                           elmo_a, elmo_b, elmo_c)

                        final_pred_list_test.append(pred.data.cpu().numpy())
                    del elmo_a, elmo_b, elmo_c, a, b, c, pred
                final_pred_list_test = np.argmax(np.concatenate(
                    final_pred_list_test, axis=0),
                                                 axis=1)
                # get_metrics(load_dev_labels('data/test.txt'), final_pred_list_test)

            if is_diverged:
                print("Reinitialize model ...")
                del model
                continue

            all_fold_results.append(pred_list_test_best)
            real_test_results.append(final_pred_best)
            del model
            break
Example #5
def train(X_train, y_train, X_dev, y_dev, X_test, y_test):
    num_labels = NUM_EMO

    vocab_size = VOCAB_SIZE

    print('NUM of VOCAB: ' + str(vocab_size))
    train_data = EmotionDataLoader(X_train, y_train, PAD_LEN)
    train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)

    dev_data = EmotionDataLoader(X_dev, y_dev, PAD_LEN)
    dev_loader = DataLoader(dev_data, batch_size=int(BATCH_SIZE/3)+2, shuffle=False)

    test_data = EmotionDataLoader(X_test, y_test, PAD_LEN)
    test_loader = DataLoader(test_data, batch_size=int(BATCH_SIZE/3)+2, shuffle=False)

    model = AttentionLSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, vocab_size,
                                    num_labels, BATCH_SIZE, att_mode=opt.attention, soft_last=False)

    model.load_embedding(tokenizer.get_embeddings())
    # multi-GPU
    # model = nn.DataParallel(model)
    model.cuda()

    loss_criterion = nn.CrossEntropyLoss()  #

    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    es = EarlyStopping(patience=PATIENCE)
    old_model = None
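    # Train for up to 300 epochs; early stopping on dev loss picks the best model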
    for epoch in range(1, 300):
        print('Epoch: ' + str(epoch) + '===================================')
        train_loss = 0
        model.train()
        for i, (data, seq_len, label) in tqdm(enumerate(train_loader),
                                              total=len(train_loader)):
            optimizer.zero_grad()
            y_pred = model(data.cuda(), seq_len)
            loss = loss_criterion(y_pred, label.view(-1).cuda())
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), CLIPS)
            optimizer.step()
            train_loss += loss.data.cpu().numpy() * data.shape[0]
            del y_pred, loss

        test_loss = 0
        model.eval()
        for _, (_data, _seq_len, _label) in enumerate(dev_loader):
            with torch.no_grad():
                y_pred = model(_data.cuda(), _seq_len)
                loss = loss_criterion(y_pred, _label.view(-1).cuda())
                test_loss += loss.data.cpu().numpy() * _data.shape[0]
                del y_pred, loss

        print("Train Loss: " + str(train_loss / len(train_data)) + \
              " Evaluation: " + str(test_loss / len(dev_data)))

        if es.step(test_loss):  # overfitting
            del model
            print('overfitting, loading best model ...')
            model = old_model
            break
        else:
            if es.is_best():
                if old_model is not None:
                    del old_model
                print('saving best model ...')
                old_model = deepcopy(model)
            else:
                print('not best model, ignoring ...')
                if old_model is None:
                    old_model = deepcopy(model)

    with open(f'lstm_{opt.dataset}_model.pt', 'wb') as f:
        torch.save(model.state_dict(), f)

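    # Predict on the test set with the early-stopped model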
    pred_list = []
    model.eval()
    for _, (_data, _seq_len, _label) in enumerate(test_loader):
        with torch.no_grad():
            y_pred = model(_data.cuda(), _seq_len)
            pred_list.append(y_pred.data.cpu().numpy())  # x[np.where( x > 3.0 )]
            del y_pred

    pred_list = np.argmax(np.concatenate(pred_list, axis=0), axis=1)

    return pred_list