# NOTE: these examples target the pre-0.4 PyTorch API (Variable, volatile=True,
# loss.data[0]). The imports below are the standard ones the snippets rely on;
# project-specific helpers (NUM_CLASS, EarlyStop, build_vocab, DataSet, sort_batch,
# AttentionLSTMClassifier, CalculateFM, cbet_data, stratified_shuffle_split) are
# assumed to be defined elsewhere in the repository.
import os
import copy
import pickle

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score)


def one_fold(fold_int, is_nine_folds):
    fold_id = str(fold_int)
    if is_nine_folds:
        fold_path = 'data/Folds_9_Emotions/fold_' + fold_id
        num_labels = 9
    else:
        fold_path = 'data/Folds/fold_' + fold_id
        num_labels = 16

    vocab_size = 5000
    pad_len = 30
    batch_size = 64
    embedding_dim = 200
    hidden_dim = 600

    es = EarlyStop(2)
    word2id, id2word = build_vocab(fold_path, vocab_size, use_unk=True)
    train_data = DataSet(os.path.join(fold_path, 'train.csv'), pad_len,
                         word2id, num_labels)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

    test_data = DataSet(os.path.join(fold_path, 'test.csv'), pad_len, word2id,
                        num_labels)
    test_loader = DataLoader(test_data, batch_size=batch_size)

    model = AttentionLSTMClassifier(embedding_dim, hidden_dim, vocab_size,
                                    word2id, num_labels, batch_size)
    model.load_glove_embedding(id2word)
    model.cuda()

    optimizer = optim.Adam(model.parameters())
    loss_criterion = nn.MSELoss()
    for epoch in range(4):
        print('Epoch:', epoch, '===================================')
        train_loss = 0
        for i, (data, seq_len, label) in enumerate(train_loader):
            data, label, seq_len = sort_batch(data, label, seq_len.view(-1))
            y_pred = model(Variable(data).cuda(), seq_len)
            optimizer.zero_grad()
            loss = loss_criterion(y_pred, Variable(label).cuda())
            loss.backward()
            optimizer.step()
            train_loss += loss.data[0]
        pred_list = []
        gold_list = []
        test_loss = 0
        for i, (data, seq_len, label) in enumerate(test_loader):
            data, label, seq_len = sort_batch(data, label, seq_len.view(-1))
            y_pred = model(Variable(data, volatile=True).cuda(), seq_len)
            loss = loss_criterion(y_pred,
                                  Variable(label, volatile=True).cuda())
            test_loss += loss.data[0]
            pred_list.append(y_pred.data.cpu().numpy())
            gold_list.append(label.numpy())

        print("Train Loss: ", train_loss, " Evaluation: ", test_loss)
        es.new_loss(test_loss)
        if es.if_stop():
            print('Start over fitting')
            break
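
# The snippet above assumes an EarlyStop helper with a patience argument. A minimal
# sketch of the assumed behaviour (the real implementation lives elsewhere in the repo):
class EarlyStop:
    def __init__(self, patience):
        self.patience = patience
        self.losses = []

    def new_loss(self, loss):
        # record the latest validation loss
        self.losses.append(loss)

    def if_stop(self):
        # stop once the loss has failed to beat the earlier best for `patience` epochs
        if len(self.losses) <= self.patience:
            return False
        best_so_far = min(self.losses[:-self.patience])
        return all(l >= best_so_far for l in self.losses[-self.patience:])
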
def one_fold(X_train, y_train, X_test, y_test):
    num_labels = NUM_CLASS
    vocab_size = 20000
    pad_len = 30
    batch_size = 100
    embedding_dim = 200
    hidden_dim = 400
    __use_unk = False
    es = EarlyStop(2)
    word2id, id2word = build_vocab(X_train, vocab_size, use_unk=__use_unk)

    train_data = DataSet(X_train, y_train, pad_len, word2id, num_labels, use_unk=__use_unk)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=False)

    test_data = DataSet(X_test, y_test, pad_len, word2id, num_labels, use_unk=__use_unk)
    test_loader = DataLoader(test_data, batch_size=batch_size)

    model = AttentionLSTMClassifier(embedding_dim, hidden_dim, vocab_size, word2id,
                                    num_labels, batch_size, use_att=True)
    model.load_glove_embedding(id2word)
    model.cuda()

    optimizer = optim.Adam(model.parameters())
    loss_criterion = nn.MSELoss()
    for epoch in range(4):
        print('Epoch:', epoch, '===================================')
        train_loss = 0
        for i, (data, seq_len, label) in enumerate(train_loader):
            # print(i)
            data, label, seq_len = sort_batch(data, label, seq_len.view(-1))
            y_pred = model(Variable(data).cuda(), seq_len)
            optimizer.zero_grad()
            loss = loss_criterion(y_pred, Variable(label).cuda())
            loss.backward()
            optimizer.step()
            train_loss += loss.data[0]
        pred_list = []
        gold_list = []
        test_loss = 0
        for i, (data, seq_len, label) in enumerate(test_loader):
            data, label, seq_len = sort_batch(data, label, seq_len.view(-1))
            y_pred = model(Variable(data, volatile=True).cuda(), seq_len)
            loss = loss_criterion(y_pred, Variable(label, volatile=True).cuda())
            test_loss += loss.data[0]
            pred_list.append(y_pred.data.cpu().numpy())
            gold_list.append(label.numpy())

        print("Train Loss: ", train_loss, " Evaluation: ", test_loss)
        es.new_loss(test_loss)
        if es.if_stop():
            print('Start over fitting')
            break

    return np.concatenate(pred_list, axis=0), np.concatenate(gold_list, axis=0)
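
# Every training/evaluation loop in these examples calls sort_batch before the forward
# pass. A hedged sketch of what that helper presumably does: sort the batch by descending
# sequence length, as required by pack_padded_sequence inside the LSTM classifier.
def sort_batch(batch, labels, lengths):
    sorted_lengths, sort_idx = lengths.sort(0, descending=True)
    return batch[sort_idx], labels[sort_idx], sorted_lengths
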
def one_fold(fold_int, is_nine_folds):
    fold_id = str(fold_int)
    if is_nine_folds:
        fold_path = 'data/Folds_9_Emotions/fold_' + fold_id
        num_labels = 9
    else:
        fold_path = 'data/Folds/fold_' + fold_id
        num_labels = 16

    vocab_size = 5000
    pad_len = 30
    batch_size = 64
    hidden_dim = 600

    es = EarlyStop(2)
    word2id, id2word = build_vocab(fold_path, vocab_size, use_unk=True)
    embedding_dim = len(word2id)  # one-hot / bag-of-words embedding: width equals the vocabulary size
    train_data = DataSet(os.path.join(fold_path, 'train.csv'), pad_len,
                         word2id, num_labels)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

    test_data = DataSet(os.path.join(fold_path, 'test.csv'), pad_len, word2id,
                        num_labels)
    test_loader = DataLoader(test_data, batch_size=batch_size)

    model = AttentionLSTMClassifier(embedding_dim, hidden_dim, vocab_size,
                                    word2id, num_labels, batch_size)
    model.load_bog_embedding(word2id)
    model.cuda()

    optimizer = optim.Adam(
        filter(lambda p: p.requires_grad, model.parameters()))  # skip frozen parameters (e.g. the fixed one-hot embedding)
    loss_criterion = nn.BCELoss()
    for epoch in range(4):
        print('Epoch:', epoch, '===================================')
        train_loss = 0
        for i, (data, seq_len, label) in enumerate(train_loader):
            data, label, seq_len = sort_batch(data, label, seq_len.view(-1))
            y_pred = model(Variable(data).cuda(), seq_len)
            optimizer.zero_grad()
            loss = loss_criterion(y_pred, Variable(label).cuda())
            loss.backward()
            optimizer.step()
            train_loss += loss.data[0]
        pred_list = []
        gold_list = []
        test_loss = 0
        for i, (data, seq_len, label) in enumerate(test_loader):
            data, label, seq_len = sort_batch(data, label, seq_len.view(-1))
            y_pred = model(Variable(data, volatile=True).cuda(), seq_len)
            loss = loss_criterion(y_pred,
                                  Variable(label, volatile=True).cuda())
            test_loss += loss.data[0]
            pred_list.append(y_pred.data.cpu().numpy())
            gold_list.append(label.numpy())

        print("Train Loss: ", train_loss, " Evaluation: ", test_loss)
        es.new_loss(test_loss)
        if es.if_stop():
            print('Start over fitting')
            break
    f_ma = []
    f_mi = []
    for threshold in range(0, 100, 5):
        threshold /= 100
        tmp = CalculateFM(np.concatenate(pred_list, axis=0),
                          np.concatenate(gold_list, axis=0),
                          threshold=threshold)
        f_ma.append(tmp['MacroFM'])
        f_mi.append(tmp['MicroFM'])
    return f_ma, f_mi
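
# A hypothetical driver for the fold-based variant above (the number of folds and the
# aggregation step are assumptions, not part of the original code):
def run_all_folds(n_folds=10, is_nine_folds=False):
    all_ma, all_mi = [], []
    for fold in range(n_folds):
        f_ma, f_mi = one_fold(fold, is_nine_folds)
        all_ma.append(f_ma)
        all_mi.append(f_mi)
    # average the per-threshold macro/micro F1 scores across folds
    return np.mean(all_ma, axis=0), np.mean(all_mi, axis=0)
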
def one_fold(X_train, y_train, X_dev, y_dev, class_weight):

    num_labels = NUM_CLASS
    vocab_size = 20000
    pad_len = 40
    batch_size = 64
    embedding_dim = 200
    hidden_dim = 500
    __use_unk = False

    word2id, id2word = build_vocab(X_train, vocab_size)

    train_data = DataSet(X_train,
                         y_train,
                         pad_len,
                         word2id,
                         num_labels,
                         use_unk=__use_unk)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

    dev_data = DataSet(X_dev,
                       y_dev,
                       pad_len,
                       word2id,
                       num_labels,
                       use_unk=__use_unk)
    dev_loader = DataLoader(dev_data, batch_size=batch_size, shuffle=False)

    # test_data = TestDataSet(X_test, pad_len, word2id, num_labels, use_unk=__use_unk)
    # test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

    model = AttentionLSTMClassifier(embedding_dim,
                                    hidden_dim,
                                    vocab_size,
                                    word2id,
                                    num_labels,
                                    batch_size,
                                    use_att=False)
    model.load_glove_embedding(id2word)
    model.cuda()
    es = EarlyStop(2)
    optimizer = optim.Adam(model.parameters())

    for epoch in range(30):
        print('Epoch:', epoch, '===================================')
        train_loss = 0
        for i, (data, seq_len, label) in enumerate(train_loader):
            weight = torch.FloatTensor(class_weight)        # per-class re-weighting
            weight_expanded = weight.expand(len(data), -1)  # match y_pred's (batch, num_labels) shape
            loss_criterion = nn.BCELoss(weight=weight_expanded.cuda())
            data, label, seq_len = sort_batch(data, label, seq_len.view(-1))
            y_pred = model(Variable(data).cuda(), seq_len)

            #roc_reward = roc_auc_score(label.numpy().argmax(axis=1), y_pred.data.cpu().numpy()[:, 1])
            optimizer.zero_grad()
            loss = loss_criterion(y_pred, Variable(label).cuda())
            # optionally scaled by the commented-out reward: * Variable(torch.FloatTensor([roc_reward])).cuda()
            loss.backward()
            optimizer.step()
            train_loss += loss.data[0]

        pred_list = []
        gold_list = []
        test_loss = 0
        for _, (_data, _seq_len, _label) in enumerate(dev_loader):
            data, label, seq_len = sort_batch(_data, _label, _seq_len.view(-1))
            y_pred = model(Variable(data, volatile=True).cuda(), seq_len)
            weight = torch.FloatTensor(class_weight)  # re-weight
            weight_expanded = weight.expand(len(data), -1)
            loss_criterion = nn.BCELoss(weight=weight_expanded.cuda())
            loss = loss_criterion(y_pred,
                                  Variable(label, volatile=True).cuda())
            test_loss += loss.data[0]
            pred_list.append(y_pred.data.cpu().numpy())
            gold_list.append(label.numpy())

        pred_list_2 = np.concatenate(pred_list, axis=0)[:, 1]
        pred_list = np.concatenate(pred_list, axis=0).argmax(axis=1)
        gold_list = np.concatenate(gold_list, axis=0).argmax(axis=1)
        roc = roc_auc_score(gold_list, pred_list_2)
        print('roc:', roc)
        a = accuracy_score(gold_list, pred_list)
        p = precision_score(gold_list, pred_list, average='binary')
        r = recall_score(gold_list, pred_list, average='binary')
        f1 = f1_score(gold_list, pred_list, average='binary')
        print('accuracy:', a, 'precision_score:', p, 'recall:', r, 'f1:', f1)
        print("Train Loss: ", train_loss, " Evaluation: ", test_loss)
        es.new_loss(test_loss)
        if es.if_stop():
            print('Start over fitting')
            break

    return gold_list, pred_list
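
# The variant above expects a class_weight vector of length NUM_CLASS for nn.BCELoss.
# A hedged sketch of one plausible way to build it (inverse label frequency, normalised
# to mean 1), assuming y_train is a one-hot / multi-hot array; the scheme actually used
# upstream is not shown here.
def make_class_weight(y_train):
    counts = np.asarray(y_train, dtype=np.float64).sum(axis=0)  # positives per class
    weight = counts.sum() / (counts + 1e-6)                     # inverse frequency
    return (weight / weight.mean()).tolist()                    # normalise to mean 1
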
def one_fold(fold_path):

    vocab_size = 20000
    pad_len = 30
    batch_size = 64
    embedding_dim = 200
    hidden_dim = 800
    num_labels = NUM_CLASS

    X, y = cbet_data(os.path.join(fold_path, 'train.csv'))

    train_index, dev_index = stratified_shuffle_split(X, y)
    y = np.asarray(y)
    X_train, X_dev = [X[i] for i in train_index], [X[i] for i in dev_index]
    y_train, y_dev = y[train_index], y[dev_index]

    word2id, id2word = build_vocab(X_train, vocab_size)
    # __X, __y, __pad_len, __word2id, __num_labels
    train_data = DataSet(X_train, y_train, pad_len, word2id, num_labels)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

    dev_data = DataSet(X_dev, y_dev, pad_len, word2id, num_labels)
    dev_loader = DataLoader(dev_data, batch_size=batch_size, shuffle=True)

    X_test, y_test = cbet_data(os.path.join(fold_path, 'test.csv'))
    test_data = DataSet(X_test, y_test, pad_len, word2id, num_labels)
    test_loader = DataLoader(test_data, batch_size=batch_size)

    model = AttentionLSTMClassifier(embedding_dim,
                                    hidden_dim,
                                    vocab_size,
                                    word2id,
                                    num_labels,
                                    batch_size,
                                    use_att=True,
                                    soft_last=True)
    model.load_glove_embedding(id2word)
    model.cuda()

    optimizer = optim.Adam(model.parameters())
    loss_criterion = nn.BCELoss()
    es = EarlyStop(2)
    old_model = None
    for epoch in range(10):
        print('Epoch:', epoch, '===================================')
        train_loss = 0
        for i, (data, seq_len, label) in enumerate(train_loader):
            data, label, seq_len = sort_batch(data, label, seq_len.view(-1))
            y_pred = model(Variable(data).cuda(), seq_len)
            optimizer.zero_grad()
            loss = loss_criterion(y_pred, Variable(label).cuda())
            loss.backward()
            optimizer.step()
            train_loss += loss.data[0]
        pred_list = []
        gold_list = []
        test_loss = 0
        # evaluation
        for i, (data, seq_len, label) in enumerate(dev_loader):
            data, label, seq_len = sort_batch(data, label, seq_len.view(-1))
            y_pred = model(Variable(data, volatile=True).cuda(), seq_len)
            loss = loss_criterion(y_pred,
                                  Variable(label, volatile=True).cuda())
            test_loss += loss.data[0]
            pred_list.append(y_pred.data.cpu().numpy())
            gold_list.append(label.numpy())

        print("Train Loss: ", train_loss, " Eval Loss: ", test_loss)
        es.new_loss(test_loss)
        if es.if_stop():
            print('Early stopping: model is starting to overfit')
            del model
            model = old_model  # revert to the snapshot from the previous epoch
            break

        # snapshot the weights only after deciding to continue, so a later
        # revert actually restores the previous epoch's model
        old_model = copy.deepcopy(model)

    # testing
    pred_list = []
    gold_list = []
    test_loss = 0
    for i, (data, seq_len, label) in enumerate(test_loader):
        data, label, seq_len = sort_batch(data, label, seq_len.view(-1))
        y_pred = model(Variable(data, volatile=True).cuda(), seq_len)
        loss = loss_criterion(y_pred, Variable(label, volatile=True).cuda())
        test_loss += loss.data[0]
        pred_list.append(y_pred.data.cpu().numpy())
        gold_list.append(label.numpy())

    return np.concatenate(pred_list, axis=0), np.concatenate(gold_list, axis=0)
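
# The variant above relies on a stratified_shuffle_split helper to carve a dev set out
# of the training fold. A sketch of the assumed behaviour, built on sklearn's
# StratifiedShuffleSplit (the dev fraction and seed are guesses):
from sklearn.model_selection import StratifiedShuffleSplit

def stratified_shuffle_split(X, y, dev_size=0.1, seed=0):
    labels = np.asarray(y)
    if labels.ndim > 1:  # reduce one-hot / multi-hot labels to a single class id
        labels = labels.argmax(axis=1)
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=dev_size, random_state=seed)
    train_index, dev_index = next(splitter.split(np.zeros((len(labels), 1)), labels))
    return train_index, dev_index
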
def one_fold(X_train, y_train, X_dev, y_dev):
    num_labels = NUM_CLASS
    vocab_size = 30000
    pad_len = 40
    batch_size = 64
    embedding_dim = 200
    hidden_dim = 600
    __use_unk = False

    word2id, id2word = build_vocab(X_train, vocab_size)

    train_data = DataSet(X_train,
                         y_train,
                         pad_len,
                         word2id,
                         num_labels,
                         use_unk=__use_unk)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

    dev_data = DataSet(X_dev,
                       y_dev,
                       pad_len,
                       word2id,
                       num_labels,
                       use_unk=__use_unk)
    dev_loader = DataLoader(dev_data, batch_size=batch_size, shuffle=False)

    # test_data = TestDataSet(X_test, pad_len, word2id, num_labels, use_unk=__use_unk)
    # test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

    model = AttentionLSTMClassifier(embedding_dim,
                                    hidden_dim,
                                    vocab_size,
                                    word2id,
                                    num_labels,
                                    batch_size,
                                    use_att=True,
                                    soft_last=False)
    model.load_glove_embedding(id2word)
    model.cuda()
    es = EarlyStop(2)
    optimizer = optim.Adam(model.parameters(), lr=1e-5)
    loss_criterion = nn.MSELoss()
    old_model = None
    for epoch in range(20):
        print('Epoch:', epoch, '===================================')
        train_loss = 0
        model.train()
        for i, (data, seq_len, label) in enumerate(train_loader):
            data, label, seq_len = sort_batch(data, label, seq_len.view(-1))
            y_pred = model(Variable(data).cuda(), seq_len)
            #roc_reward = roc_auc_score(label.numpy().argmax(axis=1), y_pred.data.cpu().numpy()[:, 1])
            optimizer.zero_grad()
            loss = loss_criterion(y_pred, Variable(label).cuda())
            # optionally scaled by the commented-out reward: * Variable(torch.FloatTensor([roc_reward])).cuda()
            loss.backward()
            optimizer.step()
            train_loss += loss.data[0] * data.size(0)  # batch-mean loss scaled by the actual batch size

        pred_list = []
        gold_list = []
        test_loss = 0
        model.eval()
        for _, (_data, _seq_len, _label) in enumerate(dev_loader):
            data, label, seq_len = sort_batch(_data, _label, _seq_len.view(-1))
            y_pred = model(Variable(data, volatile=True).cuda(), seq_len)
            loss = loss_criterion(y_pred, Variable(label).cuda())
            test_loss += loss.data[0] * data.size(0)  # batch-mean loss scaled by the actual batch size
            y_pred = y_pred.data.cpu().numpy()
            pred_list.append(y_pred)  # x[np.where( x > 3.0 )]
            gold_list.append(label.numpy())

        # pred_list_2 = np.concatenate(pred_list, axis=0)[:, 1]
        pred_list = np.concatenate(pred_list, axis=0)
        gold_list = np.concatenate(gold_list, axis=0)
        # roc = roc_auc_score(gold_list, pred_list_2)
        # print('roc:', roc)
        # a = accuracy_score(gold_list, pred_list)
        # p = precision_score(gold_list, pred_list, average='binary')
        # r = recall_score(gold_list, pred_list, average='binary')
        # f1 = f1_score(gold_list, pred_list, average='binary')
        # print('accuracy:', a, 'precision_score:', p, 'recall:', r, 'f1:', f1)
        print("Train Loss: ", train_loss / len(train_data), " Evaluation: ",
              test_loss / len(dev_data))
        es.new_loss(test_loss)
        if old_model is not None:
            del old_model, old_pred_list
            old_model = copy.deepcopy(model)
            old_pred_list = copy.deepcopy(pred_list)

        else:
            old_model = copy.deepcopy(model)
            old_pred_list = copy.deepcopy(pred_list)

        if es.if_stop():
            print('Start over fitting')
            del model
            model = old_model
            pred_list = old_pred_list
            torch.save(model.state_dict(),
                       open(os.path.join('checkpoint', 'cbet.model'), 'wb'))
            with open('checkpoint/some_data.pkl', 'wb') as f:
                pickle.dump([word2id, id2word], f)
            break

    return gold_list, pred_list, model, pad_len, word2id, num_labels
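
# A hedged sketch of reloading the artefacts saved by the variant above for inference;
# the constructor arguments must match those used during training (the defaults below
# mirror that variant and are otherwise assumptions).
def load_for_inference(embedding_dim=200, hidden_dim=600, vocab_size=30000,
                       batch_size=64, num_labels=NUM_CLASS):
    with open('checkpoint/some_data.pkl', 'rb') as f:
        word2id, id2word = pickle.load(f)
    model = AttentionLSTMClassifier(embedding_dim, hidden_dim, vocab_size, word2id,
                                    num_labels, batch_size, use_att=True, soft_last=False)
    model.load_state_dict(torch.load(os.path.join('checkpoint', 'cbet.model')))
    model.cuda()
    model.eval()
    return model, word2id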