def load_paragraph_data():
    """Load the val/test sentence-split datasets, index-encode their tokens,
    and return them together with the pickled vocabulary.

    Returns:
        (X_val, y_val, X_test, y_test, dictionary) where X_* are index-encoded
        token sequences, y_* are numpy label arrays, and `dictionary` carries
        the `word2idx` mapping used for encoding.
    """
    val_df = pd.read_json(p.sent_split_dir + "val_clustered_sent_split.json")
    test_df = pd.read_json(p.sent_split_dir + "test_clustered_sent_split.json")

    # Vocabulary produced during preprocessing; provides word2idx.
    dictionary = pd.read_pickle(p.dict_path)

    X_val = TextDataset._text2idx(val_df.tokens, dictionary.word2idx)
    X_test = TextDataset._text2idx(test_df.tokens, dictionary.word2idx)

    y_val = np.array(val_df.label.values)
    y_test = np.array(test_df.label.values)

    return X_val, y_val, X_test, y_test, dictionary
def check_loss_and_accuracy(grouped):
    """Evaluate the global `model` on grouped paragraph batches.

    Each group yields one document-level prediction (labels within a group are
    assumed identical — only labels[0] is used; TODO confirm).

    Returns:
        (mean loss, accuracy, macro precision, macro recall, macro F1,
         confusion matrix).
    """
    losses = []
    y_hat = []
    y_true = []
    for _, group in grouped:
        batch_tokens = TextDataset._text2idx(group.tokens, dictionary.word2idx)
        batch_labels = np.array(group.label.values)
        batch_tokens, batch_labels = process_batch(batch_tokens, batch_labels)
        # Attention pooling returns (logits, attention); others just logits.
        if config.para_pooling == 'attn':
            logits, _ = model.forward(batch_tokens)
        else:
            logits = model.forward(batch_tokens)

        batch_labels = batch_labels.view(batch_labels.shape[0], -1)
        batch_loss = criterion(logits.cuda(), batch_labels[0])
        losses.append(batch_loss.item())

        _, predicted = torch.max(logits, 1)
        y_hat.append(predicted.item())
        y_true.append(batch_labels[0].item())

    y_hat = np.array(y_hat)
    y_true = np.array(y_true)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_hat, average='macro')
    return (np.mean(np.array(losses)), accuracy_score(y_true, y_hat),
            precision, recall, f1, confusion_matrix(y_true, y_hat))
def get_prediction(grouped, combine):
    """Collapse per-paragraph predictions of `word_model` into one prediction
    per document group.

    combine == 'majority': predict 1 when at least half of the paragraph
        predictions are non-zero.
    combine == 'avg': predict 1 when the mean softmax probability of class 0
        falls below 0.5.

    Returns:
        (predictions, targets) as parallel Python lists.
    """
    doc_preds = []
    doc_targets = []
    word_model.eval()

    for _, group in grouped:
        batch = TextDataset._text2idx(group.tokens, word2idx)
        batch, gold = process_batch(batch, np.array(group.label.values))
        logits, hidden = word_model.forward(batch)
        probs = F.softmax(logits, dim=1)
        _, para_preds = torch.max(probs, 1)

        if combine == 'majority':
            positives = torch.nonzero(para_preds).size(0)
            verdict = 1 if positives >= para_preds.shape[0] / 2 else 0
        elif combine == 'avg':
            verdict = 1 if torch.mean(probs, 0)[0].item() < 0.5 else 0

        # Labels within a group are assumed identical; take the first.
        doc_preds.append(verdict)
        doc_targets.append(gold[0].item())

    return doc_preds, doc_targets
def get_prediction(grouped, word2idx, word_attn, sent_attn, combine):
    """Document-level predictions from the hierarchical (word+sentence
    attention) model, one per group.

    combine == 'majority': predict 1 when at least half the paragraph
        predictions are non-zero.
    combine == 'avg': predict 1 when the mean exp(score) of class 0 drops
        below 0.5 (scores are exponentiated — presumably log-probabilities;
        TODO confirm against model_forward).

    Returns:
        (predictions, targets) as parallel Python lists.
    """
    doc_preds = []
    doc_targets = []
    word_attn.eval()
    sent_attn.eval()

    for _, group in grouped:
        batch = TextDataset._text2idx(group.tokens, word2idx)
        batch, gold = process_batch(batch, np.array(group.label.values))
        scores = model_forward(config.para_pooling, word_attn, sent_attn,
                               batch)
        _, para_preds = torch.max(scores, 1)

        if combine == 'majority':
            positives = torch.nonzero(para_preds).size(0)
            verdict = 1 if positives >= para_preds.shape[0] / 2 else 0
        elif combine == 'avg':
            mean_class0 = torch.mean(torch.exp(scores), 0)[0].item()
            verdict = 1 if mean_class0 < 0.5 else 0

        # Labels within a group are assumed identical; take the first.
        doc_preds.append(verdict)
        doc_targets.append(gold[0].item())

    return doc_preds, doc_targets
# 示例#5 (Example #5) — marker (and a "0" vote count) left over from the
# snippet site this code was scraped from; kept as a comment so the file parses.
def get_prediction_slength(grouped):
    """Predict each document group and also record its total token count.

    Returns:
        (preds, label_list, slength_list) — per-group predicted class, gold
        label, and summed token-sequence lengths, as parallel lists.
    """
    preds = []
    label_list = []
    slength_list = []
    for _, group in grouped:
        # Total number of tokens across all sequences in this group.
        slength_list.append(np.sum([len(seq) for seq in group.tokens]))
        batch = TextDataset._text2idx(group.tokens, dictionary.word2idx)
        batch, gold = process_batch(batch, np.array(group.label.values))
        # Attention pooling returns (logits, attn, attn); others just logits.
        if config.pooling == 'attn':
            logits, _, _ = model.forward(batch)
        else:
            logits = model.forward(batch)
        _, logits = torch.max(logits, 1)
        preds.append(logits.item())
        label_list.append(gold[0].item())
    return preds, label_list, slength_list
def check_loss_and_accuracy(grouped):
    """Evaluate the global `model` on grouped batches.

    Fixes two crashes in the original:
      * `loss.append(loss.item())` was called while `loss` was still the
        accumulator list (and before the criterion loss was computed) —
        the loss is now computed first and appended to `loss_list`.
      * `labels` was reused as both the accumulator list and the per-batch
        label array, so `labels.append(...)` hit the array — the
        accumulator is now `label_list`.

    Returns:
        (mean loss, accuracy, per-class precision, recall, F1 arrays,
         confusion matrix).
    """
    loss_list = []
    preds = []
    label_list = []
    for name, group in grouped:
        tokens = TextDataset._text2idx(group.tokens, dictionary.word2idx)
        labels = np.array(group.label.values)
        tokens, labels = process_batch(tokens, labels)
        y_pred = model.forward(tokens)

        # Compute the loss BEFORE recording it.
        loss = criterion(y_pred.cuda(), labels[0])
        loss_list.append(loss.item())

        _, y_pred = torch.max(y_pred, 1)
        preds.append(np.ndarray.flatten(y_pred.data.cpu().numpy()))
        label_list.append(np.ndarray.flatten(labels[0]))
    # Flatten per-batch arrays into single 1-D arrays.
    preds = np.array([item for sublist in preds for item in sublist])
    label_list = np.array(
        [item for sublist in label_list for item in sublist])
    precision, recall, f1, _ = precision_recall_fscore_support(label_list,
                                                               preds)
    return np.mean(np.array(loss_list)), accuracy_score(
        label_list, preds), precision, recall, f1, confusion_matrix(
            label_list, preds)
def check_loss_and_accuracy(grouped, model, dictionary):
    """Compute accuracy, per-class precision/recall/F1 arrays, and the
    confusion matrix for `model` over grouped batches.

    NOTE(review): if config.pooling is neither 'attn' nor 'ensem' the
    prediction from the previous iteration is silently reused (original
    behavior, preserved).
    """
    predictions = []
    gold = []
    for _, group in grouped:
        batch = TextDataset._text2idx(group.tokens, dictionary.word2idx)
        batch, labs = process_batch(batch, np.array(group.label.values))
        if config.pooling == 'attn':
            logits, _, _ = model.forward(batch)
        elif config.pooling == 'ensem':
            logits = model.forward(batch)

        labs = labs.view(labs.shape[0], -1)

        _, logits = torch.max(logits, 1)
        predictions.append(logits.item())
        gold.append(labs[0].item())
    predictions = np.array(predictions)
    gold = np.array(gold)
    precision, recall, f1, _ = precision_recall_fscore_support(gold,
                                                               predictions)
    return (accuracy_score(gold, predictions), precision, recall, f1,
            confusion_matrix(gold, predictions))
def train_early_stopping(epoch_number):
    """Run one training epoch, evaluate on val/test, decay the LR when the
    val loss plateaus, and checkpoint the model.

    Relies on module globals: train_grouped/val_grouped/test_grouped,
    dictionary, model, optimizer, config, p, num_batches, plus the helpers
    process_batch/train_data/check_loss_and_accuracy/time_since.
    Mutates globals `best_val_loss` and `best_acc`.
    """
    global best_val_loss, best_acc

    loss_epoch = []
    i = 1
    batch_start = time.time()
    for name, group in train_grouped:
        # print(group.tokens.values)
        tokens = TextDataset._text2idx(group.tokens, dictionary.word2idx)
        labels = np.array(group.label.values)
        try:
            tokens, labels = process_batch(tokens, labels)
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit; narrowed while keeping the debug-dump-and-exit behavior.
        except Exception:
            print(tokens)
            sys.exit(0)

        loss = train_data(tokens, labels)
        loss_epoch.append(loss)
        # print loss every n passes
        if i % (p.print_loss_every * 5) == 0:
            print('| epoch   %d | %d/%d batches | ms/batch (%s) | loss %f' %
                  (epoch_number, i % (num_batches + 1), num_batches,
                   time_since(batch_start), np.mean(loss_epoch)))
            batch_start = time.time()
        i += 1

    # word_encoder.eval()
    # sent_encoder.eval()
    model.eval()

    print('-' * 89)
    val_loss, val_acc, precision, recall, f1, conf_matrix = check_loss_and_accuracy(
        val_grouped)
    print(
        '| val set result | valid loss (pure) {:5.4f} | Acc {:8.4f} | Precision {:8.4f} | Recall {:8.4f} '
        '| F1-score {:8.4f}'.format(val_loss, val_acc, precision, recall, f1))
    print('The confusion matrix is: ')
    print(str(conf_matrix))
    print('-' * 89)

    test_loss, test_acc, precision, recall, f1, conf_matrix = check_loss_and_accuracy(
        test_grouped)
    print(
        '| test set result | valid loss (pure) {:5.4f} | Acc {:8.4f} | Precision {:8.4f} | Recall {:8.4f} '
        '| F1-score {:8.4f}'.format(test_loss, test_acc, precision, recall,
                                    f1))
    print('The confusion matrix is: ')
    print(str(conf_matrix))
    print('-' * 89)

    directory = "./experiments/%s/models/" % config.exp_num

    if not os.path.exists(directory):
        os.makedirs(directory)

    if not best_val_loss or val_loss < best_val_loss:
        best_val_loss = val_loss
    else:  # if loss doesn't go down, divide the learning rate by 5.
        for param_group in optimizer.param_groups:
            param_group['lr'] = param_group['lr'] * 0.2
    # Keep a separate checkpoint of the best-accuracy model.
    if not best_acc or val_acc > best_acc:
        with open(
                directory + 'para_{}.best_acc.pt'.format(config.para_pooling),
                'wb') as f:
            torch.save(model, f)
        best_acc = val_acc
    # Always snapshot the model (and optimizer state) for this epoch.
    with open(
            directory + 'para_{}.epoch-{:02d}.pt'.format(
                config.para_pooling, epoch_number), 'wb') as f:
        torch.save(model, f)

    with open("./experiments/{}/optimizer.pt".format(config.exp_num),
              'wb') as f:
        torch.save(optimizer.state_dict(), f)
def train_early_stopping(epoch_number):
    """Run one training epoch of the paragraph-ensemble model, evaluate on
    val/test, decay the LR when the val loss plateaus, and checkpoint.

    Relies on module globals: train_grouped/val_grouped/test_grouped,
    dictionary, model, word_attn, sent_attn, model_optimizer, config, p,
    num_batches, plus process_batch/train_data/check_loss_and_accuracy/
    time_since. Mutates globals `best_val_loss` and `best_acc`.

    Fix: the best-accuracy checkpoint previously compared and stored
    `val_loss` in `best_acc` (an accuracy tracker); it now uses `val_acc`,
    matching the sibling train_early_stopping implementation.
    """
    global best_val_loss, best_acc
    start = time.time()
    loss_epoch = []
    i = 1
    batch_start = time.time()
    for name, group in train_grouped:
        # print(group.tokens.values)
        tokens = TextDataset._text2idx(group.tokens, dictionary.word2idx)
        labels = np.array(group.label.values)
        tokens, labels = process_batch(tokens, labels)

        loss = train_data(tokens, labels)
        loss_epoch.append(loss)
        # print loss every n passes
        if i % (p.print_loss_every * 5) == 0:
            print('| epoch   %d | %d/%d batches | ms/batch (%s) | loss %f' %
                  (epoch_number, i % (num_batches + 1), num_batches,
                   time_since(batch_start), np.mean(loss_epoch)))
            batch_start = time.time()
        i += 1

    word_attn.eval()
    sent_attn.eval()
    model.eval()
    print('-' * 89)
    val_loss, val_acc, precision, recall, f1, conf_matrix = check_loss_and_accuracy(
        val_grouped)
    print('| val set loss  %f | time  %s | Acc  %f' %
          (val_loss, time_since(start), val_acc) + "| Precision: " +
          str(precision) + " | Recall: " + str(recall) + " | F1-score: " +
          str(f1))
    print('The confusion matrix is: ')
    print(str(conf_matrix))
    print('-' * 89)

    test_loss, test_acc, precision, recall, f1, conf_matrix = check_loss_and_accuracy(
        test_grouped)
    print('| test set loss:  %f| Acc  %f ' % (test_loss, test_acc) +
          "| Precision: " + str(precision) + " | Recall: " + str(recall) +
          " | F1-score: " + str(f1))
    print('The confusion matrix is: ')
    print(str(conf_matrix))
    print('-' * 89)

    directory = "./experiments/%s/models/" % config.exp_num

    if not os.path.exists(directory):
        os.makedirs(directory)

    if not best_val_loss or val_loss < best_val_loss:
        best_val_loss = val_loss
    else:  # if loss doesn't go down, divide the learning rate by 5.
        for param_group in model_optimizer.param_groups:
            param_group['lr'] = param_group['lr'] * 0.2
    # Track the best validation ACCURACY (was erroneously val_loss).
    if not best_acc or val_acc > best_acc:
        with open(directory + p.para_ensem_path[:-3] + '.best_acc.pt',
                  'wb') as f:
            torch.save(model, f)
        best_acc = val_acc
    with open(
            directory + p.para_ensem_path[:-3] +
            '.epoch-{:02d}.pt'.format(epoch_number), 'wb') as f:
        torch.save(model, f)