Example #1
def train_k_fold(k):
    '''
    2. 3
    set up Dataloader for batching
    '''
    fold_size = int(len(raw_train_toefl) / k)
    for i in range(k):
        val_indices = [z for z in range(i * fold_size, (i + 1) * fold_size)]
        embedded_train_toefl = [[sentences[j], labels[j]]
                                for j in range(len(sentences))
                                if j not in val_indices]
        embedded_val_toefl = [[sentences[j], labels[j]]
                              for j in range(len(sentences))
                              if j in val_indices]

        train_dataset_toefl = TextDataset(
            [example[0] for example in embedded_train_toefl],
            [example[1] for example in embedded_train_toefl])
        val_dataset_toefl = TextDataset(
            [example[0] for example in embedded_val_toefl],
            [example[1] for example in embedded_val_toefl])

        # Data-related hyperparameters
        batch_size = 64
        # Set up a DataLoader for the training, validation, and test dataset
        train_dataloader_toefl = DataLoader(dataset=train_dataset_toefl,
                                            batch_size=batch_size,
                                            shuffle=True,
                                            collate_fn=TextDataset.collate_fn)
        val_dataloader_toefl = DataLoader(dataset=val_dataset_toefl,
                                          batch_size=batch_size,
                                          collate_fn=TextDataset.collate_fn)

        clf, crit = train_model(train_dataloader_toefl, val_dataloader_toefl,
                                i)
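Every snippet on this page passes TextDataset.collate_fn to DataLoader, but the class itself is never shown. Below is only a minimal sketch of what such a dataset and padding collate function could look like for the two-argument case (embedded sentences plus labels); the attribute names and the zero-padding scheme are assumptions, not the original implementation.

import torch
from torch.utils.data import Dataset

class TextDataset(Dataset):
    # Hypothetical dataset holding pre-embedded sentences and their labels.
    def __init__(self, embedded_sentences, labels):
        assert len(embedded_sentences) == len(labels)
        self.embedded_sentences = embedded_sentences  # list of (seq_len, embed_dim) float tensors
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.embedded_sentences[idx], self.labels[idx]

    @staticmethod
    def collate_fn(batch):
        # Zero-pad variable-length sequences to the longest one in the batch and
        # return (padded_text, lengths, labels), matching the 3-tuples unpacked
        # in the training loops further down the page.
        texts, labels = zip(*batch)
        lengths = torch.tensor([t.shape[0] for t in texts])
        padded = torch.zeros(len(texts), int(lengths.max()), texts[0].shape[1])
        for i, t in enumerate(texts):
            padded[i, :t.shape[0]] = t
        return padded, lengths, torch.tensor(labels)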
Example #2
def predict_vua(rnn_clf):
    preds = {}
    for (embed, txt_sent_id) in embedded_test_vua:
        ex_data = TextDataset([embed], [0])
        ex_dataloader = DataLoader(dataset=ex_data,
                                   batch_size=1,
                                   collate_fn=TextDataset.collate_fn)
        pred = predict(ex_dataloader, rnn_clf, using_GPU)
        preds[txt_sent_id] = pred.item()
    return preds
Example #3
def predict_vua_allpos(RNNseq_model):
    preds = {}
    for (embed, pos_seq, txt_sent_id) in embedded_test_vua:
        ex_data = TextDataset([embed], [pos_seq], [[0 for pos in pos_seq]])
        ex_dataloader = DataLoader(dataset=ex_data,
                                   batch_size=1,
                                   collate_fn=TextDataset.collate_fn)
        pred = predict(ex_dataloader, RNNseq_model, using_GPU)
        preds[txt_sent_id] = pred[0][0]
    return preds
Example #4
def train_k_fold(k):
    clfs = []
    fold_size = int(len(raw_train_vua) / k)

    for i in range(k):
        val_indices = [z for z in range(i * fold_size, (i + 1) * fold_size)]
        embedded_train_vua = [[sentences[j], poss[j], labels[j]]
                              for j in range(len(sentences))
                              if j not in val_indices]
        embedded_val_vua = [[sentences[j], poss[j], labels[j]]
                            for j in range(len(sentences)) if j in val_indices]
        '''
        2. 3
        set up Dataloader for batching
        '''
        # Separate the input (embedded_sequence) and labels in the indexed train sets.
        # raw_train_vua: sentence, label_seq, pos_seq
        # embedded_train_vua: embedded_sentence, pos, labels
        train_dataset_vua = TextDataset(
            [example[0] for example in embedded_train_vua],
            [example[1] for example in embedded_train_vua],
            [example[2] for example in embedded_train_vua])
        val_dataset_vua = TextDataset(
            [example[0] for example in embedded_val_vua],
            [example[1] for example in embedded_val_vua],
            [example[2] for example in embedded_val_vua])

        # Data-related hyperparameters
        batch_size = 64
        # Set up a DataLoader for the training and validation sets
        train_dataloader_vua = DataLoader(dataset=train_dataset_vua,
                                          batch_size=batch_size,
                                          shuffle=True,
                                          collate_fn=TextDataset.collate_fn)
        val_dataloader_vua = DataLoader(dataset=val_dataset_vua,
                                        batch_size=batch_size,
                                        collate_fn=TextDataset.collate_fn)
        clf, crit = train_model(train_dataloader_vua, val_dataloader_vua, i)
        clfs.append(clf)
    return clfs
Example #5
sentences = [
    example[0]  # assumed first element (the snippet starts mid-expression)
    for example in raw_train_vua
]
labels = [example[2] for example in raw_train_vua]

assert (len(sentences) == len(labels))
'''
2. 3
set up Dataloader for batching
'''
# 10 folds takes up too much RAM, just do 1
fold_size = int(len(raw_train_vua) / 10)
embedded_train_vua = [[sentences[i], labels[i]]
                      for i in range(fold_size, len(sentences))]
embedded_val_vua = [[sentences[i], labels[i]] for i in range(fold_size)]

train_dataset_vua = TextDataset([example[0] for example in embedded_train_vua],
                                [example[1] for example in embedded_train_vua])
val_dataset_vua = TextDataset([example[0] for example in embedded_val_vua],
                              [example[1] for example in embedded_val_vua])

# Data-related hyperparameters
batch_size = 64
# Set up a DataLoader for the training, validation, and test dataset
train_dataloader_vua = DataLoader(dataset=train_dataset_vua,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  collate_fn=TextDataset.collate_fn)
val_dataloader_vua = DataLoader(dataset=val_dataset_vua,
                                batch_size=batch_size,
                                collate_fn=TextDataset.collate_fn)

# Test set
Example #6
optimal_accs = []
predictions_all = []
for i in range(10):
    '''
    2. 3
    set up Dataloader for batching
    '''
    training_sentences = []
    training_labels = []
    training_poss = []
    for j in range(10):
        if j != i:
            training_sentences.extend(ten_folds[j][0])
            training_poss.extend(ten_folds[j][1])
            training_labels.extend(ten_folds[j][2])
    training_dataset_mohx = TextDataset(training_sentences, training_poss,
                                        training_labels)
    val_dataset_mohx = TextDataset(ten_folds[i][0], ten_folds[i][1],
                                   ten_folds[i][2])

    # Data-related hyperparameters
    batch_size = 10
    # Set up a DataLoader for the training, validation, and test dataset
    train_dataloader_mohx = DataLoader(dataset=training_dataset_mohx,
                                       batch_size=batch_size,
                                       shuffle=True,
                                       collate_fn=TextDataset.collate_fn)
    val_dataloader_mohx = DataLoader(dataset=val_dataset_mohx,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     collate_fn=TextDataset.collate_fn)
    """
Example #7
logging.info("Loaded best validation loss: {}".format(best_val_loss))
logging.info("*" * 50)
prev_train_loss = 100

try:
    while True:
        for i in range(10):
            logging.info("10 fold validation turn change...")
            first_in_fold = True
            training_sentences = []
            training_classes = []
            for j in range(10):
                if j != i:
                    training_sentences.extend(ten_foldsd[j][0])
                    training_classes.extend(ten_foldsd[j][1])
            training_dataset_rcc = TextDataset(training_sentences,
                                               training_classes)
            val_dataset_rcc = TextDataset(ten_foldsd[i][0], ten_foldsd[i][1])
            train_dataloader_rcc = DataLoader(
                dataset=training_dataset_rcc,
                batch_size=args.batch_size,
                shuffle=True,
                collate_fn=TextDataset.collate_fn)
            val_dataloader_rcc = DataLoader(dataset=val_dataset_rcc,
                                            batch_size=args.batch_size,
                                            shuffle=False,
                                            collate_fn=TextDataset.collate_fn)
            epoch_base += args.num_epochs
            for epoch in range(args.num_epochs):
                logging.info("Starting epoch {}".format(epoch + 1 +
                                                        epoch_base))
                for (example_text, labels) in train_dataloader_rcc:
Example #8
def trainData():
    device = t.device('cuda:0' if t.cuda.is_available() else 'cpu')
    lang_dataset = TextDataset()
    lang_dataloader = DataLoader(lang_dataset, shuffle=True)
    input_size = lang_dataset.input_lang_words
    hidden_size = 256
    output_size = lang_dataset.output_lang_words
    total_epoch = 20
    use_attn = False

    encoder = convert2Cuda(Encoder(input_size, hidden_size))
    decoder = convert2Cuda(Decoder(hidden_size, output_size, n_layers=2))

    param = list(encoder.parameters()) + list(decoder.parameters())
    optimizer = optim.Adam(param, 1e-3)
    plot_losses = []
    criterion = nn.NLLLoss()

    for epoch in range(total_epoch):
        start = time.time()
        running_loss = 0
        print_loss_total = 0
        total_loss = 0

        for i, data in enumerate(lang_dataloader):
            in_lang, out_lang = data
            in_lang = convert2Cuda(in_lang)
            out_lang = convert2Cuda(out_lang)
            in_lang = Variable(in_lang)
            out_lang = Variable(out_lang)
            encoder_outputs = Variable(t.zeros(MAX_LENGTH,
                                               encoder.hidden_size))
            encoder_outputs = convert2Cuda(encoder_outputs)
            encoder_hidden = encoder.initHidden()
            for ei in range(in_lang.size(1)):
                encoder_output, encoder_hidden = encoder(
                    in_lang[:, ei], encoder_hidden)
                encoder_outputs[ei] = encoder_output[0][0]
            decoder_input = Variable(t.LongTensor([[SOS_token]]))
            decoder_input = convert2Cuda(decoder_input)
            decoder_hidden = encoder_hidden

            loss = 0

            if not use_attn:
                for di in range(out_lang.size(1)):
                    decoder_output, decoder_hidden = decoder(
                        decoder_input, decoder_hidden)
                    loss += criterion(decoder_output, out_lang[:, di])
                    topv, topi = decoder_output.data.topk(1)
                    ni = topi[0][0]
                    decoder_input = Variable(t.LongTensor([[ni]]))
                    decoder_input = convert2Cuda(decoder_input)
                    if ni == EOS_token:
                        break
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.data[0]
            print_loss_total += loss.data[0]
            total_loss += loss.data[0]

            if (i + 1) % 100 == 0:
                # Average over the 100 iterations since the last report.
                print('{}/{},Loss:{:.6f}'.format(i + 1, len(lang_dataloader),
                                                 running_loss / 100))
                running_loss = 0
            if (i + 1) % 10 == 0:
                # Record a plot point averaged over the last 10 iterations.
                plot_loss = print_loss_total / 10
                plot_losses.append(plot_loss)
                print_loss_total = 0
        during = time.time() - start
        print('Finish {}/{},Loss:{:.6f},Time:{:.0f}s\n'.format(
            epoch + 1, total_epoch, total_loss / len(lang_dataset), during))
    showPlot(plot_losses)
    t.save(encoder.state_dict(), './model/encoder.pth')
    t.save(decoder.state_dict(), './model/decoder.pth')
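convert2Cuda is used throughout Example #8 but is not defined in the snippet; presumably it is a small helper along these lines (an assumption, not code from the original source):

import torch as t

def convert2Cuda(x):
    # Hypothetical helper: move a tensor or module to the GPU when one is available.
    return x.cuda() if t.cuda.is_available() else x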
Example #9
"""
Data Embedding
optional: Bert or Glove. Default Glove
"""
word2idx, idx2word = get_word2idx_idx2word(train_vocab)
glove_embeddings = get_embedding_matrix(word2idx,
                                        idx2word,
                                        normalization=False)

train_embedded_text, train_labels = embed_sentences(train_data, word2idx,
                                                    glove_embeddings)
test_embedded_text, test_labels = embed_sentences(test_data, word2idx,
                                                  glove_embeddings)
"""
Produce Dataset & DataLoader
"""
train_dataset = TextDataset(train_embedded_text, train_sample_ms,
                            train_sam_sen_ms, train_labels)
test_dataset = TextDataset(test_embedded_text, test_sample_ms, test_sam_sen_ms,
                           test_labels)

batch_size = 4
train_dataLoader = DataLoader(dataset=train_dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              collate_fn=TextDataset.collate_fn)
test_dataLoader = DataLoader(dataset=test_dataset,
                             batch_size=batch_size,
                             shuffle=True,
                             collate_fn=TextDataset.collate_fn)
"""
Model loading and training
"""
Example #10
optimal_ps = []
optimal_rs = []
optimal_accs = []
predictions_all = []
for i in range(10):
    '''
    2. 3
    set up Dataloader for batching
    '''
    training_sentences = []
    training_labels = []
    for j in range(10):
        if j != i:
            training_sentences.extend(ten_folds[j][0])
            training_labels.extend(ten_folds[j][1])
    training_dataset_poetry = TextDataset(training_sentences, training_labels)
    val_dataset_poetry = TextDataset(ten_folds[i][0], ten_folds[i][1])

    # Data-related hyperparameters
    batch_size = 20
    # Set up a DataLoader for the training, validation, and test dataset
    train_dataloader_poetry = DataLoader(dataset=training_dataset_poetry,
                                         batch_size=batch_size,
                                         shuffle=True,
                                         collate_fn=TextDataset.collate_fn)
    val_dataloader_poetry = DataLoader(dataset=val_dataset_poetry,
                                       batch_size=batch_size,
                                       shuffle=False,
                                       collate_fn=TextDataset.collate_fn)
    """
    3. Model training
Example #11
logging.info("*" * 50)
prev_train_loss = 999999

try:
    while True:
        for i in range(1000):
            logging.info("1000 fold validation turn change...")
            first_in_fold = True
            training_sentences = []
            training_labels = []
            for j in range(1000):
                if j != i:
                    training_sentences.extend(thousand_folds[j][0])
                    training_labels.extend(thousand_folds[j][1])
            training_dataset_rcc = TextDataset(training_sentences,
                                               training_labels, word2idx,
                                               glove_embeddings, elmo)
            val_dataset_rcc = TextDataset(thousand_folds[i][0],
                                          thousand_folds[i][1], word2idx,
                                          glove_embeddings, elmo)
            train_dataloader_rcc = DataLoader(
                dataset=training_dataset_rcc,
                batch_size=args.batch_size,
                shuffle=True,
                collate_fn=TextDataset.collate_fn)
            val_dataloader_rcc = DataLoader(dataset=val_dataset_rcc,
                                            batch_size=args.batch_size,
                                            shuffle=False,
                                            collate_fn=TextDataset.collate_fn)
            epoch_base += args.num_epochs
            for epoch in range(args.num_epochs):
Example #12
def train_model():
    optimal_f1s = []
    optimal_ps = []
    optimal_rs = []
    optimal_accs = []
    # predictions_all = []

    for i in tqdm(range(10)):
        '''
        2. 3
        set up Dataloader for batching
        '''
        training_sentences = []
        training_labels = []
        for j in range(10):
            if j != i:
                training_sentences.extend(ten_folds[j][0])
                training_labels.extend(ten_folds[j][1])
        training_dataset_trofi = TextDataset(training_sentences, training_labels)
        val_dataset_trofi = TextDataset(ten_folds[i][0], ten_folds[i][1])

        # Data-related hyperparameters
        batch_size = 10
        # Set up a DataLoader for the training, validation, and test dataset
        train_dataloader_trofi = DataLoader(dataset=training_dataset_trofi,
                                            batch_size=batch_size,
                                            shuffle=True,
                                            collate_fn=TextDataset.collate_fn)
        val_dataloader_trofi = DataLoader(dataset=val_dataset_trofi,
                                          batch_size=batch_size,
                                          shuffle=False,
                                          collate_fn=TextDataset.collate_fn)
        """
        3. Model training
        """
        '''
        3. 1 
        set up model, loss criterion, optimizer
        '''

        # Instantiate the model
        # embedding_dim = glove + elmo + suffix indicator
        # dropout1: dropout on input to RNN
        # dropout2: dropout in RNN; would be used if num_layers > 1
        # dropout3: dropout on hidden state of RNN to linear layer
        rnn_clf = RNNSequenceClassifier(num_classes=2, embedding_dim=300+1024+50, hidden_size=300,
                                        num_layers=1, bidir=True,
                                        dropout1=0.2, dropout2=0, dropout3=0)
        # Move the model to the GPU if available
        if using_GPU:
            rnn_clf = rnn_clf.cuda()
        # Set up criterion for calculating loss
        nll_criterion = nn.NLLLoss()
        # Set up an optimizer for updating the parameters of the rnn_clf
        rnn_clf_optimizer = optim.Adam(rnn_clf.parameters(), lr=0.001)
        # Number of epochs (passes through the dataset) to train the model for.
        num_epochs = 15

        '''
        3. 2
        train model
        '''
        training_loss = []
        val_loss = []
        training_f1 = []
        val_f1 = []
        val_p = []
        val_r = []
        val_acc = []
        # A counter for the number of gradient updates
        num_iter = 0
        train_dataloader = train_dataloader_trofi
        val_dataloader = val_dataloader_trofi
        model_index = 0
        for epoch in range(num_epochs):
            # print("Starting epoch {}".format(epoch + 1))
            for (example_text, example_lengths, labels) in train_dataloader:
                example_text = Variable(example_text)
                example_lengths = Variable(example_lengths)
                labels = Variable(labels)
                if using_GPU:
                    example_text = example_text.cuda()
                    example_lengths = example_lengths.cuda()
                    labels = labels.cuda()
                # predicted shape: (batch_size, 2)
                predicted = rnn_clf(example_text, example_lengths)
                batch_loss = nll_criterion(predicted, labels)
                rnn_clf_optimizer.zero_grad()
                batch_loss.backward()
                rnn_clf_optimizer.step()
                num_iter += 1
                # Calculate validation and training set loss and accuracy every 200 gradient updates
                if num_iter % 200 == 0:
                    avg_eval_loss, eval_accuracy, precision, recall, f1, fus_f1 = evaluate(val_dataloader, rnn_clf, nll_criterion, using_GPU)
                    val_loss.append(avg_eval_loss)
                    val_f1.append(f1)
                    val_p.append(precision)
                    val_r.append(recall)
                    val_acc.append(eval_accuracy.item())
                    # print(
                    #     "Iteration {}. Validation Loss {}. Validation Accuracy {}. Validation Precision {}. Validation Recall {}. Validation F1 {}. Validation class-wise F1 {}.".format(
                    #         num_iter, avg_eval_loss, eval_accuracy, precision, recall, f1, fus_f1))
                    # filename = f'../models/classification/TroFi_fold_{str(i)}_iter_{str(num_iter)}.pt'
                    # torch.save(rnn_clf, filename)
                    model_index += 1
    #                 avg_eval_loss, eval_accuracy, precision, recall, f1, fus_f1 = evaluate(train_dataloader, rnn_clf, nll_criterion, using_GPU)
    #                 training_loss.append(avg_eval_loss)
    #                 training_f1.append(f1)
    #                 print(
    #                     "Iteration {}. Training Loss {}. Training Accuracy {}. Training Precision {}. Training Recall {}. Training F1 {}. Training class-wise F1 {}.".format(
    #                         num_iter, avg_eval_loss, eval_accuracy, precision, recall, f1, fus_f1))

        """
        additional training!
        """
    #     rnn_clf_optimizer = optim.Adam(rnn_clf.parameters(), lr=0.0005)
    #     for epoch in range(num_epochs):
    #         print("Starting epoch {}".format(epoch + 1))
    #         for (example_text, example_lengths, labels) in train_dataloader:
    #             example_text = Variable(example_text)
    #             example_lengths = Variable(example_lengths)
    #             labels = Variable(labels)
    #             if using_GPU:
    #                 example_text = example_text.cuda()
    #                 example_lengths = example_lengths.cuda()
    #                 labels = labels.cuda()
    #             # predicted shape: (batch_size, 2)
    #             predicted = rnn_clf(example_text, example_lengths)
    #             batch_loss = nll_criterion(predicted, labels)
    #             rnn_clf_optimizer.zero_grad()
    #             batch_loss.backward()
    #             rnn_clf_optimizer.step()
    #             num_iter += 1
    #             # Calculate validation and training set loss and accuracy every 200 gradient updates
    #             if num_iter % 100 == 0:
    #                 avg_eval_loss, eval_accuracy, precision, recall, f1, fus_f1 = evaluate(val_dataloader, rnn_clf, nll_criterion, using_GPU)
    #                 val_loss.append(avg_eval_loss)
    #                 val_f1.append(f1)
    #                 val_p.append(precision)
    #                 val_r.append(recall)
    #                 val_acc.append(eval_accuracy)
    #                 print(
    #                     "Iteration {}. Validation Loss {}. Validation Accuracy {}. Validation Precision {}. Validation Recall {}. Validation F1 {}. Validation class-wise F1 {}.".format(
    #                         num_iter, avg_eval_loss, eval_accuracy, precision, recall, f1, fus_f1))
    #                 model_index += 1

        # print("Training done for fold {}".format(i))

        """
        3.3
        plot the training process: MET F1 and losses for validation and training dataset
        """
    #     plt.figure(0)
    #     plt.title('F1 for TroFI dataset on fold ' + str(i))
    #     plt.xlabel('iteration (unit:200)')
    #     plt.ylabel('F1')
    #     plt.plot(val_f1,'g')
    #     plt.plot(val_p,'r')
    #     plt.plot(val_r,'b')
    #     plt.plot(val_acc,'c')
    #     plt.plot(training_f1, 'b')
    #     plt.legend(['Validation F1', 'Validation precision', 'validaiton recall', 'validation accuracy', 'Training F1'], loc='upper right')
    #     plt.show()


    #     plt.figure(1)
    #     plt.title('Loss for TroFi dataset on fold ' + str(i))
    #     plt.xlabel('iteration (unit:200)')
    #     plt.ylabel('Loss')
    #     plt.plot(val_loss,'g')
    #     plt.plot(training_loss, 'b')
    #     plt.legend(['Validation loss', 'Training loss'], loc='upper right')
    #     plt.show()

        """
        store the best f1
        """
        # print('val_f1: ', val_f1)
        idx = 0
        if math.isnan(max(val_f1)):
            optimal_f1s.append(max(val_f1[6:]))
            idx = val_f1.index(optimal_f1s[-1])
            optimal_ps.append(val_p[idx])
            optimal_rs.append(val_r[idx])
            optimal_accs.append(val_acc[idx])
        else:
            optimal_f1s.append(max(val_f1))
            idx = val_f1.index(optimal_f1s[-1])
            optimal_ps.append(val_p[idx])
            optimal_rs.append(val_r[idx])
            optimal_accs.append(val_acc[idx])
    #     filename = '../models/LSTMSuffixElmoAtt_TroFi_fold_' + str(i) + '_epoch_' + str(idx) + '.pt'
    #     temp_model = torch.load(filename)
    #     print('best model: ', filename)
    #     predictions_all.extend(test(val_dataloader_TroFi, temp_model, using_GPU))
    return np.mean(np.array(optimal_ps)), np.mean(np.array(optimal_rs)), np.mean(np.array(optimal_f1s)), np.mean(np.array(optimal_accs))
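Assuming the surrounding globals (ten_folds, using_GPU, evaluate, RNNSequenceClassifier) are set up as in the other snippets, the fold-averaged metrics returned by train_model() could be consumed like this (a usage sketch, not part of the original code):

mean_p, mean_r, mean_f1, mean_acc = train_model()
print('TroFi 10-fold CV: P={:.3f} R={:.3f} F1={:.3f} Acc={:.3f}'.format(
    mean_p, mean_r, mean_f1, mean_acc))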
Example #13
    '''
    2. 3
    set up Dataloader for batching
    '''
    training_sentences = []
    training_labels = []
    training_poss = []
    training_paraphrase = []
    training_para_sets = []
    for j in range(10):
        if j != i:
            training_sentences.extend(ten_folds[j][0])
            training_poss.extend(ten_folds[j][1])
            training_labels.extend(ten_folds[j][2])
            training_paraphrase.extend(ten_folds[j][3])
            training_para_sets.extend(ten_folds[j][4])
    training_dataset_mohx = TextDataset(training_sentences, training_poss,
                                        training_labels, training_paraphrase,
                                        training_para_sets)
    val_dataset_mohx = TextDataset(ten_folds[i][0], ten_folds[i][1],
                                   ten_folds[i][2], ten_folds[i][3],
                                   ten_folds[i][4])

    # Data-related hyperparameters
    batch_size = 2
    # Set up a DataLoader for the training, validation, and test dataset
    train_dataloader_mohx = DataLoader(dataset=training_dataset_mohx,
                                       batch_size=batch_size,
                                       shuffle=True,
                                       collate_fn=TextDataset.collate_fn)
    val_dataloader_mohx = DataLoader(dataset=val_dataset_mohx,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     collate_fn=TextDataset.collate_fn)
Example #14
def train_model():
    optimal_f1s = []
    optimal_ps = []
    optimal_rs = []
    optimal_accs = []
    for i in tqdm(range(10)):
        '''
        2. 3
        set up Dataloader for batching
        '''
        training_sentences = []
        training_labels = []
        for j in range(10):
            if j != i:
                training_sentences.extend(ten_folds[j][0])
                training_labels.extend(ten_folds[j][1])
        training_dataset_mohX = TextDataset(training_sentences, training_labels)
        val_dataset_mohX = TextDataset(ten_folds[i][0], ten_folds[i][1])

        # Data-related hyperparameters
        batch_size = 10
        # Set up a DataLoader for the training, validation, and test dataset
        train_dataloader_mohX = DataLoader(dataset=training_dataset_mohX,
                                           batch_size=batch_size,
                                           shuffle=True,
                                           collate_fn=TextDataset.collate_fn)
        val_dataloader_mohX = DataLoader(dataset=val_dataset_mohX,
                                         batch_size=batch_size,
                                         shuffle=True,
                                         collate_fn=TextDataset.collate_fn)
        """
        3. Model training
        """
        '''
        3. 1 
        set up model, loss criterion, optimizer
        '''
        # Instantiate the model
        # embedding_dim = glove + elmo + suffix indicator
        # dropout1: dropout on input to RNN
        # dropout2: dropout in RNN; would be used if num_layers!=1
        # dropout3: dropout on hidden state of RNN to linear layer
        rnn_clf = RNNSequenceClassifier(num_classes=2, embedding_dim=300+1024+50,
                                        hidden_size=300, num_layers=1, bidir=True,
                                        dropout1=0.2, dropout2=0, dropout3=0.2)
        # Move the model to the GPU if available
        if using_GPU:
            rnn_clf = rnn_clf.cuda()
        # Set up criterion for calculating loss
        nll_criterion = nn.NLLLoss()
        # Set up an optimizer for updating the parameters of the rnn_clf
        rnn_clf_optimizer = optim.SGD(rnn_clf.parameters(), lr=0.02, momentum=0.9)
        # Number of epochs (passes through the dataset) to train the model for.
        num_epochs = 30

        '''
        3. 2
        train model
        '''
        training_loss = []
        val_loss = []
        training_f1 = []
        val_f1 = []
        val_p = []
        val_r = []
        val_acc = []

        # A counter for the number of gradient updates
        num_iter = 0
        train_dataloader = train_dataloader_mohX
        val_dataloader = val_dataloader_mohX
        for epoch in range(num_epochs):
            # print("Starting epoch {}".format(epoch + 1))
            for (example_text, example_lengths, labels) in train_dataloader:
                example_text = Variable(example_text)
                example_lengths = Variable(example_lengths)
                labels = Variable(labels)
                if using_GPU:
                    example_text = example_text.cuda()
                    example_lengths = example_lengths.cuda()
                    labels = labels.cuda()
                # predicted shape: (batch_size, 2)
                predicted = rnn_clf(example_text, example_lengths)
                batch_loss = nll_criterion(predicted, labels)
                rnn_clf_optimizer.zero_grad()
                batch_loss.backward()
                rnn_clf_optimizer.step()
                num_iter += 1
                # Calculate validation and training set loss and accuracy every 200 gradient updates
                if num_iter % 200 == 0:
                    avg_eval_loss, eval_accuracy, precision, recall, f1, fus_f1 = evaluate(val_dataloader, rnn_clf, nll_criterion, using_GPU)
                    val_loss.append(avg_eval_loss)
                    val_f1.append(f1)
                    val_p.append(precision)
                    val_r.append(recall)
                    val_acc.append(eval_accuracy.item())
                    # print(
                    #     "Iteration {}. Validation Loss {}. Validation Accuracy {}. Validation Precision {}. Validation Recall {}. Validation F1 {}. Validation class-wise F1 {}.".format(
                    #         num_iter, avg_eval_loss, eval_accuracy, precision, recall, f1, fus_f1))
                    # filename = f'../models/classification/MOHX_fold_{str(i)}_iter_{str(num_iter)}.pt'
                    # torch.save(rnn_clf, filename)
                    avg_eval_loss, eval_accuracy, precision, recall, f1, fus_f1 = evaluate(train_dataloader, rnn_clf, nll_criterion, using_GPU)
                    training_loss.append(avg_eval_loss)
                    training_f1.append(f1)
                    # print(
                    #     "Iteration {}. Training Loss {}. Training Accuracy {}. Training Precision {}. Training Recall {}. Training F1 {}. Training class-wise F1 {}.".format(
                    #         num_iter, avg_eval_loss, eval_accuracy, precision, recall, f1, fus_f1))
        # print("Training done for fold {}".format(i))

        # store the best f1
        idx = 0
        if math.isnan(max(val_f1)):
            optimal_f1s.append(max(val_f1[6:]))
            idx = val_f1.index(optimal_f1s[-1])
            optimal_ps.append(val_p[idx])
            optimal_rs.append(val_r[idx])
            optimal_accs.append(val_acc[idx])
        else:
            optimal_f1s.append(max(val_f1))
            idx = val_f1.index(optimal_f1s[-1])
            optimal_ps.append(val_p[idx])
            optimal_rs.append(val_r[idx])
            optimal_accs.append(val_acc[idx])
    return np.mean(np.array(optimal_ps)), np.mean(np.array(optimal_rs)), np.mean(np.array(optimal_f1s)), np.mean(np.array(optimal_accs))
    # print('F1 on MOH-X by 10-fold = ', optimal_f1s)
    # print('F1 on MOH-X = ', np.mean(np.array(optimal_f1s)))

    """
Example #15
def train_model():
    '''
    2. 3 10-fold cross validation
    '''
    # separate the embedded sentences, POS tags, and labels into separate lists to pass into TextDataset
    sentences = [example[0] for example in embedded_trofi]
    poss = [example[1] for example in embedded_trofi]
    labels = [example[2] for example in embedded_trofi]
    # ten_folds is a list of 10 tuples: (list_of_embedded_sentences, list_of_pos_sequences, list_of_corresponding_labels)
    ten_folds = []
    fold_size = int(3737 / 10)
    for i in range(10):
        ten_folds.append((sentences[i * fold_size:(i + 1) * fold_size],
                          poss[i * fold_size:(i + 1) * fold_size],
                          labels[i * fold_size:(i + 1) * fold_size]))

    idx2pos = {0: 'words that are not focus verbs', 1: 'focus verb'}

    optimal_f1s = []
    optimal_ps = []
    optimal_rs = []
    optimal_accs = []
    predictions_all = []
    for i in tqdm(range(10)):
        '''
        2. 3
        set up Dataloader for batching
        '''
        training_sentences = []
        training_labels = []
        training_poss = []
        for j in range(10):
            if j != i:
                training_sentences.extend(ten_folds[j][0])
                training_poss.extend(ten_folds[j][1])
                training_labels.extend(ten_folds[j][2])
        training_dataset_trofi = TextDataset(training_sentences, training_poss,
                                             training_labels)
        val_dataset_trofi = TextDataset(ten_folds[i][0], ten_folds[i][1],
                                        ten_folds[i][2])

        # Data-related hyperparameters
        batch_size = 10
        # Set up a DataLoader for the training, validation, and test dataset
        train_dataloader_trofi = DataLoader(dataset=training_dataset_trofi,
                                            batch_size=batch_size,
                                            shuffle=True,
                                            collate_fn=TextDataset.collate_fn)
        val_dataloader_trofi = DataLoader(dataset=val_dataset_trofi,
                                          batch_size=batch_size,
                                          shuffle=False,
                                          collate_fn=TextDataset.collate_fn)
        """
        3. Model training
        """
        '''
        3. 1 
        set up model, loss criterion, optimizer
        '''
        # Instantiate the model
        # embedding_dim = glove + elmo + suffix indicator
        # dropout1: dropout on input to RNN
        # dropout2: dropout in RNN; would be used if num_layers > 1
        # dropout3: dropout on hidden state of RNN to linear layer
        RNNseq_model = RNNSequenceModel(num_classes=2,
                                        embedding_dim=300 + 1024,
                                        hidden_size=300,
                                        num_layers=1,
                                        bidir=True,
                                        dropout1=0.5,
                                        dropout2=0,
                                        dropout3=0.2)
        # Move the model to the GPU if available
        if using_GPU:
            RNNseq_model = RNNseq_model.cuda()
        # Set up criterion for calculating loss
        loss_criterion = nn.NLLLoss()
        # Set up an optimizer for updating the parameters of the RNNseq_model
        rnn_optimizer = optim.Adam(RNNseq_model.parameters(), lr=0.001)
        # Number of epochs (passes through the dataset) to train the model for.
        num_epochs = 10
        '''
        3. 2
        train model
        '''
        train_loss = []
        val_loss = []
        performance_matrix = None
        val_f1 = []
        val_p = []
        val_r = []
        val_acc = []
        train_f1 = []
        # A counter for the number of gradient updates
        num_iter = 0
        model_index = 0
        comparable = []
        for epoch in range(num_epochs):
            # print("Starting epoch {}".format(epoch + 1))
            for (__, example_text, example_lengths,
                 labels) in train_dataloader_trofi:
                example_text = Variable(example_text)
                example_lengths = Variable(example_lengths)
                labels = Variable(labels)
                if using_GPU:
                    example_text = example_text.cuda()
                    example_lengths = example_lengths.cuda()
                    labels = labels.cuda()
                # predicted shape: (batch_size, seq_len, 2)
                predicted = RNNseq_model(example_text, example_lengths)
                batch_loss = loss_criterion(predicted.view(-1, 2),
                                            labels.view(-1))
                rnn_optimizer.zero_grad()
                batch_loss.backward()
                rnn_optimizer.step()
                num_iter += 1
                # Calculate validation and training set loss and accuracy every 200 gradient updates
                if num_iter % 200 == 0:
                    avg_eval_loss, performance_matrix = evaluate(
                        idx2pos, val_dataloader_trofi, RNNseq_model,
                        loss_criterion, using_GPU)
                    val_loss.append(avg_eval_loss)
                    val_p.append(performance_matrix[1][0])
                    val_r.append(performance_matrix[1][1])
                    val_f1.append(performance_matrix[1][2])
                    val_acc.append(performance_matrix[1][3])
                    # print("Iteration {}. Validation Loss {}.".format(num_iter, avg_eval_loss))
    #                 avg_eval_loss, performance_matrix = evaluate(idx2pos, train_dataloader_trofi, RNNseq_model,
    #                                                              loss_criterion, using_GPU)
    #                 train_loss.append(avg_eval_loss)
    #                 train_f1.append(performance_matrix[1][1])
    #                 print("Iteration {}. Training Loss {}.".format(num_iter, avg_eval_loss))
    #     print("Training done for fold {}".format(i))
        """
        3.3
        plot the training process: MET F1 and losses for validation and training dataset
        """
        #     plt.figure(0)
        #     plt.title('F1 for TroFI dataset on fold ' + str(i))
        #     plt.xlabel('iteration (unit:200)')
        #     plt.ylabel('F1')
        #     plt.plot(val_f1, 'g')
        #     #     plt.plot(train_f1, 'b')
        #     plt.legend(['Validation F1', 'Training F1'], loc='upper right')
        #     plt.show()

        #     plt.figure(1)
        #     plt.title('Loss for TroFi dataset on fold ' + str(i))
        #     plt.xlabel('iteration (unit:200)')
        #     plt.ylabel('Loss')
        #     plt.plot(val_loss, 'g')
        #     #     plt.plot(train_loss, 'b')
        #     plt.legend(['Validation loss', 'Training loss'], loc='upper right')
        #     plt.show()
        """
        store the best f1
        """
        # print('val_f1: ', val_f1)
        idx = 0
        if math.isnan(max(val_f1)):
            optimal_f1s.append(max(val_f1[6:]))
            idx = val_f1.index(optimal_f1s[-1])
            optimal_ps.append(val_p[idx])
            optimal_rs.append(val_r[idx])
            optimal_accs.append(val_acc[idx])
        else:
            optimal_f1s.append(max(val_f1))
            idx = val_f1.index(optimal_f1s[-1])
            optimal_ps.append(val_p[idx])
            optimal_rs.append(val_r[idx])
            optimal_accs.append(val_acc[idx])
    """
    print out the performance
    plot the performance on each fold
    """
    # print('F1 on TroFi by 10-fold = ', optimal_f1s)
    # print('Precision on TroFi = ', np.mean(np.array(optimal_ps)))
    # print('Recall on TroFi = ', np.mean(np.array(optimal_rs)))
    # print('F1 on TroFi = ', np.mean(np.array(optimal_f1s)))
    # print('Accuracy on TroFi = ', np.mean(np.array(optimal_accs)))
    return optimal_f1s, np.mean(np.array(optimal_ps)), np.mean(
        np.array(optimal_rs)), np.mean(np.array(optimal_f1s)), np.mean(
            np.array(optimal_accs))