Example #1
def negated_and_rare(model, isLogLin=True):

    if isLogLin:
        data_man = DataManager(batch_size=BATCH_SIZE)
        data_iter = data_man.get_torch_iterator(data_subset=TEST)
        print("Log linear test on negated words: ")

    else:
        data_man = DataManager(data_type=W2V_SEQUENCE,
                               batch_size=BATCH_SIZE,
                               embedding_dim=W2V_EMBEDDING_DIM)
        data_iter = data_man.get_torch_iterator(data_subset=TEST)
        print("W2V test with on negated words: ")

    sent_list = data_iter.dataset.data
    preds = get_predictions_for_data(model, data_iter)
    # indices of the negated-polarity and rare-word examples in the test set
    neg_indices = data_loader.get_negated_polarity_examples(sent_list)
    rare_indices = data_loader.get_rare_words_examples(
        sent_list, data_loader.SentimentTreeBank())
    # restrict predictions and labels to each special subset
    rare_preds = np.take(preds, rare_indices)
    neg_preds = np.take(preds, neg_indices)
    rare_labels = np.take(get_iter_labels(data_iter), rare_indices)
    neg_labels = np.take(get_iter_labels(data_iter), neg_indices)
    neg_acc = binary_accuracy(neg_preds, neg_labels)
    rare_acc = binary_accuracy(rare_preds, rare_labels)
    print("Negated words accuracy: ", neg_acc)
    print("Rare words accuracy: ", rare_acc)
Example #2
File: ex3.py Project: Roi262/NLP-EX3
def train_model1(model, data_manager, n_epochs, lr, weight_decay=0.):
    """
    Runs the full training procedure for the given model. The optimization should be done using the Adam
    optimizer with all parameters but learning rate and weight decay set to default.
    :param model: module of one of the models implemented in the exercise
    :param data_manager: the DataManager object
    :param n_epochs: number of times to go over the whole training set
    :param lr: learning rate to be used for optimization
    :param weight_decay: parameter for l2 regularization
    """
    optimizer = torch.optim.Adam(params=model.parameters(),
                                 lr=lr,
                                 weight_decay=weight_decay)
    criterion = torch.nn.BCEWithLogitsLoss()
    tstloss = np.zeros(n_epochs)
    tstacc = np.zeros(n_epochs)
    tst1acc = np.zeros(n_epochs)
    tst2acc = np.zeros(n_epochs)

    # build the negated-polarity and rare-words test subsets
    tst_data = np.array(data_manager.sentences[TEST])
    labels = np.array(data_manager.get_labels(TEST))
    tst1_indexes = data_loader.get_negated_polarity_examples(tst_data)
    tst2_indexes = data_loader.get_rare_words_examples(
        tst_data, data_manager.sentiment_dataset)
    tst1_data = tst_data[tst1_indexes]
    tst2_data = tst_data[tst2_indexes]
    word2vec = load_pickle("w2v_dict.pkl")

    tst1_data = [
        sentence_to_embedding(sent, word2vec, SEQ_LEN) for sent in tst1_data
    ]
    tst2_data = [
        sentence_to_embedding(sent, word2vec, SEQ_LEN) for sent in tst2_data
    ]
    tst1_labels = labels[tst1_indexes]
    tst2_labels = labels[tst2_indexes]

    # wrap each special subset as single-example (input, label) pairs; the
    # tensors are moved to the GPU, so this assumes CUDA is available
    tst1_iterator = [[
        torch.FloatTensor([tst1_data[i]]).double().cuda(),
        torch.FloatTensor([tst1_labels[i]]).double().cuda()
    ] for i in range(len(tst1_indexes))]
    tst2_iterator = [[
        torch.FloatTensor([tst2_data[i]]).double().cuda(),
        torch.FloatTensor([tst2_labels[i]]).double().cuda()
    ] for i in range(len(tst2_indexes))]
    for i in range(n_epochs):
        print("epoch: ", i)
        iterator = data_manager.get_torch_iterator()
        tst_iterator = data_manager.get_torch_iterator(TEST)

        train_epoch(model, iterator, optimizer, criterion)
        tstloss[i], tstacc[i] = evaluate(model, tst_iterator, criterion)
        _, tst1acc[i] = evaluate(model, tst1_iterator, criterion)
        _, tst2acc[i] = evaluate(model, tst2_iterator, criterion)

        print("loss", tstloss[i])
        print("acc", tstacc[i])
    return tstloss, tstacc, tst1acc, tst2acc
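The train_epoch and evaluate helpers called above are not part of the snippet. Below is a plausible sketch, assuming the model returns raw logits, the loss is the BCEWithLogitsLoss used above, and binary_accuracy is the thresholded accuracy from Example #1. The exercise's own versions may differ in signature and return values (Example #5, for instance, passes an epoch index and expects train_epoch to return the epoch loss and accuracy).

import numpy as np
import torch


def train_epoch(model, data_iterator, optimizer, criterion):
    # one full pass over the training data: forward, loss on raw logits,
    # backward, parameter update
    model.train()
    for inputs, labels in data_iterator:
        optimizer.zero_grad()
        logits = model(inputs).squeeze(-1)
        loss = criterion(logits, labels.to(logits.dtype))
        loss.backward()
        optimizer.step()


def evaluate(model, data_iterator, criterion):
    # mean loss and mean accuracy over the iterator, without gradient tracking
    model.eval()
    losses, accuracies = [], []
    with torch.no_grad():
        for inputs, labels in data_iterator:
            logits = model(inputs).squeeze(-1)
            losses.append(criterion(logits, labels.to(logits.dtype)).item())
            preds = torch.sigmoid(logits).cpu().numpy()
            accuracies.append(binary_accuracy(preds, labels.cpu().numpy()))
    return np.mean(losses), np.mean(accuracies)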
Example #3
File: del3.py Project: Roi262/NLP-EX3
def get_special_acc(model, dataset, test_iterator):
    sentences = test_iterator.dataset.data
    y = np.array([])
    for batch in test_iterator:
        y = np.concatenate((y, batch[1].numpy()))
    rare_words_idxs = data_loader.get_rare_words_examples(sentences, dataset)
    negated_polarity_idxs = data_loader.get_negated_polarity_examples(
        sentences)
    preds = get_predictions_for_data(model, test_iterator)
    rare_words_preds = preds[rare_words_idxs]
    negated_polarity_preds = preds[negated_polarity_idxs]
    rare_words_gt = y[rare_words_idxs]
    negated_polarity_gt = y[negated_polarity_idxs]
    rare_test_acc = binary_accuracy(rare_words_preds, rare_words_gt)
    negated_polarity_test_acc = binary_accuracy(negated_polarity_preds,
                                                negated_polarity_gt)
    print("rare words accuracy is {}\nnegated polarity acc is {}".format(
        rare_test_acc, negated_polarity_test_acc))


def special_cases_acc(data_manager, model):
    test_sents = data_manager.sentences[TEST]
    test_batches = list(data_manager.get_torch_iterator(TEST))
    test_vectors = []
    for batch in test_batches:
        for vector in batch[0]:
            test_vectors.append(vector)
    test_labels = data_manager.get_labels(TEST)
    NP_sents_idxs = data_loader.get_negated_polarity_examples(test_sents)
    NP_x = [test_vectors[i].float() for i in NP_sents_idxs]
    NP_y = [test_labels[i] for i in NP_sents_idxs]
    pred = model.predict(torch.stack(NP_x)).detach().numpy()
    NP_acc = binary_accuracy(pred, NP_y)

    RW_sents_idxs = data_loader.get_rare_words_examples(
        test_sents, data_manager.sentiment_dataset)
    RW_x = [test_vectors[i].float() for i in RW_sents_idxs]
    RW_y = [test_labels[i] for i in RW_sents_idxs]
    pred = model.predict(torch.stack(RW_x)).detach().numpy()
    RW_acc = binary_accuracy(pred, RW_y)

    return NP_acc, RW_acc
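get_predictions_for_data appears in Examples #1, #3 and #5 but is never defined in these snippets. Here is a minimal sketch consistent with how Examples #1 and #3 use it, i.e. returning one prediction per example, in iterator order, as a NumPy array; Example #5 appears to treat its return value as an accuracy instead, so the project's real implementation may behave differently.

import numpy as np
import torch


def get_predictions_for_data(model, data_iter):
    # run the model on every batch and flatten the sigmoid outputs into a
    # single array, aligned with the order of the examples in the iterator
    model.eval()
    preds = []
    with torch.no_grad():
        for batch in data_iter:
            logits = model(batch[0].float()).squeeze(-1)
            preds.extend(torch.sigmoid(logits).cpu().numpy().reshape(-1))
    return np.array(preds)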
Example #5
def train_model(model, data_manager, n_epochs, lr, weight_decay=0.):
    """
    Runs the full training procedure for the given model. The optimization should be done using the Adam
    optimizer with all parameters but learning rate and weight decay set to default.
    :param model: module of one of the models implemented in the exercise
    :param data_manager: the DataManager object
    :param n_epochs: number of times to go over the whole training set
    :param lr: learning rate to be used for optimization
    :param weight_decay: parameter for l2 regularization
    """
    train_loader = data_manager.get_torch_iterator(TRAIN)
    validation_loader = data_manager.get_torch_iterator(VAL)
    test_loader = data_manager.get_torch_iterator(TEST)
    optimizer = optim.Adam(model.parameters(),
                           lr=lr,
                           weight_decay=weight_decay)
    criterion = nn.BCEWithLogitsLoss()

    train_losses = []
    train_accuracies = []
    validation_losses = []
    validation_accuracies = []
    for epoch in range(n_epochs):
        print(epoch)
        train_loss, train_accuracy = train_epoch(model, train_loader,
                                                 optimizer, criterion, epoch)
        validation_loss, validation_accuracy = evaluate(
            model, validation_loader, criterion)
        train_losses.append(train_loss)
        train_accuracies.append(train_accuracy)
        validation_losses.append(validation_loss)
        validation_accuracies.append(validation_accuracy)

    test_accuracy = get_predictions_for_data(model, test_loader)
    print("Test set: Accuracy: {:.0f}%".format(test_accuracy))

    test_sentences = data_manager.sentences[TEST]
    test_labels = data_manager.get_labels(TEST)
    negated_indexes = get_negated_polarity_examples(test_sentences)
    rare_words_indexes = get_rare_words_examples(
        test_sentences, data_manager.sentiment_dataset)

    all_word_vectors = []
    for batch in list(data_manager.get_torch_iterator(TEST)):
        for vector in batch[0]:
            all_word_vectors.append(vector)
    negated_inputs = [all_word_vectors[i].float() for i in negated_indexes]
    rare_words_inputs = [
        all_word_vectors[i].float() for i in rare_words_indexes
    ]

    negated_labels = [torch.tensor(test_labels[i]) for i in negated_indexes]
    rare_words_labels = [
        torch.tensor(test_labels[i]) for i in rare_words_indexes
    ]

    negated_data = torch.stack(negated_inputs)
    rare_words_data = torch.stack(rare_words_inputs)
    negated_labels = torch.stack(negated_labels)
    rare_words_labels = torch.stack(rare_words_labels)

    negated_dataset = torch.utils.data.TensorDataset(negated_data,
                                                     negated_labels)
    rare_words_dataset = torch.utils.data.TensorDataset(
        rare_words_data, rare_words_labels)

    negated_test_loader = torch.utils.data.DataLoader(negated_dataset)
    rare_words_test_loader = torch.utils.data.DataLoader(rare_words_dataset)

    negated_test_accuracy = get_predictions_for_data(model,
                                                     negated_test_loader)
    rare_words_test_accuracy = get_predictions_for_data(
        model, rare_words_test_loader)

    print("Negated test set: Accuracy: {:.0f}%".format(negated_test_accuracy))

    print("Rare words test set: Accuracy: {:.0f}%".format(
        rare_words_test_accuracy))

    plt.title("Training Loss Curve (batch_size={}, lr={})".format(
        train_loader.batch_size, lr))
    plt.plot(range(n_epochs), train_losses, label="Train")
    plt.plot(range(n_epochs), validation_losses, label="Validation")
    plt.xlabel("Iterations")
    plt.ylabel("Training Loss")
    plt.legend(loc='best')
    plt.show()

    plt.title("Training Accuracy Curve (batch_size={}, lr={})".format(
        train_loader.batch_size, lr))
    plt.plot(range(n_epochs), train_accuracies, label="Train")
    plt.plot(range(n_epochs), validation_accuracies, label="Validation")
    plt.xlabel("Iterations")
    plt.ylabel("Training Accuracy")
    plt.legend(loc='best')
    plt.show()


class DataManager(object):
    def __init__(self,
                 data_type=ONEHOT_AVERAGE,
                 use_sub_phrases=True,
                 dataset_path="stanfordSentimentTreebank",
                 batch_size=50,
                 embedding_dim=None):
        """
        builds the data manager used for training and evaluation.
        :param data_type: one of ONEHOT_AVERAGE, W2V_AVERAGE and W2V_SEQUENCE
        :param use_sub_phrases: if true, training data will include all sub-phrases plus the full sentences
        :param dataset_path: path to the dataset directory
        :param batch_size: number of examples per batch
        :param embedding_dim: relevant only for the W2V data types.
        """

        # load the dataset
        self.sentiment_dataset = data_loader.SentimentTreeBank(
            dataset_path, split_words=True)
        # map data splits to sentences lists
        self.sentences = {}
        self.sentences[NEGATIVE_POLARITY] = [
            self.sentiment_dataset.sentences[i]
            for i in data_loader.get_negated_polarity_examples(
                self.sentiment_dataset.sentences)
        ]
        self.sentences[RARE] = [
            self.sentiment_dataset.sentences[i]
            for i in data_loader.get_rare_words_examples(
                self.sentiment_dataset.sentences, self.sentiment_dataset, 50)
        ]
        if use_sub_phrases:
            self.sentences[
                TRAIN] = self.sentiment_dataset.get_train_set_phrases()
        else:
            self.sentences[TRAIN] = self.sentiment_dataset.get_train_set()

        self.sentences[VAL] = self.sentiment_dataset.get_validation_set()
        self.sentences[TEST] = self.sentiment_dataset.get_test_set()

        # map data splits to sentence input preparation functions
        words_list = list(self.sentiment_dataset.get_word_counts().keys())
        if data_type == ONEHOT_AVERAGE:
            self.sent_func = average_one_hots
            self.sent_func_kwargs = {
                "word_to_ind": get_word_to_ind(words_list)
            }
        elif data_type == W2V_SEQUENCE:
            self.sent_func = sentence_to_embedding

            self.sent_func_kwargs = {
                "seq_len": SEQ_LEN,
                "word_to_vec": create_or_load_slim_w2v(words_list),
                "embedding_dim": embedding_dim
            }
        elif data_type == W2V_AVERAGE:
            self.sent_func = get_w2v_average
            words_list = list(self.sentiment_dataset.get_word_counts().keys())
            self.sent_func_kwargs = {
                "word_to_vec": create_or_load_slim_w2v(words_list),
                "embedding_dim": embedding_dim
            }
        else:
            raise ValueError("invalid data_type: {}".format(data_type))
        # map data splits to torch datasets and iterators
        self.torch_datasets = {
            k: OnlineDataset(sentences, self.sent_func, self.sent_func_kwargs)
            for k, sentences in self.sentences.items()
        }
        self.torch_iterators = {
            k: DataLoader(dataset, batch_size=batch_size, shuffle=k == TRAIN)
            for k, dataset in self.torch_datasets.items()
        }
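To show how these pieces fit together, here is a hedged end-to-end sketch. The LogLinear model class, its constructor argument, and the hyper-parameter values are assumptions standing in for whatever the exercise actually provides; only DataManager, train_model and special_cases_acc come from the code above.

if __name__ == "__main__":
    # build the one-hot-average data manager and a stand-in log-linear model
    data_manager = DataManager(data_type=ONEHOT_AVERAGE, batch_size=64)
    vocab_size = len(data_manager.sent_func_kwargs["word_to_ind"])
    model = LogLinear(vocab_size)  # hypothetical model class from the exercise

    # train with Adam, then report accuracy on the special test subsets
    train_model(model, data_manager, n_epochs=20, lr=0.01, weight_decay=0.001)
    np_acc, rw_acc = special_cases_acc(data_manager, model)
    print("negated polarity accuracy:", np_acc)
    print("rare words accuracy:", rw_acc)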