Example #1
# Note: Network, lossFunc and the PreProcessing class are defined elsewhere in this
# project (PreProcessing appears in Example #2 below).
import torch
import torch.optim as topti

from torchtext import data
from torchtext.vocab import GloVe

from imdb_dataloader import IMDB


def main():
    # Use a GPU if available, as it should be faster.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Using device: " + str(device))

    # Load the training dataset, and create a data loader to generate a batch.
    textField = PreProcessing.text_field
    labelField = data.Field(sequential=False)

    train, dev = IMDB.splits(textField,
                             labelField,
                             train="train",
                             validation="dev")

    textField.build_vocab(train, dev, vectors=GloVe(name="6B", dim=50))
    labelField.build_vocab(train, dev)

    trainLoader, testLoader = data.BucketIterator.splits(
        (train, dev),
        shuffle=True,
        batch_size=64,
        sort_key=lambda x: len(x.text),
        sort_within_batch=True)

    net = Network().to(device)
    criterion = lossFunc()
    optimiser = topti.Adam(
        net.parameters(),
        lr=0.001)  # Minimise the loss using the Adam algorithm.

    for epoch in range(10):
        running_loss = 0

        for i, batch in enumerate(trainLoader):
            # Get a batch and potentially send it to GPU memory.
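            # textField.vocab.vectors maps token indices to their 50-d GloVe embeddings,
            # so the network receives pre-embedded vectors rather than raw indices.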
            inputs, length, labels = textField.vocab.vectors[batch.text[0]].to(
                device), batch.text[1].to(device), batch.label.type(
                    torch.FloatTensor).to(device)

            labels -= 1
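            # (build_vocab reserves index 0 for <unk>, so the sentiment labels arrive as 1
            # and 2; the shift above maps them to 0/1, matching the rounded sigmoid output
            # used at evaluation time.)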

            # PyTorch calculates gradients by accumulating contributions to them (useful for
            # RNNs).  Hence we must manually set them to zero before calculating them.
            optimiser.zero_grad()

            # Forward pass through the network.
            output = net(inputs, length)

            loss = criterion(output, labels)

            # Calculate gradients.
            loss.backward()

            # Minimise the loss according to the gradient.
            optimiser.step()

            running_loss += loss.item()

            if i % 32 == 31:
                print("Epoch: %2d, Batch: %4d, Loss: %.3f" %
                      (epoch + 1, i + 1, running_loss / 32))
                running_loss = 0

    num_correct = 0

    # Save the trained model's weights.
    torch.save(net.state_dict(), "./model.pth")
    print("Saved model")

    # Evaluate the network on the held-out dev split (used here as the test set).  We aren't
    # calculating gradients, so disable autograd to speed up computations and reduce memory usage.
    with torch.no_grad():
        for batch in testLoader:
            # Get a batch and potentially send it to GPU memory.
            inputs, length, labels = textField.vocab.vectors[batch.text[0]].to(
                device), batch.text[1].to(device), batch.label.type(
                    torch.FloatTensor).to(device)

            labels -= 1

            # Get predictions
            outputs = torch.sigmoid(net(inputs, length))
            predicted = torch.round(outputs)

            num_correct += torch.sum(labels == predicted).item()

    accuracy = 100 * num_correct / len(dev)

    print(f"Classification accuracy: {accuracy}")
Example #2
# NOTE: the beginning of this example is truncated.  The fragment below is the tail of a
# token-filtering pre(x) hook inside the PreProcessing class; the word list appears to be
# the `prune` list tested further down, so it is labelled as such here.
        prune = [
            'the', 'a', 'and', 'this', 'that', 'of', 'to', 'in', 'was', 'as',
            'with', 'as', 'it', 'for', 'but', 'on', 'you', 'he', 'his', ''
        ]
        for word in x:
            if word not in html_format:
                for n in noise:
                    word = word.replace(n, '')
                for u in uninformatives:
                    word = word.replace(u, '')
                if word not in prune:
                    c.append(word)
        return c

    text_field = data.Field(lower=True,
                            include_lengths=True,
                            batch_first=True,
                            preprocessing=pre)
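
# torchtext applies the `preprocessing` hook to each example's token list when the dataset
# is built, so pre(x) receives the lowercased tokens and must return the filtered list.
# A quick way to exercise the hook on its own (the sample sentence is made up; html_format,
# noise and uninformatives come from the truncated part above):
sample = "this movie was <br /> surprisingly good".split()
print(PreProcessing.pre(sample))  # prints the tokens that survive the filtering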


from collections import Counter

textField = PreProcessing.text_field
labelField = data.Field(sequential=False)
train = IMDB.splits(textField, labelField, train="train")[0]
freq = Counter()
for example in train.examples:
    freq.update(example.text)
print(freq.most_common(10))

# 83.7868%
# uninformatives = ['the', 'a', 'and', 'this', 'that', 'of', 'to', 'in', 'was', 'as', 'with', 'as', 'it', 'for', 'but',
#                   'on', 'you', 'he']
Example #3
# Note: NetworkLstm, NetworkCnn, lossFunc, measures and MCC are defined elsewhere in this project.
import torch
import torch.optim as topti

from torchtext import data
from torchtext.vocab import GloVe


def main():
    # Use a GPU if available, as it should be faster.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Using device: " + str(device))

    # Load the training dataset, and create a data loader to generate a batch.
    textField = data.Field(lower=True, include_lengths=True, batch_first=True)
    labelField = data.Field(sequential=False)

    from imdb_dataloader import IMDB
    train, dev = IMDB.splits(textField,
                             labelField,
                             train="train",
                             validation="dev")

    textField.build_vocab(train, dev, vectors=GloVe(name="6B", dim=50))
    labelField.build_vocab(train, dev)

    trainLoader, testLoader = data.BucketIterator.splits(
        (train, dev),
        shuffle=True,
        batch_size=64,
        sort_key=lambda x: len(x.text),
        sort_within_batch=True)

    # Create an instance of the network in memory (potentially GPU memory). Can change to NetworkCnn during development.
    net = NetworkLstm().to(device)

    criterion = lossFunc()
    optimiser = topti.Adam(
        net.parameters(),
        lr=0.001)  # Minimise the loss using the Adam algorithm.

    for epoch in range(10):
        running_loss = 0

        for i, batch in enumerate(trainLoader):
            # Get a batch and potentially send it to GPU memory.
            inputs, length, labels = textField.vocab.vectors[batch.text[0]].to(
                device), batch.text[1].to(device), batch.label.type(
                    torch.FloatTensor).to(device)

            labels -= 1

            # PyTorch calculates gradients by accumulating contributions to them (useful for
            # RNNs).  Hence we must manually set them to zero before calculating them.
            optimiser.zero_grad()

            # Forward pass through the network.
            #print(inputs)
            output = net(inputs, length)

            loss = criterion(output, labels)

            # Calculate gradients.
            loss.backward()

            # Minimise the loss according to the gradient.
            optimiser.step()

            running_loss += loss.item()

            if i % 32 == 31:
                print("Epoch: %2d, Batch: %4d, Loss: %.3f" %
                      (epoch + 1, i + 1, running_loss / 32))
                running_loss = 0

    true_pos, true_neg, false_pos, false_neg = 0, 0, 0, 0

    # Evaluate the network on the held-out dev split (used here as the test set).  We aren't
    # calculating gradients, so disable autograd to speed up computations and reduce memory usage.
    with torch.no_grad():
        net.eval()
        for batch in testLoader:
            # Get a batch and potentially send it to GPU memory.
            inputs, length, labels = textField.vocab.vectors[batch.text[0]].to(
                device), batch.text[1].to(device), batch.label.type(
                    torch.FloatTensor).to(device)

            labels -= 1

            outputs = net(inputs, length)
            tp_batch, tn_batch, fp_batch, fn_batch = measures(outputs, labels)

            true_pos += tp_batch
            true_neg += tn_batch
            false_pos += fp_batch
            false_neg += fn_batch

    accuracy = 100 * (true_pos + true_neg) / len(dev)
    matthews = MCC(true_pos, true_neg, false_pos, false_neg)

    print("Classification accuracy: %.2f%%\n"
          "Matthews Correlation Coefficient: %.2f" % (accuracy, matthews))
Example #4
from imdb_dataloader import IMDB
import numpy as np

import torch
import torch.nn as tnn
import torch.optim as topti

from torchtext import data
from torchtext.vocab import GloVe

textField = data.Field(lower=True, include_lengths=True, batch_first=True)
labelField = data.Field(sequential=False)
train, dev = IMDB.splits(textField,
                         labelField,
                         train="train",
                         validation="dev")
print(textField)

textField.build_vocab(train, dev, vectors=GloVe(name="6B", dim=50))
labelField.build_vocab(train, dev)

trainLoader, testLoader = data.BucketIterator.splits(
    (train, dev),
    shuffle=True,
    batch_size=64,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# j = 0
# for i, batch in enumerate(trainLoader):
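
# The truncated loop above looks like a quick batch-inspection pass; a minimal sketch of
# what such a check might look like (the printed shapes are an assumption about its intent):
for i, batch in enumerate(trainLoader):
    indices, lengths = batch.text               # padded token-index matrix and per-example lengths
    vectors = textField.vocab.vectors[indices]  # look up the 50-d GloVe vectors
    print(indices.shape, lengths.shape, vectors.shape, batch.label.shape)
    if i == 0:
        break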