def main():
    # Use a GPU if available, as it should be faster.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Using device: " + str(device))

    # Load the training dataset, and create a data loader to generate a batch.
    textField = PreProcessing.text_field
    labelField = data.Field(sequential=False)

    train, dev = IMDB.splits(textField, labelField, train="train", validation="dev")

    textField.build_vocab(train, dev, vectors=GloVe(name="6B", dim=50))
    labelField.build_vocab(train, dev)

    trainLoader, testLoader = data.BucketIterator.splits((train, dev), shuffle=True, batch_size=64,
                                                         sort_key=lambda x: len(x.text), sort_within_batch=True)

    net = Network().to(device)
    criterion = lossFunc()
    optimiser = topti.Adam(net.parameters(), lr=0.001)  # Minimise the loss using the Adam algorithm.

    for epoch in range(10):
        running_loss = 0

        for i, batch in enumerate(trainLoader):
            # Get a batch and potentially send it to GPU memory.
            inputs, length, labels = textField.vocab.vectors[batch.text[0]].to(device), \
                batch.text[1].to(device), batch.label.type(torch.FloatTensor).to(device)

            labels -= 1

            # PyTorch calculates gradients by accumulating contributions to them (useful for RNNs).
            # Hence we must manually set them to zero before calculating them.
            optimiser.zero_grad()

            # Forward pass through the network.
            output = net(inputs, length)

            loss = criterion(output, labels)

            # Calculate gradients.
            loss.backward()

            # Minimise the loss according to the gradient.
            optimiser.step()

            running_loss += loss.item()

            if i % 32 == 31:
                print("Epoch: %2d, Batch: %4d, Loss: %.3f" % (epoch + 1, i + 1, running_loss / 32))
                running_loss = 0

    num_correct = 0

    # Save model.
    torch.save(net.state_dict(), "./model.pth")
    print("Saved model")

    # Evaluate network on the test dataset. We aren't calculating gradients, so disable autograd to speed up
    # computations and reduce memory usage.
    with torch.no_grad():
        for batch in testLoader:
            # Get a batch and potentially send it to GPU memory.
            inputs, length, labels = textField.vocab.vectors[batch.text[0]].to(device), \
                batch.text[1].to(device), batch.label.type(torch.FloatTensor).to(device)

            labels -= 1

            # Get predictions: apply a sigmoid to the raw logits, then round to 0/1.
            outputs = torch.sigmoid(net(inputs, length))
            predicted = torch.round(outputs)

            num_correct += torch.sum(labels == predicted).item()

    accuracy = 100 * num_correct / len(dev)

    print(f"Classification accuracy: {accuracy}")
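# main() above relies on `Network` and `lossFunc`, which appear to be defined elsewhere in this
# repository. The sketch below is one plausible reading, not the original code: the layer sizes,
# the packed-sequence handling, and the single-logit output are assumptions, inferred only from
# the 50-dimensional GloVe inputs and the sigmoid applied to the raw output at evaluation time.

import torch
import torch.nn as tnn


class Network(tnn.Module):
    """Hypothetical LSTM classifier over pre-looked-up 50-d GloVe vectors."""

    def __init__(self):
        super().__init__()
        self.lstm = tnn.LSTM(input_size=50, hidden_size=100, batch_first=True)
        self.fc = tnn.Linear(100, 1)

    def forward(self, input, length):
        # Pack the padded batch so the LSTM skips padding positions. Batches from the
        # BucketIterator are already sorted by length (sort_within_batch=True), so the
        # default enforce_sorted=True is satisfied.
        packed = tnn.utils.rnn.pack_padded_sequence(input, length.cpu(), batch_first=True)
        _, (hidden, _) = self.lstm(packed)
        # Use the final hidden state as the sequence representation; output shape is [batch].
        return self.fc(hidden[-1]).squeeze(1)


def lossFunc():
    # Binary cross-entropy on raw logits, matching the sigmoid applied only at evaluation time.
    return tnn.BCEWithLogitsLoss()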
         'the', 'a', 'and', 'this', 'that', 'of', 'to', 'in', 'was', 'as', 'with', 'as', 'it', 'for', 'but',
         'on', 'you', 'he', 'his', '']

    # For each token that is not an HTML formatting token, strip noise and uninformative
    # substrings, then keep the token unless it appears in the prune list.
    for word in x:
        if word not in html_format:
            for n in noise:
                word = word.replace(n, '')
            for u in uninformatives:
                word = word.replace(u, '')
            if word not in prune:
                c.append(word)

    return c


text_field = data.Field(lower=True, include_lengths=True, batch_first=True, preprocessing=pre)

textField = PreProcessing.text_field
labelField = data.Field(sequential=False)

train = IMDB.splits(textField, labelField, train="train")[0]

# Count the most frequent tokens remaining in the training set after preprocessing.
freq = Counter()
for example in train.examples:
    freq.update(example.text)
print(freq.most_common(10))

# 83.7868%
# uninformatives = ['the', 'a', 'and', 'this', 'that', 'of', 'to', 'in', 'was', 'as', 'with', 'as', 'it', 'for', 'but',
#                   'on', 'you', 'he']
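# In the legacy torchtext API used here, the `preprocessing` callable passed to data.Field
# receives the list of tokens produced by the tokenizer (after lowercasing) and must return a
# filtered list of tokens. The self-contained sketch below illustrates that contract;
# `drop_common` and its stop-word set are hypothetical examples, not the `pre` function above.

from torchtext import data


def drop_common(tokens):
    stop_words = {'the', 'a', 'and', 'this', 'that', 'of', 'to', 'in'}
    return [t for t in tokens if t not in stop_words]


demo_field = data.Field(lower=True, include_lengths=True, batch_first=True,
                        preprocessing=drop_common)

# The filter runs on tokenised text, e.g.
# demo_field.preprocess("The movie was a delight") -> ['movie', 'was', 'delight']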
def main():
    # Use a GPU if available, as it should be faster.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Using device: " + str(device))

    # Load the training dataset, and create a data loader to generate a batch.
    textField = data.Field(lower=True, include_lengths=True, batch_first=True)
    labelField = data.Field(sequential=False)

    from imdb_dataloader import IMDB
    train, dev = IMDB.splits(textField, labelField, train="train", validation="dev")

    textField.build_vocab(train, dev, vectors=GloVe(name="6B", dim=50))
    labelField.build_vocab(train, dev)

    trainLoader, testLoader = data.BucketIterator.splits((train, dev), shuffle=True, batch_size=64,
                                                         sort_key=lambda x: len(x.text), sort_within_batch=True)

    # Create an instance of the network in memory (potentially GPU memory).
    # Can change to NetworkCnn during development.
    net = NetworkLstm().to(device)

    criterion = lossFunc()
    optimiser = topti.Adam(net.parameters(), lr=0.001)  # Minimise the loss using the Adam algorithm.

    for epoch in range(10):
        running_loss = 0

        for i, batch in enumerate(trainLoader):
            # Get a batch and potentially send it to GPU memory.
            inputs, length, labels = textField.vocab.vectors[batch.text[0]].to(device), \
                batch.text[1].to(device), batch.label.type(torch.FloatTensor).to(device)

            labels -= 1

            # PyTorch calculates gradients by accumulating contributions to them (useful for RNNs).
            # Hence we must manually set them to zero before calculating them.
            optimiser.zero_grad()

            # Forward pass through the network.
            output = net(inputs, length)

            loss = criterion(output, labels)

            # Calculate gradients.
            loss.backward()

            # Minimise the loss according to the gradient.
            optimiser.step()

            running_loss += loss.item()

            if i % 32 == 31:
                print("Epoch: %2d, Batch: %4d, Loss: %.3f" % (epoch + 1, i + 1, running_loss / 32))
                running_loss = 0

    true_pos, true_neg, false_pos, false_neg = 0, 0, 0, 0

    # Evaluate network on the test dataset. We aren't calculating gradients, so disable autograd to speed up
    # computations and reduce memory usage.
    with torch.no_grad():
        net.eval()
        for batch in testLoader:
            # Get a batch and potentially send it to GPU memory.
            inputs, length, labels = textField.vocab.vectors[batch.text[0]].to(device), \
                batch.text[1].to(device), batch.label.type(torch.FloatTensor).to(device)

            labels -= 1

            outputs = net(inputs, length)

            # Accumulate confusion-matrix counts over the whole test set.
            tp_batch, tn_batch, fp_batch, fn_batch = measures(outputs, labels)
            true_pos += tp_batch
            true_neg += tn_batch
            false_pos += fp_batch
            false_neg += fn_batch

    accuracy = 100 * (true_pos + true_neg) / len(dev)
    matthews = MCC(true_pos, true_neg, false_pos, false_neg)

    print("Classification accuracy: %.2f%%\n"
          "Matthews Correlation Coefficient: %.2f" % (accuracy, matthews))
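# main() above relies on `measures` and `MCC`, which appear to be defined elsewhere in this
# repository. The sketches below are one plausible reading, not the original code: they assume
# `measures` receives raw logits and 0/1 float labels (as produced in the loop above) and returns
# per-batch true/false positive/negative counts, and that `MCC` implements the standard Matthews
# correlation coefficient formula.

import math
import torch


def measures(outputs, labels):
    predicted = torch.round(torch.sigmoid(outputs))  # logits -> {0, 1} predictions
    tp = torch.sum((predicted == 1) & (labels == 1)).item()
    tn = torch.sum((predicted == 0) & (labels == 0)).item()
    fp = torch.sum((predicted == 1) & (labels == 0)).item()
    fn = torch.sum((predicted == 0) & (labels == 1)).item()
    return tp, tn, fp, fn


def MCC(tp, tn, fp, fn):
    # MCC = (TP*TN - FP*FN) / sqrt((TP+FP)(TP+FN)(TN+FP)(TN+FN)), defined as 0 when the
    # denominator vanishes.
    denom = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    return (tp * tn - fp * fn) / denom if denom > 0 else 0.0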
from imdb_dataloader import IMDB

import numpy as np
import torch
import torch.nn as tnn
import torch.optim as topti
from torchtext import data
from torchtext.vocab import GloVe

textField = data.Field(lower=True, include_lengths=True, batch_first=True)
labelField = data.Field(sequential=False)

train, dev = IMDB.splits(textField, labelField, train="train", validation="dev")
print(textField)

textField.build_vocab(train, dev, vectors=GloVe(name="6B", dim=50))
labelField.build_vocab(train, dev)

trainLoader, testLoader = data.BucketIterator.splits((train, dev), shuffle=True, batch_size=64,
                                                     sort_key=lambda x: len(x.text), sort_within_batch=True)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# j = 0
# for i, batch in enumerate(trainLoader):
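# A minimal sketch of how a batch produced by the BucketIterator above can be unpacked. It
# assumes the legacy torchtext API, where `include_lengths=True` makes `batch.text` a
# (token_indices, lengths) pair, and uses the GloVe vectors loaded into the vocab above.
for i, batch in enumerate(trainLoader):
    tokens, lengths = batch.text                      # indices: [batch, seq_len], lengths: [batch]
    vectors = textField.vocab.vectors[tokens]         # GloVe lookup: [batch, seq_len, 50]
    labels = batch.label.type(torch.FloatTensor) - 1  # label vocab places 'pos'/'neg' at 1 and 2
    print(tokens.shape, vectors.shape, labels.shape)
    break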