Example #1
import json
import os

import torch
from d2l import torch as d2l


def load_pretrained_model(pretrained_model, num_hiddens, ffn_num_hiddens,
                          num_heads, num_layers, dropout, max_len, devices):
    data_dir = d2l.download_extract(pretrained_model)
    # Define an empty vocabulary, then load the predefined vocabulary into it
    vocab = d2l.Vocab()
    with open(os.path.join(data_dir, 'vocab.json')) as f:
        vocab.idx_to_token = json.load(f)
    vocab.token_to_idx = {
        token: idx
        for idx, token in enumerate(vocab.idx_to_token)
    }
    # Use the function's own hyperparameters rather than hardcoded values;
    # all hidden sizes are tied to num_hiddens
    bert = d2l.BERTModel(len(vocab),
                         num_hiddens,
                         norm_shape=[num_hiddens],
                         ffn_num_input=num_hiddens,
                         ffn_num_hiddens=ffn_num_hiddens,
                         num_heads=num_heads,
                         num_layers=num_layers,
                         dropout=dropout,
                         max_len=max_len,
                         key_size=num_hiddens,
                         query_size=num_hiddens,
                         value_size=num_hiddens,
                         hid_in_features=num_hiddens,
                         mlm_in_features=num_hiddens,
                         nsp_in_features=num_hiddens)
    # Load pretrained BERT parameters
    bert.load_state_dict(
        torch.load(os.path.join(data_dir, 'pretrained.params')))
    return bert, vocab
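A plausible call site, mirroring how the d2l book loads its small pretrained BERT. The 'bert.small.torch' hub key and the hyperparameter values below are assumptions for illustration, not part of the snippet above:

devices = d2l.try_all_gpus()
# 'bert.small.torch' must already be registered in d2l.DATA_HUB (assumed here)
bert, vocab = load_pretrained_model(
    'bert.small.torch', num_hiddens=256, ffn_num_hiddens=512, num_heads=4,
    num_layers=2, dropout=0.1, max_len=512, devices=devices)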
Example #2
from torch.utils.data import DataLoader
from d2l import torch as d2l


def load_data_wiki(batch_size, max_len):
    """Load the WikiText-2 dataset and return (train iterator, vocab)."""
    num_workers = d2l.get_dataloader_workers()
    data_dir = d2l.download_extract('wikitext-2', 'wikitext-2')
    # _read_wiki and _WikiTextDataset are companion definitions from the
    # same chapter (not shown in this snippet)
    paragraphs = _read_wiki(data_dir)
    train_set = _WikiTextDataset(paragraphs, max_len)
    train_iter = DataLoader(train_set,
                            batch_size,
                            shuffle=True,
                            num_workers=num_workers)
    return train_iter, train_set.vocab
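A minimal usage sketch; the seven-tensor batch layout assumes _WikiTextDataset follows the d2l chapter's definition:

batch_size, max_len = 512, 64
train_iter, vocab = load_data_wiki(batch_size, max_len)
for (tokens_X, segments_X, valid_lens_x, pred_positions_X, mlm_weights_X,
     mlm_Y, nsp_y) in train_iter:
    print(tokens_X.shape)  # expected: (batch_size, max_len)
    break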
Example #3
def _load_embedding(self, embedding_name):
    # Method of a TokenEmbedding-style class: index 0 is reserved for '<unk>'
    idx_to_token, idx_to_vec = ['<unk>'], []
    data_dir = d2l.download_extract(embedding_name)
    with open(os.path.join(data_dir, 'vec.txt'), 'r') as f:
        for line in f:
            elems = line.rstrip().split(' ')
            token, elems = elems[0], [float(elem) for elem in elems[1:]]
            # Skip header information, such as the top row in fastText files
            if len(elems) > 1:
                idx_to_token.append(token)
                idx_to_vec.append(elems)
    # Prepend the all-zero vector for '<unk>'
    idx_to_vec = [[0] * len(idx_to_vec[0])] + idx_to_vec
    return idx_to_token, torch.tensor(idx_to_vec)
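In the d2l book this method backs the TokenEmbedding class; a typical use, assuming the 'glove.6b.50d' key is registered in d2l.DATA_HUB (a sketch, version-dependent):

glove_6b50d = d2l.TokenEmbedding('glove.6b.50d')
print(len(glove_6b50d))  # vocabulary size, '<unk>' included
print(glove_6b50d.token_to_idx['beautiful'])  # index of a known token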
Example #4
def _load_embedding(self, embedding_name):
    # Same TokenEmbedding method as in Example #3, with source links added
    idx_to_token, idx_to_vec = ['<unk>'], []
    data_dir = d2l.download_extract(embedding_name)
    # GloVe website: https://nlp.stanford.edu/projects/glove/
    # fastText website: https://fasttext.cc/
    with open(os.path.join(data_dir, 'vec.txt'), 'r') as f:
        for line in f:
            elems = line.rstrip().split(' ')
            token, elems = elems[0], [float(elem) for elem in elems[1:]]
            if len(elems) > 1:
                idx_to_token.append(token)
                idx_to_vec.append(elems)
    # Prepend the all-zero vector for '<unk>'
    idx_to_vec = [[0] * len(idx_to_vec[0])] + idx_to_vec
    return idx_to_token, torch.tensor(idx_to_vec)
Example #5
def load_data_snli(batch_size, num_steps=50):
    """Download the SNLI dataset and return data iterators and vocabulary."""
    num_workers = d2l.get_dataloader_workers()
    data_dir = d2l.download_extract('SNLI')
    train_data = read_snli(data_dir, True)
    test_data = read_snli(data_dir, False)
    train_set = SNLIDataset(train_data, num_steps)
    test_set = SNLIDataset(test_data, num_steps, train_set.vocab)
    train_iter = torch.utils.data.DataLoader(train_set,
                                             batch_size,
                                             shuffle=True,
                                             num_workers=num_workers)
    test_iter = torch.utils.data.DataLoader(test_set,
                                            batch_size,
                                            shuffle=False,
                                            num_workers=num_workers)
    return train_iter, test_iter, train_set.vocab
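A typical call, following the book's walkthrough (batch shapes depend on SNLIDataset, which is defined elsewhere in the chapter):

train_iter, test_iter, vocab = load_data_snli(128, 50)
for X, Y in train_iter:
    print(X[0].shape, X[1].shape, Y.shape)  # premises, hypotheses, labels
    break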
Example #6
def load_data_imdb(batch_size, num_steps=500):
    """Load the IMDb review dataset and return data iterators and vocab."""
    data_dir = d2l.download_extract('aclImdb', 'aclImdb')
    train_data = read_imdb(data_dir, True)
    test_data = read_imdb(data_dir, False)
    train_tokens = d2l.tokenize(train_data[0], token='word')
    test_tokens = d2l.tokenize(test_data[0], token='word')
    vocab = d2l.Vocab(train_tokens, min_freq=5)
    train_features = torch.tensor([d2l.truncate_pad(
        vocab[line], num_steps, vocab['<pad>']) for line in train_tokens])
    test_features = torch.tensor([d2l.truncate_pad(
        vocab[line], num_steps, vocab['<pad>']) for line in test_tokens])
    train_iter = d2l.load_array((train_features, torch.tensor(train_data[1])),
                                batch_size)
    test_iter = d2l.load_array((test_features, torch.tensor(test_data[1])),
                               batch_size,
                               is_train=False)
    return train_iter, test_iter, vocab
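Usage sketch (64 is the batch size the book uses in its sentiment-analysis chapter; read_imdb must be in scope):

train_iter, test_iter, vocab = load_data_imdb(64)
print('vocab size:', len(vocab))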
Example #7
import torch
import torchvision
from torch.utils.data import DataLoader
from d2l import torch as d2l


def get_pokemon_dataset() -> DataLoader:
    # Register the dataset with d2l's download hub, then fetch and extract it
    d2l.DATA_HUB['pokemon'] = (d2l.DATA_URL + 'pokemon.zip',
                               'c065c0e2593b8b161a2d7873e42418bf6a21106c')
    data_dir = d2l.download_extract('pokemon')
    pokemon = torchvision.datasets.ImageFolder(data_dir)

    batch_size = 256
    transformer = torchvision.transforms.Compose([
        torchvision.transforms.Resize((64, 64)),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(0.5, 0.5)
    ])
    pokemon.transform = transformer

    data_loader = torch.utils.data.DataLoader(pokemon,
                                              batch_size=batch_size,
                                              shuffle=True,
                                              num_workers=2)
    return data_loader
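Quick sanity check of the loader; each batch should hold 256 images of shape (3, 64, 64), scaled to roughly [-1, 1] by the Normalize transform above:

data_loader = get_pokemon_dataset()
images, labels = next(iter(data_loader))
print(images.shape, images.min().item(), images.max().item())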
Example #8
def read_data_nmt():
    """Load the English-French dataset."""
    data_dir = d2l.download_extract('fra-eng')
    with open(os.path.join(data_dir, 'fra.txt'), 'r', encoding='utf-8') as f:
        return f.read()
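A sketch of the usual sanity check; the first lines of fra.txt are short English-French pairs:

raw_text = read_data_nmt()
print(raw_text[:75])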
Example #9
#%%
from d2l import torch as d2l
import torch
from torch import nn
import os
import re

#@save
d2l.DATA_HUB['SNLI'] = ('https://nlp.stanford.edu/projects/snli/snli_1.0.zip',
                        '9fcde07509c7e87ec61c640c1b2753d9041758e4')

data_dir = d2l.download_extract('SNLI')


#%%
#@save
def read_snli(data_dir, is_train):
    """Read the SNLI dataset into premises, hypotheses, and labels."""
    def extract_text(s):
        # Remove parentheses (parse-tree markup that we do not use)
        s = re.sub(r'\(', '', s)
        s = re.sub(r'\)', '', s)
        # Substitute two or more consecutive whitespace characters with a space
        s = re.sub(r'\s{2,}', ' ', s)
        return s.strip()

    label_set = {'entailment': 0, 'contradiction': 1, 'neutral': 2}
    file_name = os.path.join(
        data_dir, 'snli_1.0_train.txt' if is_train else 'snli_1.0_test.txt')
    with open(file_name, 'r') as f:
        rows = [row.split('\t') for row in f.readlines()[1:]]
    premises = [extract_text(row[1]) for row in rows if row[0] in label_set]
    hypotheses = [extract_text(row[2]) for row in rows if row[0] in label_set]
    labels = [label_set[row[0]] for row in rows if row[0] in label_set]
    return premises, hypotheses, labels
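The book then checks the reader on the first few examples (a sketch; exact strings depend on the downloaded data):

train_data = read_snli(data_dir, is_train=True)
for x0, x1, y in zip(train_data[0][:3], train_data[1][:3], train_data[2][:3]):
    print('premise:', x0)
    print('hypothesis:', x1)
    print('label:', y)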
Example #10
import os

import matplotlib.cm as cm
import torchvision
from d2l import torch as d2l

VOC_COLORMAP = [[0, 0, 0], [128, 0, 0], [0, 128, 0], [128, 128, 0],
                [0, 0, 128], [128, 0, 128], [0, 128, 128], [128, 128, 128],
                [64, 0, 0], [192, 0, 0], [64, 128, 0], [192, 128, 0],
                [64, 0, 128], [192, 0, 128], [64, 128, 128], [192, 128, 128],
                [0, 64, 0], [128, 64, 0], [0, 192, 0], [128, 192, 0],
                [0, 64, 128]]
VOC_CLASSES = ['background', 'aeroplane', 'bicycle', 'bird', 'boat',
               'bottle', 'bus', 'car', 'cat', 'chair', 'cow',
               'diningtable', 'dog', 'horse', 'motorbike', 'person',
               'potted plant', 'sheep', 'sofa', 'train', 'tv/monitor']

voc_dir = d2l.download_extract('voc2012', 'VOCdevkit/VOC2012')


def read_voc_images(_voc_dir, is_train=False):
    """Read all VOC feature and label images."""
    txt_fname = os.path.join(_voc_dir, 'ImageSets', 'Segmentation',
                             'train.txt' if is_train else 'val.txt')
    mode = torchvision.io.image.ImageReadMode.RGB
    with open(txt_fname, 'r') as f:
        images = f.read().split()
    features, labels = [], []
    for i, fname in enumerate(images):
        features.append(torchvision.io.read_image(os.path.join(
            _voc_dir, 'JPEGImages', f'{fname}.jpg')))
        labels.append(torchvision.io.read_image(os.path.join(
            _voc_dir, 'SegmentationClass', f'{fname}.png'), mode))
    return features, labels
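Reading the training split; each feature is a uint8 tensor shaped (3, H, W), and each label image encodes classes via the VOC_COLORMAP palette above:

train_features, train_labels = read_voc_images(voc_dir, is_train=True)
print(train_features[0].shape, train_labels[0].shape)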
Example #11
from d2l import torch as d2l
import torch
import torchvision
from torch import nn
import os


#@save
d2l.DATA_HUB['dog_tiny'] = (d2l.DATA_URL + 'kaggle_dog_tiny.zip',
                            '0cb91d09b814ecdc07b50f31f8dcad3e81d6a86d')

# If you use the full dataset downloaded for the Kaggle competition, change
# the variable below to False
demo = True
if demo:
    data_dir = d2l.download_extract('dog_tiny')
else:
    data_dir = os.path.join('..', 'data', 'dog-breed-identification')


def reorg_dog_data(data_dir, valid_ratio):
    labels = d2l.read_csv_labels(os.path.join(data_dir, 'labels.csv'))
    d2l.reorg_train_valid(data_dir, labels, valid_ratio)
    d2l.reorg_test(data_dir)


batch_size = 4 if demo else 128
valid_ratio = 0.1
reorg_dog_data(data_dir, valid_ratio)
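After reorg_dog_data runs, d2l's helpers place copies of the images under train_valid_test/{train, valid, train_valid, test}; a minimal way to pick up the training split (a sketch, assuming that layout):

train_ds = torchvision.datasets.ImageFolder(
    os.path.join(data_dir, 'train_valid_test', 'train'))
print('# training examples:', len(train_ds))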

Example #12
def read_ptb():
    data_dir = d2l.download_extract('ptb')
    with open(os.path.join(data_dir, 'ptb.train.txt')) as f:
        raw_text = f.read()
    return [line.split() for line in raw_text.split('\n')]
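Usage sketch (the PTB training split contains roughly 42,000 sentences):

sentences = read_ptb()
print(f'# sentences: {len(sentences)}')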
Example #13
#%%
from d2l import torch as d2l
import torch
from torch import nn
import os

#%%
#@save
d2l.DATA_HUB['aclImdb'] = (
    'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz',
    '01ada507287d82875905620988597833ad4e0903')
#%%
data_dir = d2l.download_extract('aclImdb', 'aclImdb')

#%%
def read_imdb(data_dir, is_train):
    """Read the IMDb review dataset text sequences and labels."""
    data, labels = [], []
    for label in ('pos', 'neg'):
        folder_name = os.path.join(data_dir, 'train' if is_train else 'test',
                                   label)
        for file in os.listdir(folder_name):
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', '')
                data.append(review)
                labels.append(1 if label == 'pos' else 0)
    return data, labels
#%%
train_data = read_imdb(data_dir, is_train=True)
print('# trainings:', len(train_data[0]))
for x, y in zip(train_data[0][:3], train_data[1][:3]):
    print('label:', y, 'review:', x[:60])