def load_data_nmt(batch_size, num_steps, num_examples=600):
    """Return the iterator and the vocabularies of the translation dataset."""
    text = preprocess_nmt(read_data_nmt())
    source, target = tokenize_nmt(text, num_examples)
    src_vocab = d2l.Vocab(source, min_freq=2,
                          reserved_tokens=['<pad>', '<bos>', '<eos>'])
    tgt_vocab = d2l.Vocab(target, min_freq=2,
                          reserved_tokens=['<pad>', '<bos>', '<eos>'])
    src_array, src_valid_len = build_array_nmt(source, src_vocab, num_steps)
    tgt_array, tgt_valid_len = build_array_nmt(target, tgt_vocab, num_steps)
    data_arrays = (src_array, src_valid_len, tgt_array, tgt_valid_len)
    data_iter = d2l.load_array(data_arrays, batch_size)
    return data_iter, src_vocab, tgt_vocab
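A quick smoke test of load_data_nmt, along the lines of the d2l book (this assumes torch and the helper functions used above are importable):

train_iter, src_vocab, tgt_vocab = load_data_nmt(batch_size=2, num_steps=8)
for X, X_valid_len, Y, Y_valid_len in train_iter:
    print('X:', X, '\nvalid lengths for X:', X_valid_len)
    print('Y:', Y, '\nvalid lengths for Y:', Y_valid_len)
    break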
Example #2
class _WikiTextDataset(torch.utils.data.Dataset):
    def __init__(self, paragraphs, max_len):
        # On input, `paragraphs[i]` is a list of sentence strings representing
        # a paragraph; after tokenization, `paragraphs[i]` is a list of
        # sentences, where each sentence is a list of word tokens
        paragraphs = [
            d2l.tokenize(paragraph, token='word') for paragraph in paragraphs
        ]
        sentences = [
            sentence for paragraph in paragraphs for sentence in paragraph
        ]
        self.vocab = d2l.Vocab(
            sentences,
            min_freq=5,
            reserved_tokens=['<pad>', '<mask>', '<cls>', '<sep>'])
        # Get data for the next sentence prediction task
        examples = []
        for paragraph in paragraphs:
            examples.extend(
                _get_nsp_data_from_paragraph(paragraph, paragraphs,
                                             self.vocab, max_len))
        # Get data for the masked language model task
        examples = [(_get_mlm_data_from_tokens(tokens, self.vocab) +
                     (segments, is_next))
                    for tokens, segments, is_next in examples]
        # Pad inputs
        (self.all_token_ids, self.all_segments, self.valid_lens,
         self.all_pred_positions, self.all_mlm_weights, self.all_mlm_labels,
         self.nsp_labels) = _pad_bert_inputs(examples, max_len, self.vocab)
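    # To serve minibatches through a DataLoader, the class also needs
    # `__getitem__` and `__len__`; a minimal sketch over the padded fields
    # built in `__init__` (added here, not part of the original snippet):
    def __getitem__(self, idx):
        return (self.all_token_ids[idx], self.all_segments[idx],
                self.valid_lens[idx], self.all_pred_positions[idx],
                self.all_mlm_weights[idx], self.all_mlm_labels[idx],
                self.nsp_labels[idx])

    def __len__(self):
        return len(self.all_token_ids)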
Example #3
def load_pretrained_model(pretrained_model, num_hiddens, ffn_num_hiddens,
                          num_heads, num_layers, dropout, max_len, devices):
    data_dir = d2l.download_extract(pretrained_model)
    # Define an empty vocabulary to load the predefined vocabulary
    vocab = d2l.Vocab()
    vocab.idx_to_token = json.load(open(os.path.join(data_dir, 'vocab.json')))
    vocab.token_to_idx = {
        token: idx
        for idx, token in enumerate(vocab.idx_to_token)
    }
    bert = d2l.BERTModel(len(vocab),
                         num_hiddens,
                         norm_shape=[256],
                         ffn_num_input=256,
                         ffn_num_hiddens=ffn_num_hiddens,
                         num_heads=4,
                         num_layers=2,
                         dropout=0.2,
                         max_len=max_len,
                         key_size=256,
                         query_size=256,
                         value_size=256,
                         hid_in_features=256,
                         mlm_in_features=256,
                         nsp_in_features=256)
    # Load pretrained BERT parameters
    bert.load_state_dict(
        torch.load(os.path.join(data_dir, 'pretrained.params')))
    return bert, vocab
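A hedged usage sketch: the hyperparameter values and the 'bert.small' DATA_HUB key below are illustrative assumptions, not taken from this listing. Note that num_heads, num_layers, and dropout are accepted by the function but overridden by the hard-coded values inside it.

devices = d2l.try_all_gpus()
bert, vocab = load_pretrained_model(
    'bert.small', num_hiddens=256, ffn_num_hiddens=512, num_heads=4,
    num_layers=2, dropout=0.1, max_len=512, devices=devices)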
Example #4
def load_data_ptb(batch_size, max_window_size, num_noise_words):
    num_workers = d2l.get_dataloader_workers()
    sentences = read_ptb()
    vocab = d2l.Vocab(sentences, min_freq=10)
    subsampled = subsampling(sentences, vocab)
    corpus = [vocab[line] for line in subsampled]
    all_centers, all_contexts = get_centers_and_contexts(
        corpus, max_window_size)
    all_negatives = get_negatives(all_contexts, corpus, num_noise_words)

    class PTBDataset(torch.utils.data.Dataset):
        def __init__(self, centers, contexts, negatives):
            assert len(centers) == len(contexts) == len(negatives)
            self.centers = centers
            self.contexts = contexts
            self.negatives = negatives

        def __getitem__(self, index):
            return (self.centers[index], self.contexts[index],
                    self.negatives[index])

        def __len__(self):
            return len(self.centers)

    dataset = PTBDataset(all_centers, all_contexts, all_negatives)

    data_iter = torch.utils.data.DataLoader(dataset,
                                            batch_size,
                                            shuffle=True,
                                            collate_fn=batchify,
                                            num_workers=num_workers)
    return data_iter, vocab
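A quick shape check, assuming batchify collates each minibatch into (centers, contexts_negatives, masks, labels) as in the d2l word2vec pipeline:

data_iter, vocab = load_data_ptb(512, 5, 5)
names = ['centers', 'contexts_negatives', 'masks', 'labels']
for batch in data_iter:
    for name, data in zip(names, batch):
        print(name, 'shape:', data.shape)
    break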
Example #5
def load_corpus_war_of_the_worlds(max_tokens=-1):
    """Return token indices and the vocabulary of the time machine dataset."""
    lines = read_war_of_the_worlds()
    tokens = d2l.tokenize(lines, 'char')
    vocab = d2l.Vocab(tokens)
    # Since each text line in the War of the Worlds dataset is not necessarily
    # a sentence or a paragraph, flatten all the text lines into a single list
    corpus = [vocab[token] for line in tokens for token in line]
    if max_tokens > 0:
        corpus = corpus[:max_tokens]
    return corpus, vocab
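A minimal check of the corpus and vocabulary sizes (this assumes read_war_of_the_worlds is defined elsewhere in the project):

corpus, vocab = load_corpus_war_of_the_worlds()
print(len(corpus), len(vocab))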
Example #6

class SNLIDataset(torch.utils.data.Dataset):
    def __init__(self, dataset, num_steps, vocab=None):
        self.num_steps = num_steps
        all_premise_tokens = d2l.tokenize(dataset[0])
        all_hypothesis_tokens = d2l.tokenize(dataset[1])
        if vocab is None:
            self.vocab = d2l.Vocab(all_premise_tokens + all_hypothesis_tokens,
                                   min_freq=5,
                                   reserved_tokens=['<pad>'])
        else:
            self.vocab = vocab
        self.premises = self._pad(all_premise_tokens)
        self.hypotheses = self._pad(all_hypothesis_tokens)
        self.labels = torch.tensor(dataset[2])
        print('read ' + str(len(self.premises)) + ' examples')
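    # The `_pad` helper used in `__init__` is not shown in this listing; the
    # sketch below is an assumption: it truncates/pads each token list to
    # `num_steps` indices, and the usual Dataset methods are added alongside.
    def _pad(self, lines):
        return torch.tensor([
            d2l.truncate_pad(self.vocab[line], self.num_steps,
                             self.vocab['<pad>']) for line in lines
        ])

    def __getitem__(self, idx):
        return (self.premises[idx], self.hypotheses[idx]), self.labels[idx]

    def __len__(self):
        return len(self.premises)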
Example #7

def load_data_imdb(batch_size, num_steps=500):
    data_dir = d2l.download_extract('aclImdb', 'aclImdb')
    train_data = read_imdb(data_dir, True)
    test_data = read_imdb(data_dir, False)
    train_tokens = d2l.tokenize(train_data[0], token='word')
    test_tokens = d2l.tokenize(test_data[0], token='word')
    vocab = d2l.Vocab(train_tokens, min_freq=5)
    train_features = torch.tensor([d2l.truncate_pad(
        vocab[line], num_steps, vocab['<pad>']) for line in train_tokens])
    test_features = torch.tensor([d2l.truncate_pad(
        vocab[line], num_steps, vocab['<pad>']) for line in test_tokens])
    train_iter = d2l.load_array((train_features, torch.tensor(train_data[1])),
                                batch_size)
    test_iter = d2l.load_array((test_features, torch.tensor(test_data[1])),
                               batch_size,
                               is_train=False)
    return train_iter, test_iter, vocab
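A hedged usage check; the batch size of 64 is just an example value:

train_iter, test_iter, vocab = load_data_imdb(64)
print('vocab size:', len(vocab))
for X, y in train_iter:
    print('X:', X.shape, 'y:', y.shape)
    break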
Example #8
#@save
def tokenize_nmt(text, num_examples=None):
    """Tokenize the English-French dataset into source and target token lists."""
    source, target = [], []
    for i, line in enumerate(text.split('\n')):
        if num_examples and i > num_examples:
            break
        parts = line.split('\t')
        if len(parts) == 2:
            source.append(parts[0].split(' '))
            target.append(parts[1].split(' '))
    return source, target


text = preprocess_nmt(read_data_nmt())
source, target = tokenize_nmt(text)
print('source:', source[:6], 'target:', target[:6])

d2l.set_figsize()
_, _, patches = d2l.plt.hist(
    [[len(l) for l in source], [len(l) for l in target]],
    label=['source', 'target'])
for patch in patches[1].patches:
    patch.set_hatch('/')
d2l.plt.legend(loc='upper right')

src_vocab = d2l.Vocab(source,
                      min_freq=2,
                      reserved_tokens=['<pad>', '<bos>', '<eos>'])

# len(src_vocab)


#@save
def truncate_pad(line, num_steps, padding_token):
    """Truncate or pad sequences."""
    if len(line) > num_steps:
        return line[:num_steps]  # Truncate
    return line + [padding_token] * (num_steps - len(line))  # Pad

print('truncate_pad:',
      truncate_pad(src_vocab[source[0]], 10, src_vocab['<pad>']))
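Example #1's load_data_nmt also calls build_array_nmt, which does not appear in this listing. Below is a sketch of what it needs to do, appending <eos>, truncating/padding to num_steps, and recording valid lengths; it assumes torch is imported and is not taken from the original page:

def build_array_nmt(lines, vocab, num_steps):
    """Transform text sequences of machine translation into minibatches."""
    lines = [vocab[l] for l in lines]
    lines = [l + [vocab['<eos>']] for l in lines]
    array = torch.tensor([
        truncate_pad(l, num_steps, vocab['<pad>']) for l in lines])
    valid_len = (array != vocab['<pad>']).type(torch.int32).sum(1)
    return array, valid_len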
Example #9
import collections
import re
from d2l import torch as d2l
import random
import torch

tokens = d2l.tokenize(d2l.read_time_machine())
corpus = [token for line in tokens for token in line]
vocab = d2l.Vocab(corpus)
freqs = [freq for _, freq in vocab.token_freqs]

bigram_tokens = [pair for pair in zip(corpus[:-1], corpus[1:])]
bigram_vocab = d2l.Vocab(bigram_tokens)
print(bigram_vocab.token_freqs[:10])
bifreqs = [freq for _, freq in bigram_vocab.token_freqs]

trigram_tokens = [tup for tup in zip(corpus[:-2], corpus[1:-1], corpus[2:])]
trigram_vocab = d2l.Vocab(trigram_tokens)
print(trigram_vocab.token_freqs[:10])
trifreqs = [freq for _, freq in trigram_vocab.token_freqs]

d2l.plot([freqs, bifreqs, trifreqs],
         xlabel="token: x",
         ylabel="frequency: n(x)",
         xscale="log",
         yscale="log",
         legend=["unigram", "bigram", "trigram"])
d2l.plt.show()


def seq_data_iter_random(corpus, batch_size, num_steps):
    """Generate a minibatch of subsequences using random sampling."""
    # Start with a random offset to partition the sequence
    corpus = corpus[random.randint(0, num_steps - 1):]
    # Subtract 1 to leave room for the labels (targets are inputs shifted by one)
    num_subseqs = (len(corpus) - 1) // num_steps
    # Randomize the starting indices of the length-`num_steps` subsequences
    initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
    random.shuffle(initial_indices)
    num_batches = num_subseqs // batch_size
    for i in range(0, batch_size * num_batches, batch_size):
        indices = initial_indices[i:i + batch_size]
        X = [corpus[j:j + num_steps] for j in indices]
        Y = [corpus[j + 1:j + 1 + num_steps] for j in indices]
        yield torch.tensor(X), torch.tensor(Y)
Example #10
from d2l import torch as d2l
import torch
import random

tokens = d2l.tokenize(d2l.read_time_machine())
# Since each text line is not necessarily a sentence or a paragraph, we
# concatenate all text lines
corpus = [token for line in tokens for token in line]
vocab = d2l.Vocab(corpus)
print(vocab.token_freqs[:10])

Example #11

import math
import os
import random

from d2l import torch as d2l
d2l.DATA_HUB['ptb'] = (d2l.DATA_URL + 'ptb.zip',
                       '319d85e578af0cdc590547f26231e4e31cdf1e42')


def read_ptb():
    data_dir = d2l.download_extract('ptb')
    with open(os.path.join(data_dir, 'ptb.train.txt')) as f:
        raw_text = f.read()
    return [line.split() for line in raw_text.split('\n')]


sentences = read_ptb()

print("# sentences: {}".format(len(sentences)))
# %%
vocab = d2l.Vocab(sentences, min_freq=10)
print("vocab size: {}".format(len(vocab)))


# %%
def subsampling(sentences, vocab):
    #map low frequency words into <unk>
    sentences = [[vocab.idx_to_token[vocab[tk]] for tk in line]
                 for line in sentences]
    # count the frequency for each word
    counter = d2l.count_corpus(sentences)
    num_tokens = sum(counter.values())

    # Return True if the token is kept during subsampling
    def keep(token):
        return (random.uniform(0, 1) <
                math.sqrt(1e-4 / counter[token] * num_tokens))

    # Discard high-frequency tokens with high probability
    return [[tk for tk in line if keep(tk)] for line in sentences]


Example #12

import os

import torch
from d2l import torch as d2l


def read_imdb(data_dir, is_train):
    """Read the IMDb review texts and labels from the extracted dataset."""
    data, labels = [], []
    for label in ('pos', 'neg'):
        folder_name = os.path.join(data_dir,
                                   'train' if is_train else 'test', label)
        for file in os.listdir(folder_name):
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', '')
                data.append(review)
                labels.append(1 if label == 'pos' else 0)
    return data, labels


data_dir = d2l.download_extract('aclImdb', 'aclImdb')
#%%
train_data = read_imdb(data_dir, is_train=True)
print('# trainings:', len(train_data[0]))
for x, y in zip(train_data[0][:3], train_data[1][:3]):
    print('label:', y, 'review:', x[:60])

# %%
train_tokens = d2l.tokenize(train_data[0], token='word')
vocab = d2l.Vocab(train_tokens, min_freq=5, reserved_tokens=['<pad>'])

d2l.set_figsize()
d2l.plt.hist([len(line) for line in train_tokens], bins=range(0,1000,50))

#%%
num_steps = 500
train_features = torch.tensor([d2l.truncate_pad(
    vocab[line], num_steps, vocab['<pad>']) for line in train_tokens])
print(train_features.shape)

#%%
train_iter = d2l.load_array((train_features, torch.tensor(train_data[1])), 64)
for X, y in train_iter:
    print('X:', X.shape, ',y:', y.shape)
    break
Example #13
def load_data_ptb(batch_size, max_window_size, num_noise_words):
    num_workers = d2l.get_dataloader_workers()
    sentences = read_ptb()
    vocab = d2l.Vocab(sentences, min_freq=10)