def load_data_nmt(batch_size, num_steps, num_examples=600):
    """Return the iterator and the vocabularies of the translation dataset."""
    text = preprocess_nmt(read_data_nmt())
    source, target = tokenize_nmt(text, num_examples)
    src_vocab = d2l.Vocab(source, min_freq=2,
                          reserved_tokens=['<pad>', '<bos>', '<eos>'])
    tgt_vocab = d2l.Vocab(target, min_freq=2,
                          reserved_tokens=['<pad>', '<bos>', '<eos>'])
    src_array, src_valid_len = build_array_nmt(source, src_vocab, num_steps)
    tgt_array, tgt_valid_len = build_array_nmt(target, tgt_vocab, num_steps)
    data_arrays = (src_array, src_valid_len, tgt_array, tgt_valid_len)
    data_iter = d2l.load_array(data_arrays, batch_size)
    return data_iter, src_vocab, tgt_vocab
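# Quick sanity check of `load_data_nmt` (a sketch, assuming the helper
# functions above and the d2l translation dataset are available): draw one
# minibatch and inspect its shapes and valid lengths.
train_iter, src_vocab, tgt_vocab = load_data_nmt(batch_size=2, num_steps=8)
for X, X_valid_len, Y, Y_valid_len in train_iter:
    print('X shape:', X.shape, 'X valid len:', X_valid_len)
    print('Y shape:', Y.shape, 'Y valid len:', Y_valid_len)
    break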
def __init__(self, paragraphs, max_len):
    # Input `paragraphs[i]` is a list of sentence strings representing a
    # paragraph; after tokenization, `paragraphs[i]` is a list of sentences,
    # where each sentence is a list of tokens
    paragraphs = [d2l.tokenize(paragraph, token='word')
                  for paragraph in paragraphs]
    sentences = [sentence for paragraph in paragraphs
                 for sentence in paragraph]
    self.vocab = d2l.Vocab(sentences, min_freq=5, reserved_tokens=[
        '<pad>', '<mask>', '<cls>', '<sep>'])
    # Get data for the next sentence prediction task
    examples = []
    for paragraph in paragraphs:
        examples.extend(_get_nsp_data_from_paragraph(
            paragraph, paragraphs, self.vocab, max_len))
    # Get data for the masked language model task
    examples = [(_get_mlm_data_from_tokens(tokens, self.vocab)
                 + (segments, is_next))
                for tokens, segments, is_next in examples]
    # Pad inputs
    (self.all_token_ids, self.all_segments, self.valid_lens,
     self.all_pred_positions, self.all_mlm_weights, self.all_mlm_labels,
     self.nsp_labels) = _pad_bert_inputs(examples, max_len, self.vocab)
def load_pretrained_model(pretrained_model, num_hiddens, ffn_num_hiddens,
                          num_heads, num_layers, dropout, max_len, devices):
    data_dir = d2l.download_extract(pretrained_model)
    # Define an empty vocabulary to load the predefined vocabulary
    vocab = d2l.Vocab()
    vocab.idx_to_token = json.load(
        open(os.path.join(data_dir, 'vocab.json')))
    vocab.token_to_idx = {
        token: idx for idx, token in enumerate(vocab.idx_to_token)}
    # Note: several architecture hyperparameters (the 256-dimensional sizes,
    # `num_heads=4`, `num_layers=2`, `dropout=0.2`) are fixed here so that the
    # model matches the pretrained checkpoint being loaded
    bert = d2l.BERTModel(len(vocab), num_hiddens, norm_shape=[256],
                         ffn_num_input=256, ffn_num_hiddens=ffn_num_hiddens,
                         num_heads=4, num_layers=2, dropout=0.2,
                         max_len=max_len, key_size=256, query_size=256,
                         value_size=256, hid_in_features=256,
                         mlm_in_features=256, nsp_in_features=256)
    # Load pretrained BERT parameters
    bert.load_state_dict(
        torch.load(os.path.join(data_dir, 'pretrained.params')))
    return bert, vocab
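# Hedged usage sketch: the argument values below (including a small pretrained
# BERT registered in `d2l.DATA_HUB` under a key such as 'bert.small') are
# assumptions, not part of the function above; adjust them to whatever
# checkpoint is actually registered.
devices = d2l.try_all_gpus()
bert, vocab = load_pretrained_model(
    'bert.small', num_hiddens=256, ffn_num_hiddens=512, num_heads=4,
    num_layers=2, dropout=0.1, max_len=512, devices=devices)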
def load_data_ptb(batch_size, max_window_size, num_noise_words):
    num_workers = d2l.get_dataloader_workers()
    sentences = read_ptb()
    vocab = d2l.Vocab(sentences, min_freq=10)
    subsampled = subsampling(sentences, vocab)
    corpus = [vocab[line] for line in subsampled]
    all_centers, all_contexts = get_centers_and_contexts(
        corpus, max_window_size)
    all_negatives = get_negatives(all_contexts, corpus, num_noise_words)

    class PTBDataset(torch.utils.data.Dataset):
        def __init__(self, centers, contexts, negatives):
            assert len(centers) == len(contexts) == len(negatives)
            self.centers = centers
            self.contexts = contexts
            self.negatives = negatives

        def __getitem__(self, index):
            return (self.centers[index], self.contexts[index],
                    self.negatives[index])

        def __len__(self):
            return len(self.centers)

    dataset = PTBDataset(all_centers, all_contexts, all_negatives)
    data_iter = torch.utils.data.DataLoader(
        dataset, batch_size, shuffle=True, collate_fn=batchify,
        num_workers=num_workers)
    return data_iter, vocab
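# Usage sketch for `load_data_ptb`, assuming the `batchify` collate function
# used above returns (centers, contexts_negatives, masks, labels): print the
# shape of each field of the first minibatch.
names = ['centers', 'contexts_negatives', 'masks', 'labels']
data_iter, vocab = load_data_ptb(512, 5, 5)
for batch in data_iter:
    for name, data in zip(names, batch):
        print(name, 'shape:', data.shape)
    break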
def load_corpus_war_of_the_worlds(max_tokens=-1):
    """Return token indices and the vocabulary of the War of the Worlds dataset."""
    lines = read_war_of_the_worlds()
    tokens = d2l.tokenize(lines, 'char')
    vocab = d2l.Vocab(tokens)
    # Since each text line in the War of the Worlds dataset is not necessarily
    # a sentence or a paragraph, flatten all the text lines into a single list
    corpus = [vocab[token] for line in tokens for token in line]
    if max_tokens > 0:
        corpus = corpus[:max_tokens]
    return corpus, vocab
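# Minimal check, assuming `read_war_of_the_worlds` is defined elsewhere:
# the corpus is a flat list of character indices into `vocab`.
corpus, vocab = load_corpus_war_of_the_worlds()
print(len(corpus), len(vocab))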
def __init__(self, dataset, num_steps, vocab=None):
    self.num_steps = num_steps
    all_premise_tokens = d2l.tokenize(dataset[0])
    all_hypothesis_tokens = d2l.tokenize(dataset[1])
    if vocab is None:
        self.vocab = d2l.Vocab(all_premise_tokens + all_hypothesis_tokens,
                               min_freq=5, reserved_tokens=['<pad>'])
    else:
        self.vocab = vocab
    self.premises = self._pad(all_premise_tokens)
    self.hypotheses = self._pad(all_hypothesis_tokens)
    self.labels = torch.tensor(dataset[2])
    print('read ' + str(len(self.premises)) + ' examples')
def load_data_imdb(batch_size, num_steps=500):
    data_dir = d2l.download_extract('aclImdb', 'aclImdb')
    train_data = read_imdb(data_dir, True)
    test_data = read_imdb(data_dir, False)
    train_tokens = d2l.tokenize(train_data[0], token='word')
    test_tokens = d2l.tokenize(test_data[0], token='word')
    vocab = d2l.Vocab(train_tokens, min_freq=5)
    train_features = torch.tensor([d2l.truncate_pad(
        vocab[line], num_steps, vocab['<pad>']) for line in train_tokens])
    test_features = torch.tensor([d2l.truncate_pad(
        vocab[line], num_steps, vocab['<pad>']) for line in test_tokens])
    train_iter = d2l.load_array((train_features, torch.tensor(train_data[1])),
                                batch_size)
    test_iter = d2l.load_array((test_features, torch.tensor(test_data[1])),
                               batch_size, is_train=False)
    return train_iter, test_iter, vocab
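# Usage sketch: build the IMDb iterators and peek at one training batch
# (assumes `read_imdb` is defined and the 'aclImdb' entry is registered in
# `d2l.DATA_HUB`).
train_iter, test_iter, vocab = load_data_imdb(64)
for X, y in train_iter:
    print('X:', X.shape, 'y:', y.shape)
    break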
    return source, target

source, target = tokenize_nmt(text)
print('source:', source[:6], 'target:', target[:6])

d2l.set_figsize()
_, _, patches = d2l.plt.hist(
    [[len(l) for l in source], [len(l) for l in target]],
    label=['source', 'target'])
for patch in patches[1].patches:
    patch.set_hatch('/')
d2l.plt.legend(loc='upper right')

src_vocab = d2l.Vocab(source, min_freq=2,
                      reserved_tokens=['<pad>', '<bos>', '<eos>'])
# len(src_vocab)

#@save
def truncate_pad(line, num_steps, padding_token):
    """Truncate or pad sequences."""
    if len(line) > num_steps:
        return line[:num_steps]  # Truncate
    return line + [padding_token] * (num_steps - len(line))  # Pad

print('truncate_pad:',
      truncate_pad(src_vocab[source[0]], 10, src_vocab['<pad>']))
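# `load_data_nmt` above relies on a `build_array_nmt` helper that is not shown
# in this excerpt. A minimal sketch of such a helper built on `truncate_pad`
# (an assumption, not the original definition; it also assumes `torch` is
# imported): append <eos>, truncate/pad to `num_steps`, and record the number
# of non-padding tokens per sequence.
def build_array_nmt(lines, vocab, num_steps):
    """Transform text sequences into fixed-length minibatch arrays."""
    lines = [vocab[l] for l in lines]
    lines = [l + [vocab['<eos>']] for l in lines]
    array = torch.tensor([
        truncate_pad(l, num_steps, vocab['<pad>']) for l in lines])
    # Count the non-padding tokens in each sequence
    valid_len = (array != vocab['<pad>']).type(torch.int32).sum(1)
    return array, valid_len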
import collections
import re
from d2l import torch as d2l
import random
import torch

tokens = d2l.tokenize(d2l.read_time_machine())
corpus = [token for line in tokens for token in line]
vocab = d2l.Vocab(corpus)
freqs = [freq for _, freq in vocab.token_freqs]

bigram_tokens = [pair for pair in zip(corpus[:-1], corpus[1:])]
bigram_vocab = d2l.Vocab(bigram_tokens)
print(bigram_vocab.token_freqs[:10])
bifreqs = [freq for _, freq in bigram_vocab.token_freqs]

trigram_tokens = [tup for tup in zip(corpus[:-2], corpus[1:-1], corpus[2:])]
trigram_vocab = d2l.Vocab(trigram_tokens)
print(trigram_vocab.token_freqs[:10])
trifreqs = [freq for _, freq in trigram_vocab.token_freqs]

d2l.plot([freqs, bifreqs, trifreqs], xlabel="token: x",
         ylabel="frequency: n(x)", xscale="log", yscale="log",
         legend=["unigram", "bigram", "trigram"])
d2l.plt.show()

def seq_data_iter_random(corpus, batch_size, num_steps):
from d2l import torch as d2l
import torch
import random

tokens = d2l.tokenize(d2l.read_time_machine())
# Since each text line is not necessarily a sentence or a paragraph, we
# concatenate all text lines
corpus = [token for line in tokens for token in line]
vocab = d2l.Vocab(corpus)
vocab.token_freqs[:10]
d2l.DATA_HUB['ptb'] = (d2l.DATA_URL + 'ptb.zip',
                       '319d85e578af0cdc590547f26231e4e31cdf1e42')

def read_ptb():
    data_dir = d2l.download_extract('ptb')
    with open(os.path.join(data_dir, 'ptb.train.txt')) as f:
        raw_text = f.read()
    return [line.split() for line in raw_text.split('\n')]

sentences = read_ptb()
print("# sentences: {}".format(len(sentences)))

# %%
vocab = d2l.Vocab(sentences, min_freq=10)
print("vocab size: {}".format(len(vocab)))

# %%
def subsampling(sentences, vocab):
    # Map low-frequency words to <unk>
    sentences = [[vocab.idx_to_token[vocab[tk]] for tk in line]
                 for line in sentences]
    # Count the frequency for each word
    counter = d2l.count_corpus(sentences)
    num_tokens = sum(counter.values())

    # Return True if this token should be kept during subsampling
    def keep(token):
        return (random.uniform(0, 1) < math.sqrt(
                                   label)
        for file in os.listdir(folder_name):
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', '')
                data.append(review)
                labels.append(1 if label == 'pos' else 0)
    return data, labels

#%%
train_data = read_imdb(data_dir, is_train=True)
print('# trainings:', len(train_data[0]))
for x, y in zip(train_data[0][:3], train_data[1][:3]):
    print('label:', y, 'review', x[0:60])

# %%
train_tokens = d2l.tokenize(train_data[0], token='word')
vocab = d2l.Vocab(train_tokens, min_freq=5, reserved_tokens=['<pad>'])
d2l.set_figsize()
d2l.plt.hist([len(line) for line in train_tokens], bins=range(0, 1000, 50))

#%%
num_steps = 500
train_features = torch.tensor([d2l.truncate_pad(
    vocab[line], num_steps, vocab['<pad>']) for line in train_tokens])
print(train_features.shape)

#%%
train_iter = d2l.load_array((train_features, torch.tensor(train_data[1])), 64)
for X, y in train_iter:
    print('X:', X.shape, ', y:', y.shape)
    break