def create_data_loader(batch_size, num_steps, data_path):
    train_ds, valid_ds, test_ds = load_dataset('ptb',
                                               splits=('train', 'valid', 'test'))

    train_examples = [
        train_ds[i]['sentence'].split() for i in range(len(train_ds))
    ]
    vocab = Vocab.build_vocab(train_examples, eos_token='</eos>')

    # Because the sentences in the PTB dataset may be consecutive, we concatenate
    # all texts from the dataset and fold them into chunks whose number of rows
    # equals the batch size. For example:
    #
    #   Sentence1: we're talking about years ago before anyone heard of asbestos
    #       having any questionable properties.
    #   Sentence2: there is no asbestos in our products now.
    #
    #   Batch_size: 5
    #   Grouped_text: [["we're", "talking", "about", "years"],
    #                  ["ago", "before", "anyone", "heard"],
    #                  ["of", "asbestos", "having", "any"],
    #                  ["questionable", "properties", "there", "is"],
    #                  ["no", "asbestos", "in", "our"]]
    #
    def group_texts(examples):
        concat_examples = []
        for example in examples:
            concat_examples += example['sentence'].split() + ['</eos>']

        concat_examples = vocab.to_indices(concat_examples)

        max_seq_len = len(concat_examples) // batch_size
        reshaped_examples = np.asarray(
            concat_examples[0:batch_size * max_seq_len],
            dtype='int64').reshape((batch_size, max_seq_len))

        encoded_examples = []
        for i in range(max_seq_len // num_steps):
            encoded_examples.append(
                (np.copy(reshaped_examples[:, i * num_steps:(i + 1) * num_steps]),
                 np.copy(reshaped_examples[:,
                                           i * num_steps + 1:(i + 1) * num_steps + 1])))

        return encoded_examples

    train_ds.map(group_texts, batched=True)
    valid_ds.map(group_texts, batched=True)
    test_ds.map(group_texts, batched=True)

    train_loader = paddle.io.DataLoader(train_ds,
                                        return_list=True,
                                        batch_size=None)
    valid_loader = paddle.io.DataLoader(valid_ds,
                                        return_list=True,
                                        batch_size=None)
    test_loader = paddle.io.DataLoader(test_ds,
                                       return_list=True,
                                       batch_size=None)

    return train_loader, valid_loader, test_loader, len(vocab)
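
# A minimal usage sketch for the loader defined above, assuming the imports below
# are available and that PaddleNLP can download the PTB dataset. The batch_size
# and num_steps values are illustrative; data_path is accepted but unused by the
# snippet above.
import numpy as np
import paddle
from paddlenlp.data import Vocab
from paddlenlp.datasets import load_dataset

train_loader, valid_loader, test_loader, vocab_size = create_data_loader(
    batch_size=20, num_steps=35, data_path=None)
for inputs, labels in train_loader:
    # Each batch is a pair of int64 arrays of shape (batch_size, num_steps);
    # labels are the inputs shifted right by one token.
    print(inputs.shape, labels.shape)
    break
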
# Classmethod-style helper (receives cls): streams tokenized lines from the given
# files via cls.data_iterator and builds a Vocab from them.
def get_vocab(cls,
              files,
              max_size=None,
              min_freq=0,
              lower_case=True,
              delimiter=None,
              unk_token=None,
              pad_token=None,
              bos_token=None,
              eos_token=None,
              **kwargs):
    return Vocab.build_vocab(cls.data_iterator(files=files,
                                               delimiter=delimiter,
                                               lower_case=lower_case),
                             max_size=max_size,
                             min_freq=min_freq,
                             unk_token=unk_token,
                             pad_token=pad_token,
                             bos_token=bos_token,
                             eos_token=eos_token)
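
# For illustration only: Vocab.build_vocab accepts any iterator over token lists,
# so a plain file reader can stand in for cls.data_iterator. The file name and
# special tokens below are assumptions made for this sketch.
from paddlenlp.data import Vocab

def token_lines(path, lower_case=True):
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.lower() if lower_case else line
            yield line.split()

vocab = Vocab.build_vocab(token_lines('train.txt'),
                          min_freq=0,
                          unk_token='<unk>',
                          eos_token='<eos>')
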
def create_data_loader(args):
    batch_size = args.batch_size
    max_len = args.max_len

    if args.dataset == 'yahoo':
        train_ds, dev_ds, test_ds = load_dataset('yahoo_answer_100k',
                                                 splits=('train', 'valid', 'test'))
        vocab = Vocab.load_vocabulary(**train_ds.vocab_info)
    else:
        train_ds, dev_ds, test_ds = load_dataset('ptb',
                                                 splits=('train', 'valid', 'test'))
        examples = [
            train_ds[i]['sentence'].split() for i in range(len(train_ds))
        ]
        vocab = Vocab.build_vocab(examples)

    vocab_size = len(vocab)
    # bos/eos ids are appended after the vocabulary; pad reuses the eos id.
    bos_id = vocab_size
    eos_id = vocab_size + 1
    pad_id = vocab_size + 1

    def convert_example(example):
        features = vocab.to_indices(example['sentence'].split()[:max_len])
        return features

    key = (lambda x, data_source: len(data_source[x]))

    # Truncate and convert examples to ids.
    train_ds = train_ds.map(convert_example, lazy=False)
    dev_ds = dev_ds.map(convert_example, lazy=False)
    test_ds = test_ds.map(convert_example, lazy=False)

    train_batch_sampler = SamplerHelper(train_ds).shuffle().sort(
        key=key, buffer_size=batch_size * 20).batch(batch_size=batch_size)
    dev_batch_sampler = SamplerHelper(dev_ds).sort(
        key=key, buffer_size=batch_size * 20).batch(batch_size=batch_size)
    test_batch_sampler = SamplerHelper(test_ds).sort(
        key=key, buffer_size=batch_size * 20).batch(batch_size=batch_size)

    train_loader = paddle.io.DataLoader(train_ds,
                                        batch_sampler=train_batch_sampler,
                                        collate_fn=partial(prepare_train_input,
                                                           bos_id=bos_id,
                                                           eos_id=eos_id,
                                                           pad_id=pad_id))
    dev_loader = paddle.io.DataLoader(dev_ds,
                                      batch_sampler=dev_batch_sampler,
                                      collate_fn=partial(prepare_train_input,
                                                         bos_id=bos_id,
                                                         eos_id=eos_id,
                                                         pad_id=pad_id))
    test_loader = paddle.io.DataLoader(test_ds,
                                       batch_sampler=test_batch_sampler,
                                       collate_fn=partial(prepare_train_input,
                                                          bos_id=bos_id,
                                                          eos_id=eos_id,
                                                          pad_id=pad_id))

    return train_loader, dev_loader, test_loader, vocab, bos_id, pad_id, len(train_ds)
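
# Hypothetical sketch of the prepare_train_input collate function referenced above;
# the real implementation is not shown in this snippet. It receives a list of
# token-id lists (one per example) plus the bound bos/eos/pad ids, and is assumed
# to return padded source/target arrays for a seq2seq-style model.
import numpy as np

def prepare_train_input(batch, bos_id, eos_id, pad_id):
    src = [[bos_id] + ids for ids in batch]   # decoder input starts with bos
    trg = [ids + [eos_id] for ids in batch]   # target ends with eos
    max_len = max(len(ids) for ids in src)

    def pad(seqs):
        return np.array([ids + [pad_id] * (max_len - len(ids)) for ids in seqs],
                        dtype='int64')

    return pad(src), pad(trg)
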
def build_vocab(corpus, tokenizer, encoding_model, feat):
    """
    Build vocabs with the paddlenlp.data.Vocab.build_vocab() API, using
    token_to_idx to specify the mapping between tokens and the indices to be used.

    Args:
        corpus (obj:`list[list[str]]`): The training corpus, containing lists of
            input words, features and relations.
        tokenizer (obj:`PretrainedTokenizer`): A tokenizer inheriting from
            :class:`~paddlenlp.transformers.PretrainedTokenizer`, which provides
            most of the methods. If the encoding model is lstm, tokenizer is None.
        encoding_model (obj:`str`): The encoder used for embedding.
        feat (obj:`str`): The features used as model inputs. If the encoding model
            is lstm, feat can be `pos` or `char`; otherwise feat is None.

    Returns:
        word_vocab (obj:`Vocab`): Word vocab.
        feat_vocab (obj:`Vocab`): Feature vocab.
        rel_vocab (obj:`Vocab`): Relation vocab.
    """
    word_examples, feat_examples, rel_examples = corpus

    # Build word vocab and feature vocab.
    if encoding_model == "lstm":
        # token_to_idx pins the special tokens to fixed indices.
        word_vocab = Vocab.build_vocab(
            word_examples,
            min_freq=2,
            token_to_idx={"[PAD]": 0, "[UNK]": 1, "[BOS]": 2, "[EOS]": 3},
            unk_token="[UNK]",
            pad_token="[PAD]",
            bos_token="[BOS]",
            eos_token="[EOS]",
        )
        if feat == "pos":
            feat_vocab = Vocab.build_vocab(
                feat_examples,
                token_to_idx={"[BOS]": 0, "[EOS]": 1},
                bos_token="[BOS]",
                eos_token="[EOS]",
            )
        else:
            feat_vocab = Vocab.build_vocab(
                feat_examples,
                token_to_idx={"[PAD]": 0, "[UNK]": 1, "[BOS]": 2, "[EOS]": 3},
                unk_token="[UNK]",
                pad_token="[PAD]",
                bos_token="[BOS]",
                eos_token="[EOS]",
            )
    else:
        word_vocab = tokenizer.vocab
        feat_vocab = None

    # Build relation vocab.
    rel_vocab = Vocab.build_vocab(
        rel_examples,
        token_to_idx={"[BOS]": 0, "[EOS]": 1, "[UNK]": 2},
        bos_token="[BOS]",
        eos_token="[EOS]",
        unk_token="[UNK]",
    )
    return word_vocab, feat_vocab, rel_vocab
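
# A small, self-contained illustration of the lstm branch above. The toy words,
# POS tags and relations are invented for demonstration only.
from paddlenlp.data import Vocab

word_examples = [["she", "reads", "books"], ["he", "reads", "books"]]
feat_examples = [["PRP", "VBZ", "NNS"], ["PRP", "VBZ", "NNS"]]
rel_examples = [["nsubj", "root", "obj"], ["nsubj", "root", "obj"]]

word_vocab, feat_vocab, rel_vocab = build_vocab(
    corpus=(word_examples, feat_examples, rel_examples),
    tokenizer=None,
    encoding_model="lstm",
    feat="pos")

# Special tokens keep the indices pinned by token_to_idx; words below min_freq=2
# fall back to the [UNK] index.
print(word_vocab.to_indices(["[PAD]", "[UNK]", "reads"]), len(rel_vocab))
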
from paddlenlp.datasets.experimental import PTB
from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer
from paddlenlp.data import Stack, Tuple, Pad, Dict, Vocab
import numpy as np
from functools import partial
from paddle.io import DataLoader
from paddlenlp.datasets.experimental import load_dataset

train_ds, valid_ds, test_ds = load_dataset('ptb',
                                           splits=('train', 'valid', 'test'))

train_examples = [
    train_ds[i]['sentence'].split() for i in range(len(train_ds))
]
vocab = Vocab.build_vocab(train_examples, eos_token='</eos>')

batch_size = 8
num_steps = 35

def group_texts(examples):
    concat_examples = []
    for example in examples:
        concat_examples += example['sentence'].split() + ['</eos>']

    concat_examples = vocab.to_indices(concat_examples)

    max_seq_len = len(concat_examples) // batch_size
    reshaped_examples = np.asarray(concat_examples[0:batch_size * max_seq_len],
                                   dtype='int64').reshape(
                                       (batch_size, max_seq_len))
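
# A standalone illustration of the reshape step above, using a toy id stream
# instead of PTB so the chunking arithmetic is easy to verify by hand. The
# numbers here are invented for demonstration only.
import numpy as np

toy_ids = list(range(22))                       # pretend these are token ids
toy_batch_size = 4
toy_seq_len = len(toy_ids) // toy_batch_size    # 5; the trailing 2 ids are dropped
folded = np.asarray(toy_ids[:toy_batch_size * toy_seq_len],
                    dtype='int64').reshape((toy_batch_size, toy_seq_len))
print(folded.shape)  # (4, 5)
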