def test_transformer_batch_tgt_masking():
    """Check two rows of the target mask built by ``TransformerBatch``.

    A two-sentence toy batch is padded and wrapped in a ``TransformerBatch``.
    The test then inspects ``tgt_mask`` for the first example at its first
    and last rows, which together exercise the blend of padding masking and
    auto-regressive masking.
    """
    source_sentences = [["the", "cow", "jumped", "over", "the", "moon"],
                        ["the", "british", "are", "coming"]]
    target_sentences = [["la", "vache", "a", "sauté", "sur", "la", "lune"],
                        ["les", "britanniques", "arrivent"]]
    source_vocab = NLPVocabulary.build_vocabulary(source_sentences)
    target_vocab = NLPVocabulary.build_vocabulary(target_sentences)
    max_seq_length = 10

    # The target is padded one position longer than the source.
    # NOTE(review): presumably TransformerBatch splits it into shifted
    # input/label halves of length max_seq_length -- confirm.
    src_padded = TransformerDataset.padded_string_to_integer(
        source_sentences, max_seq_length, source_vocab)
    tgt_padded = TransformerDataset.padded_string_to_integer(
        target_sentences, max_seq_length + 1, target_vocab)
    batch = TransformerBatch(torch.LongTensor(src_padded),
                             torch.LongTensor(tgt_padded))

    # Last row of the first example: only the eight non-padding positions
    # (7 target tokens plus the EOS appended during padding) stay visible.
    expected_last_row = torch.BoolTensor([True] * 8 + [False] * 2)
    assert torch.equal(batch.tgt_mask[0, 9, :], expected_last_row)

    # First row of the first example: only the very first position is visible.
    expected_first_row = torch.BoolTensor([True] + [False] * 9)
    assert torch.equal(batch.tgt_mask[0, 0, :], expected_first_row)
def test_transformer_batch_dimensions():
    """Verify the tensor shapes exposed by a ``TransformerBatch``.

    Builds a two-sentence toy batch padded to 20 positions (21 for the
    target) and asserts the sizes of ``src``, ``src_mask``, ``tgt``,
    ``tgt_y`` and ``tgt_mask``.
    """
    source_sentences = [["the", "cow", "jumped", "over", "the", "moon"],
                        ["the", "british", "are", "coming"]]
    target_sentences = [["la", "vache", "a", "sauté", "sur", "la", "lune"],
                        ["les", "britanniques", "arrivent"]]
    n_examples = len(source_sentences)
    source_vocab = NLPVocabulary.build_vocabulary(source_sentences)
    target_vocab = NLPVocabulary.build_vocabulary(target_sentences)
    seq_len = 20

    source_ids = TransformerDataset.padded_string_to_integer(
        source_sentences, seq_len, source_vocab)
    # The target gets one extra position compared with the source.
    target_ids = TransformerDataset.padded_string_to_integer(
        target_sentences, seq_len + 1, target_vocab)
    batch = TransformerBatch(torch.LongTensor(source_ids),
                             torch.LongTensor(target_ids))

    # src, tgt and tgt_y are all expected to be (batch, seq_len).
    flat_shape = torch.Size([n_examples, seq_len])
    assert batch.src.size() == flat_shape
    assert batch.src_mask.size() == torch.Size([n_examples, 1, seq_len])
    assert batch.tgt.size() == flat_shape
    assert batch.tgt_y.size() == flat_shape
    assert batch.tgt_mask.size() == torch.Size([n_examples, seq_len, seq_len])
def test_transformer_batch_src_masking():
    """Check the source padding mask built by ``TransformerBatch``.

    A two-sentence toy batch is padded to 10 positions and wrapped in a
    ``TransformerBatch``. For each example the source mask is expected to be
    True over the real tokens and the EOS position, and False over padding.
    """
    source_sentences = [["the", "cow", "jumped", "over", "the", "moon"],
                        ["the", "british", "are", "coming"]]
    target_sentences = [["la", "vache", "a", "sauté", "sur", "la", "lune"],
                        ["les", "britanniques", "arrivent"]]
    source_vocab = NLPVocabulary.build_vocabulary(source_sentences)
    target_vocab = NLPVocabulary.build_vocabulary(target_sentences)
    max_seq_length = 10

    src_padded = TransformerDataset.padded_string_to_integer(
        source_sentences, max_seq_length, source_vocab)
    tgt_padded = TransformerDataset.padded_string_to_integer(
        target_sentences, max_seq_length + 1, target_vocab)
    batch = TransformerBatch(torch.LongTensor(src_padded),
                             torch.LongTensor(tgt_padded))

    # First sentence: 6 tokens plus EOS -> 7 visible positions.
    # The expected tensors keep the extra leading axis of size one.
    expected_first = torch.BoolTensor([[True] * 7 + [False] * 3])
    assert torch.equal(batch.src_mask[0, :], expected_first)

    # Second sentence: 4 tokens plus EOS -> 5 visible positions.
    expected_second = torch.BoolTensor([[True] * 5 + [False] * 5])
    assert torch.equal(batch.src_mask[1, :], expected_second)
def padded_string_to_integer(token_list: List[List[str]],
                             max_sequence_length: int,
                             vocab: NLPVocabulary) -> List[List[int]]:
    """
    Convert tokenized sequences into fixed-length rows of vocabulary indices.

    Sequences shorter than ``max_sequence_length`` are followed by the EOS
    index and then filled with the mask (padding) index. Sequences that are
    at least ``max_sequence_length`` long are truncated and receive no EOS.

    Args:
        token_list (List[List[str]]): Tokenized sequences to convert.
        max_sequence_length (int): Length of every returned row.
        vocab (NLPVocabulary): Vocabulary providing ``lookup_token``,
            ``eos_index`` and ``mask_index``.

    Returns:
        One list of indices per input sequence, each of length
        ``max_sequence_length``.
    """
    padded_rows = []
    for sentence in token_list:
        n_tokens = len(sentence)
        # Every token is looked up, including ones that end up truncated.
        encoded = [vocab.lookup_token(token) for token in sentence]
        if n_tokens < max_sequence_length:
            # Room left: terminate with EOS, then pad out to full length.
            padding = [vocab.mask_index] * (max_sequence_length - n_tokens - 1)
            row = encoded + [vocab.eos_index] + padding
        else:
            # Truncated (or exactly full) sequences carry no EOS marker.
            # Clamp so a non-positive length yields an empty row instead of
            # a negative slice.
            row = encoded[:max(max_sequence_length, 0)]
        padded_rows.append(row)
    return padded_rows
def get_training_data(cls, max_sequence_length: int) -> Tuple[AbstractNLPDataset, NLPVocabulary]:
    """
    Load the IMDB training split and turn it into a padded, indexed dataset.

    Args:
        max_sequence_length (int): Length every review is padded or
            truncated to.

    Returns:
        Tuple of the dataset instance and the pruned vocabulary it was
        encoded with.
    """
    # Fetch the IMDB sentiment data; the original author notes the targets
    # are {0, 1} and the data is not shuffled.
    imdb_train = load_dataset("imdb")['train']
    raw_labels = list(imdb_train.data[0])
    raw_reviews = list(imdb_train.data[1])

    # Convert the loaded values into native Python objects.
    reviews = [str(review) for review in raw_reviews]
    labels = [label.as_py() for label in raw_labels]

    # Tokenize with the project tokenizer. Over-long reviews are not
    # filtered out here; padded_string_to_integer is given the maximum length.
    tokenized_reviews = tokenize_corpus_basic(reviews, False)

    # Build the full vocabulary, then prune it with a 1e-6 threshold to
    # keep the dictionary small.
    full_vocab = NLPVocabulary.build_vocabulary(tokenized_reviews)
    vocab_small = cls.prune_vocab(full_vocab, 1.e-6)

    encoded_reviews = cls.padded_string_to_integer(
        tokenized_reviews, max_sequence_length, vocab_small)
    examples = list(zip(labels, encoded_reviews))
    return cls(examples, vocab_small), vocab_small
def test_padded_string_to_integer_conversion():
    """Check token-to-index conversion and padding for a single sentence."""
    sentences = [["the", "cow", "jumped", "over", "the", "moon"]]
    vocabulary = NLPVocabulary.build_vocabulary(sentences)
    padded = TransformerDataset.padded_string_to_integer(
        sentences, 10, vocabulary)

    # "the" occurs twice and so maps to the same index (3) both times.
    # NOTE(review): 2 is presumably the EOS index and 0 the padding index
    # of NLPVocabulary -- confirm against the vocabulary class.
    expected_row = [3, 4, 5, 6, 3, 7, 2, 0, 0, 0]
    assert padded[0] == expected_row
def build_vocab(filepath: AnyStr, tokenizer):
    """
    Build an NLPVocabulary from the lines of a UTF-8 text file.

    Args:
        filepath (AnyStr): Path of the file to read.
        tokenizer (function): Callable that takes a list of strings and
            returns one token sequence per string.

    Returns:
        An NLPVocabulary containing the tokens of every line in the file.
    """
    vocabulary = NLPVocabulary()
    with io.open(filepath, encoding="utf8") as handle:
        for line in handle:
            # The tokenizer works on batches, so wrap the line in a
            # one-item list and take the single result back out.
            tokenized_batch = tokenizer([line])
            vocabulary.add_many(tokenized_batch[0])
    return vocabulary
def test_transformer_regression_test():
    """Regression test: model output must match a stored reference tensor.

    After seeding every random number generator, a small transformer is
    built and run on a two-sentence toy batch. Its output is compared with
    ``TRANSFORMER_REGRESSION_TEST_DATA`` to within an absolute tolerance
    of 1e-4.
    """
    # Seed first so weight initialization is reproducible.
    utils.set_seed_everywhere()

    test_2_args = Namespace(
        num_layers_per_stack=2,
        dim_model=512,
        dim_ffn=2048,
        num_heads=8,
        max_sequence_length=20,
        dropout=0.1,
    )

    # Mock dataset.
    src_tokens = [["the", "cow", "jumped", "over", "the", "moon"],
                  ["the", "british", "are", "coming"]]
    tgt_tokens = [["la", "vache", "a", "sauté", "sur", "la", "lune"],
                  ["les", "britanniques", "arrivent"]]
    dictionary_source = NLPVocabulary.build_vocabulary(src_tokens)
    dictionary_target = NLPVocabulary.build_vocabulary(tgt_tokens)
    max_seq_length = 20

    src_padded = TransformerDataset.padded_string_to_integer(
        src_tokens, max_seq_length, dictionary_source)
    # The target is padded one position longer than the source.
    tgt_padded = TransformerDataset.padded_string_to_integer(
        tgt_tokens, max_seq_length + 1, dictionary_target)
    data = TransformerBatch(torch.LongTensor(src_padded),
                            torch.LongTensor(tgt_padded))

    model = transformer.Transformer(len(dictionary_source),
                                    len(dictionary_target),
                                    test_2_args.num_layers_per_stack,
                                    test_2_args.dim_model,
                                    test_2_args.dim_ffn,
                                    test_2_args.num_heads,
                                    test_2_args.max_sequence_length,
                                    test_2_args.dropout)

    # Push the batch through the model.
    y_hat = model(data)

    # Compare against the stored reference output.
    expected_output = transformer_regression_test_data.TRANSFORMER_REGRESSION_TEST_DATA
    eps = 1.e-4
    assert np.allclose(y_hat.data.numpy(),
                       expected_output.data.numpy(),
                       atol=eps)
def test_transformer_dataset_returns_two_tensors():
    """Check that indexing a ``TransformerDataset`` yields a pair of tensors.

    The dataset is built from zipped (source, target) padded index rows.
    Item 1 must contain exactly two elements, each a ``torch.Tensor``.
    """
    src_tokens = [["the", "cow", "jumped", "over", "the", "moon"],
                  ["the", "british", "are", "coming"]]
    tgt_tokens = [["la", "vache", "a", "sauté", "sur", "la", "lune"],
                  ["les", "britanniques", "arrivent"]]
    dictionary_source = NLPVocabulary.build_vocabulary(src_tokens)
    dictionary_target = NLPVocabulary.build_vocabulary(tgt_tokens)
    max_seq_length = 20

    src_padded = TransformerDataset.padded_string_to_integer(
        src_tokens, max_seq_length, dictionary_source)
    tgt_padded = TransformerDataset.padded_string_to_integer(
        tgt_tokens, max_seq_length + 1, dictionary_target)
    dataset = TransformerDataset(list(zip(src_padded, tgt_padded)),
                                 dictionary_source)

    item = dataset[1]
    # Separate asserts so a failure reports which condition broke.
    assert len(item) == 2
    assert isinstance(item[0], torch.Tensor)
    assert isinstance(item[1], torch.Tensor)
def test_input_output_dims_transformer():
    """Check the output shape of the transformer on a toy batch.

    The model output is expected to have size
    (batch size, sequence length, target vocabulary size).
    """
    hyper_params = Namespace(
        num_layers_per_stack=2,
        dim_model=512,
        dim_ffn=2048,
        num_heads=8,
        max_sequence_length=20,
        dropout=0.1,
    )

    # Two-sentence mock corpus.
    source_sentences = [["the", "cow", "jumped", "over", "the", "moon"],
                        ["the", "british", "are", "coming"]]
    target_sentences = [["la", "vache", "a", "sauté", "sur", "la", "lune"],
                        ["les", "britanniques", "arrivent"]]
    n_examples = len(source_sentences)
    source_vocab = NLPVocabulary.build_vocabulary(source_sentences)
    target_vocab = NLPVocabulary.build_vocabulary(target_sentences)
    seq_len = 20

    source_ids = TransformerDataset.padded_string_to_integer(
        source_sentences, seq_len, source_vocab)
    # The target is padded one position longer than the source.
    target_ids = TransformerDataset.padded_string_to_integer(
        target_sentences, seq_len + 1, target_vocab)
    batch = TransformerBatch(torch.LongTensor(source_ids),
                             torch.LongTensor(target_ids))

    model = transformer.Transformer(
        len(source_vocab),
        len(target_vocab),
        hyper_params.num_layers_per_stack,
        hyper_params.dim_model,
        hyper_params.dim_ffn,
        hyper_params.num_heads,
        hyper_params.max_sequence_length,
        hyper_params.dropout,
    )

    # Forward pass, then compare against the expected output shape.
    predictions = model(batch)
    expected_shape = torch.Size([n_examples, seq_len, len(target_vocab)])
    assert predictions.size() == expected_shape
def get_target_context_data(cls, train_text: List, dictionary: NLPVocabulary,
                            context_size: int, train: bool) -> List:
    """
    Turn tokenized text into a flat list of skip-gram context results.

    Each sequence is mapped to vocabulary indices and handed to
    ``cls.get_skipgram_context`` together with the word discard
    probabilities; the per-sequence results are concatenated in order.

    Args:
        train_text (list): Tokenized sequences to derive pairs from.
        dictionary (NLPVocabulary): Vocabulary used to map tokens to indices
            and to obtain the word discard probabilities.
        context_size (int): Context window passed to
            ``get_skipgram_context``.
        train (bool): Training flag passed to ``get_skipgram_context``
            (per the original docstring, sub-sampling is applied only when
            training).

    Returns:
        List with the concatenated output of ``get_skipgram_context`` for
        every sequence.
    """
    # Fetched once up front and shared by every sequence.
    discard_probas = dictionary.get_word_discard_probas()

    pairs = []
    for sentence in train_text:
        token_ids = list(map(dictionary.lookup_token, sentence))
        pairs += cls.get_skipgram_context(token_ids, context_size,
                                          discard_probas, train)
    return pairs
def prune_vocab(cls, vocab: NLPVocabulary, prob_thresh: float) -> NLPVocabulary:
    """
    Shrink a vocabulary by dropping tokens at or below a frequency threshold.

    Args:
        vocab (NLPVocabulary): The original vocabulary.
        prob_thresh (float): Tokens whose frequency is strictly greater than
            this value are kept.

    Returns:
        A new vocabulary rebuilt from the kept tokens only.
    """
    frequencies = vocab.get_word_frequencies()
    # Element-wise comparison; assumes get_word_frequencies returns an
    # array-like supporting ``>`` against a scalar -- TODO confirm.
    # Per the original author's note, special tokens have zero counts.
    above_threshold = frequencies > prob_thresh
    index_lookup = vocab.idx_to_token

    surviving_tokens = [
        index_lookup[position]
        for position, is_kept in enumerate(above_threshold)
        if is_kept
    ]

    # Rebuild from scratch, treating the survivors as a single sequence.
    return NLPVocabulary.build_vocabulary([surviving_tokens])
def get_training_data(
        cls, block_size: int) -> Tuple[AbstractNLPDataset, NLPVocabulary]:
    """
    Build the language-model dataset and vocabulary from wikitext-2.

    Args:
        block_size (int): The size of the context window, forwarded to the
            dataset constructor.

    Returns:
        Tuple of the dataset and the vocabulary built from the corpus.
    """
    # Download the raw wikitext-2 training split.
    wikitext_train = load_dataset("wikitext", 'wikitext-2-raw-v1')['train']

    # Flatten all entries into one space-joined string and tokenize it as
    # a single document. The whole corpus is used; nothing is truncated.
    joined_text = " ".join(map(str, wikitext_train._data[0]))
    tokenized = tokenize_corpus_basic([joined_text], False)
    corpus_tokens = tokenized[0]

    # Build the vocabulary, then encode every token as its index.
    vocab = NLPVocabulary.build_vocabulary([corpus_tokens])
    token_ids = torch.LongTensor(
        [vocab.token_to_idx[token] for token in corpus_tokens])

    # The dataset class takes the encoded corpus, vocabulary and block size.
    return cls(token_ids, vocab, block_size), vocab
def get_training_data(
        cls, *args: Any) -> Tuple[AbstractNLPDataset, NLPVocabulary]:
    """
    Build the training dataset and vocabulary from the "ag_news" corpus.

    The raw text is tokenized, a vocabulary is built from it, the
    sub-sampling threshold is set on that vocabulary, and the training
    pairs are generated with ``get_target_context_data``.

    Args:
        *args: Exactly two positional values, ``(context_size, thresh)``:
            the context window size and the sub-sampling threshold.

    Returns:
        (dataset, vocabulary) tuple to be used downstream in training.
    """
    # Unpacking fails with ValueError unless exactly two values are given.
    context_size, thresh = args

    # AG News text via Hugging Face datasets.
    corpus = load_dataset("ag_news")['train']['text']
    tokenized_corpus = tokenize_corpus_basic(corpus)

    vocabulary = NLPVocabulary.build_vocabulary(tokenized_corpus)
    # Threshold used for sub-sampling.
    vocabulary.set_proba_thresh(thresh)

    pairs = cls.get_target_context_data(tokenized_corpus,
                                        vocabulary,
                                        context_size,
                                        train=True)
    return cls(pairs), vocabulary