def test_character_token_embedder(self):
    vocab = Dictionary()
    vocab.add_symbol('hello')
    vocab.add_symbol('there')

    embedder = CharacterTokenEmbedder(
        vocab, [(2, 16), (4, 32), (8, 64), (16, 2)], 64, 5, 2
    )

    test_sents = [['hello', 'unk', 'there'], ['there'], ['hello', 'there']]
    max_len = max(len(s) for s in test_sents)
    # Each row is framed by EOS on both ends and padded out to max_len + 2.
    input = torch.LongTensor(len(test_sents), max_len + 2).fill_(vocab.pad())
    for i in range(len(test_sents)):
        input[i][0] = vocab.eos()
        for j in range(len(test_sents[i])):
            input[i][j + 1] = vocab.index(test_sents[i][j])
        input[i][j + 2] = vocab.eos()
    embs = embedder(input)

    assert embs.size() == (len(test_sents), max_len + 2, 5)
    # Identical tokens (EOS, 'hello', 'there') should yield identical embeddings.
    self.assertAlmostEqual(embs[0][0], embs[1][0])
    self.assertAlmostEqual(embs[0][0], embs[0][-1])
    self.assertAlmostEqual(embs[0][1], embs[2][1])
    self.assertAlmostEqual(embs[0][3], embs[1][1])

    embs.sum().backward()
    assert embedder.char_embeddings.weight.grad is not None
def _get_test_data(self):
    vocab = Dictionary()
    vocab.add_symbol("he@@")
    vocab.add_symbol("llo")
    vocab.add_symbol("how")
    vocab.add_symbol("are")
    vocab.add_symbol("y@@")
    vocab.add_symbol("ou")
    vocab.add_symbol("n@@")
    vocab.add_symbol("ew")
    vocab.add_symbol("or@@")
    vocab.add_symbol("k")

    src_tokens = [
        ["he@@", "llo", "n@@", "ew", "y@@", "or@@", "k"],
        ["how", "are", "y@@", "ou"],
    ]
    src_len = [len(x) for x in src_tokens]
    # +1 column to leave room for the EOS appended to each sequence
    x = torch.LongTensor(len(src_tokens), max(src_len) + 1).fill_(vocab.pad())
    for i in range(len(src_tokens)):
        for j in range(len(src_tokens[i])):
            x[i][j] = vocab.index(src_tokens[i][j])
        x[i][j + 1] = vocab.eos()

    x = x.transpose(1, 0)
    return vocab, x, torch.LongTensor([i + 1 for i in src_len])
def _get_test_data(self, append_eos=True):
    vocab = Dictionary()
    vocab.add_symbol("he@@")
    vocab.add_symbol("llo")
    vocab.add_symbol("how")
    vocab.add_symbol("are")
    vocab.add_symbol("y@@")
    vocab.add_symbol("ou")
    vocab.add_symbol("n@@")
    vocab.add_symbol("ew")
    vocab.add_symbol("or@@")
    vocab.add_symbol("k")

    src_tokens = [
        ["he@@", "llo", "n@@", "ew", "y@@", "or@@", "k"],
        ["how", "are", "y@@", "ou"],
    ]
    src_len = [len(x) for x in src_tokens]
    # If we have to append EOS, we include EOS in counting src length
    if append_eos:
        src_len = [length + 1 for length in src_len]

    x = torch.LongTensor(len(src_tokens), max(src_len)).fill_(vocab.pad())
    for i in range(len(src_tokens)):
        for j in range(len(src_tokens[i])):
            x[i][j] = vocab.index(src_tokens[i][j])
        if append_eos:
            x[i][j + 1] = vocab.eos()

    x = x.transpose(1, 0)
    return vocab, x, torch.LongTensor(src_len)
def __init__(self, args, dictionary: Dictionary, expansion_dictionary: Dictionary):
    super().__init__(dictionary)
    self.expansion_dictionary = expansion_dictionary
    self.padding_idx = dictionary.pad()
    self.vocab_size = len(dictionary)
    self.expansion_vocab_size = len(expansion_dictionary)
    self.max_positions = args.max_positions
    self.use_head_embeddings = getattr(args, 'head_embeddings', False)

    assert args.num_segment == 0, "Number of segments must be 0"
    assert not args.no_token_positional_embeddings, \
        "Positional embeddings must be enabled"
    assert not args.sent_loss, "Sentence loss must be disabled"

    self.sentence_encoder = TransformerSentenceEncoder(
        padding_idx=self.padding_idx,
        vocab_size=self.vocab_size,
        num_encoder_layers=args.encoder_layers,
        embedding_dim=args.encoder_embed_dim,
        ffn_embedding_dim=args.encoder_ffn_embed_dim,
        num_attention_heads=args.encoder_attention_heads,
        dropout=args.dropout,
        attention_dropout=args.attention_dropout,
        activation_dropout=args.act_dropout,
        max_seq_len=self.max_positions,
        num_segments=args.num_segment,
        use_position_embeddings=not args.no_token_positional_embeddings,
        encoder_normalize_before=args.encoder_normalize_before,
        apply_bert_init=args.apply_bert_init,
        activation_fn=args.activation_fn,
        learned_pos_embedding=args.encoder_learned_pos,
        add_bias_kv=args.bias_kv,
        add_zero_attn=args.zero_attn,
        use_head_embeddings=self.use_head_embeddings,
        condition_token_on_expansion=getattr(
            args, 'condition_token_on_expansion', False
        ),
        expansion_layer=getattr(args, 'expansion_layer', -1),
        expansion_vocab_size=self.expansion_vocab_size,
    )

    self.share_input_output_embed = args.share_encoder_input_output_embed
    self.embed_out = None

    self.lm_head_transform_weight = nn.Linear(
        args.encoder_embed_dim, args.encoder_embed_dim
    )
    self.activation_fn = utils.get_activation_fn(args.activation_fn)
    self.layer_norm = LayerNorm(args.encoder_embed_dim)

    self.token_output_learned_bias = nn.Parameter(torch.zeros(self.vocab_size))
    if not self.share_input_output_embed:
        self.embed_out = nn.Linear(
            args.encoder_embed_dim, self.vocab_size, bias=False
        )
@classmethod
def setup_task(cls, args, **kwargs):
    """Setup the task."""
    dictionary = Dictionary()
    for i in range(args.dict_size):
        dictionary.add_symbol('word{}'.format(i))
    logger.info('dictionary: {} types'.format(len(dictionary)))

    args.max_source_positions = args.src_len + dictionary.pad() + 2
    args.max_target_positions = args.tgt_len + dictionary.pad() + 2

    return cls(args, dictionary)
class DummyLMTask(FairseqTask):
    def __init__(self, cfg: DummyLMConfig):
        super().__init__(cfg)

        # load dictionary
        self.dictionary = Dictionary()
        for i in range(cfg.dict_size):
            self.dictionary.add_symbol("word{}".format(i))
        self.dictionary.pad_to_multiple_(8)  # often faster if divisible by 8
        logger.info("dictionary: {} types".format(len(self.dictionary)))

        seq = torch.arange(cfg.tokens_per_sample + 1) + self.dictionary.pad() + 1

        self.dummy_src = seq[:-1]
        self.dummy_tgt = seq[1:]

    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        if self.cfg.batch_size is not None:
            bsz = self.cfg.batch_size
        else:
            bsz = max(1, self.cfg.max_tokens // self.cfg.tokens_per_sample)
        self.datasets[split] = DummyDataset(
            {
                "id": 1,
                "net_input": {
                    "src_tokens": torch.stack([self.dummy_src for _ in range(bsz)]),
                    "src_lengths": torch.full(
                        (bsz,), self.cfg.tokens_per_sample, dtype=torch.long
                    ),
                },
                "target": torch.stack([self.dummy_tgt for _ in range(bsz)]),
                "nsentences": bsz,
                "ntokens": bsz * self.cfg.tokens_per_sample,
            },
            num_items=self.cfg.dataset_size,
            item_size=self.cfg.tokens_per_sample,
        )

    @property
    def source_dictionary(self):
        return self.dictionary

    @property
    def target_dictionary(self):
        return self.dictionary
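
# Hedged usage sketch (not part of the original file): it exercises only the
# config fields that DummyLMTask reads above (dict_size, tokens_per_sample,
# batch_size, dataset_size); the concrete values and the helper name
# _example_dummy_lm_usage are illustrative assumptions.
def _example_dummy_lm_usage():
    cfg = DummyLMConfig(
        dict_size=100, tokens_per_sample=16, batch_size=2, dataset_size=8
    )
    task = DummyLMTask(cfg)
    task.load_dataset("train")
    # The split now serves the same dummy batch of shape
    # (batch_size, tokens_per_sample) for every item.
    return task.datasets["train"]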
def _convert_src_tokens_to_tensor(
    self, vocab: Dictionary, src_tokens: List[List[str]], append_eos: bool
):
    src_len = [len(x) for x in src_tokens]
    # If we have to append EOS, we include EOS in counting src length
    if append_eos:
        src_len = [length + 1 for length in src_len]

    x = torch.LongTensor(len(src_tokens), max(src_len)).fill_(vocab.pad())
    for i in range(len(src_tokens)):
        for j in range(len(src_tokens[i])):
            x[i][j] = vocab.index(src_tokens[i][j])
        if append_eos:
            x[i][j + 1] = vocab.eos()

    x = x.transpose(1, 0)
    return x, torch.LongTensor(src_len)
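
# Hedged usage sketch (not part of the original helpers): inside the same test
# class, the conversion is typically driven as below; the symbols and the
# two-sentence batch are illustrative assumptions.
#
#     vocab = Dictionary()
#     for sym in ["he@@", "llo", "how", "are", "y@@", "ou"]:
#         vocab.add_symbol(sym)
#     x, src_lengths = self._convert_src_tokens_to_tensor(
#         vocab,
#         [["he@@", "llo"], ["how", "are", "y@@", "ou"]],
#         append_eos=True,
#     )
#     # x is (max(src_lengths), batch) after the transpose, padded with vocab.pad();
#     # src_lengths counts the appended EOS when append_eos is True.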