def __init__(self, vocab: Vocabulary):
    super().__init__()
    # Wrap the training-time vocabulary in a TorchScript-compatible
    # ScriptVocabulary; each special index falls back to -1 when the
    # corresponding token is absent from the vocab.
    self.vocab = ScriptVocabulary(
        list(vocab),
        pad_idx=vocab.get_pad_index(-1),
        bos_idx=vocab.get_bos_index(-1),
        eos_idx=vocab.get_eos_index(-1),
        unk_idx=vocab.get_unk_index(-1),
    )
def __init__(self, tokenizer: Tokenizer, vocab: Vocabulary, max_seq_len: int):
    super().__init__()
    self.tokenizer = tokenizer
    # TorchScript-compatible vocabulary; BOS/EOS fall back to -1 when those
    # tokens are not present, while pad/unk use the vocab's defaults.
    self.vocab = ScriptVocabulary(
        list(vocab),
        pad_idx=vocab.get_pad_index(),
        bos_idx=vocab.get_bos_index(-1),
        eos_idx=vocab.get_eos_index(-1),
        unk_idx=vocab.get_unk_index(),
    )
    self.vocab_lookup = VocabLookup(self.vocab)
    self.max_seq_len = max_seq_len
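# --- Illustrative sketch (not part of the original class) ---
# A hedged guess at the numberize step this transform presumably delegates to
# VocabLookup: truncate to max_seq_len, map tokens to ids with an unk fallback,
# and add BOS/EOS. `numberize` and its truncation policy are assumptions made
# only for illustration.
from typing import Dict, List

def numberize(
    tokens: List[str],
    token_to_idx: Dict[str, int],
    unk_idx: int,
    bos_idx: int,
    eos_idx: int,
    max_seq_len: int,
) -> List[int]:
    body = tokens[: max_seq_len - 2]  # reserve room for BOS/EOS
    ids = [token_to_idx.get(t, unk_idx) for t in body]
    return [bos_idx] + ids + [eos_idx]

# Example: BOS, "hello", UNK for the out-of-vocab "world", EOS
print(numberize(["hello", "world"], {"hello": 4}, unk_idx=1, bos_idx=2, eos_idx=3, max_seq_len=10))
# -> [2, 4, 1, 3]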
def __init__(
    self,
    pretrained_embeddings_path: str,
    vocab: Vocabulary,
    embedding_dim: int,
    mlp_layer_dims: Optional[Sequence[int]] = None,
    lowercase_tokens: bool = False,
    skip_header: bool = True,
    delimiter: str = " ",
) -> None:
    super().__init__()
    # Load pretrained word vectors and build an embedding weight matrix
    # aligned with the given vocabulary; words missing from the pretrained
    # file are initialized randomly.
    pretrained_embedding = PretrainedEmbedding(
        pretrained_embeddings_path,
        lowercase_tokens=lowercase_tokens,
        skip_header=skip_header,
        delimiter=delimiter,
    )
    embeddings_weight = pretrained_embedding.initialize_embeddings_weights(
        vocab.idx,
        vocab.unk_token,
        embedding_dim,
        EmbedInitStrategy.RANDOM,
    )
    num_embeddings = len(vocab.idx)
    self.embedding = nn.Embedding(
        num_embeddings,
        embedding_dim,
        _weight=embeddings_weight,
        padding_idx=vocab.get_pad_index(),
    )
    # Initialize the UNK embedding with zeros to guard the model against
    # randomized decisions based on unknown words.
    unk_token_idx = vocab.get_unk_index()
    if unk_token_idx >= 0:
        self.embedding.weight.data[unk_token_idx].fill_(0.0)
    # Build the optional MLP projection on top of the embeddings.
    if mlp_layer_dims is None:
        mlp_layer_dims = []
    self.mlp = nn.Sequential(
        *(
            nn.Sequential(nn.Linear(m, n), nn.ReLU())
            for m, n in zip([embedding_dim] + list(mlp_layer_dims), mlp_layer_dims)
        )
    )
    self.output_dim = mlp_layer_dims[-1] if mlp_layer_dims else embedding_dim
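# --- Illustrative sketch (not part of the original module) ---
# A minimal, self-contained example of the MLP-construction pattern used
# above; embedding_dim=300 and mlp_layer_dims=[128, 64] are assumed values
# chosen only for illustration.
import torch
import torch.nn as nn

embedding_dim = 300
mlp_layer_dims = [128, 64]
mlp = nn.Sequential(
    *(
        nn.Sequential(nn.Linear(m, n), nn.ReLU())
        for m, n in zip([embedding_dim] + mlp_layer_dims, mlp_layer_dims)
    )
)
# Resulting projection: 300 -> 128 -> 64; with an empty mlp_layer_dims the
# output dimension stays at embedding_dim.
output_dim = mlp_layer_dims[-1] if mlp_layer_dims else embedding_dim
embedded = torch.randn(2, 5, embedding_dim)   # (batch, seq_len, embedding_dim)
projected = mlp(embedded)                     # shape: (2, 5, 64)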