def __init__(self, *, config: Dict[str, Any]):
    """
    Initialize AttentionLSTMDecoder.

    :param config: Configuration of AttentionLSTMDecoder. Example:
        {
            "hidden_dimension": 300,
            "embedding_dimension": 300,
            "vocabulary_size": 100000,
            "encoder_output_dimension": 2048,
            "number_of_lstm_layers": 1,
            "unk_token_id": 0,
            "bos_token_id": 1,
            "eos_token_id": 2,
            "language": "en"
        }
    """
    super().__init__()
    self.config = config
    # BPEmb supplies both the subword tokenizer and pretrained embedding
    # vectors, so tokenizer and embedding share one vocabulary.
    self.tokenizer = bpemb.BPEmb(lang=self.config["language"],
                                 vs=self.config["vocabulary_size"],
                                 dim=self.config["embedding_dimension"])
    self.embedding = torch.nn.Embedding.from_pretrained(
        torch.tensor(self.tokenizer.vectors))
    self.lstm = LSTM(
        input_size=self.config["embedding_dimension"],
        hidden_size=self.config["hidden_dimension"],
        num_layers=self.config["number_of_lstm_layers"],
        vocabulary_size=self.config["vocabulary_size"],
        context_vector_size=self.config["encoder_output_dimension"])
    # Linear projections that derive the LSTM's initial hidden and cell
    # states from the encoder output.
    self.hidden_initializer = torch.nn.Linear(
        in_features=self.config["encoder_output_dimension"],
        out_features=self.config["hidden_dimension"])
    self.cell_initializer = torch.nn.Linear(
        in_features=self.config["encoder_output_dimension"],
        out_features=self.config["hidden_dimension"])
    # Attention energy: scores each encoder location from the concatenation
    # of its feature vector and the current decoder hidden state.
    self.energy_function = torch.nn.Linear(
        in_features=(self.config["encoder_output_dimension"]
                     + self.config["hidden_dimension"]),
        out_features=1)
    self.style_embedding = torch.nn.Embedding(
        num_embeddings=3,
        embedding_dim=self.config["embedding_dimension"])
    self.initialize_parameters()
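# A minimal instantiation sketch, assuming AttentionLSTMDecoder (whose
# constructor is shown above) is importable; the module path below is
# hypothetical, and the first BPEmb call downloads the pretrained English
# subword model.
from model.decoder import AttentionLSTMDecoder  # hypothetical path

# Mirrors the example configuration from the docstring above.
config = {
    "hidden_dimension": 300,
    "embedding_dimension": 300,
    "vocabulary_size": 100000,
    "encoder_output_dimension": 2048,
    "number_of_lstm_layers": 1,
    "unk_token_id": 0,
    "bos_token_id": 1,
    "eos_token_id": 2,
    "language": "en",
}
decoder = AttentionLSTMDecoder(config=config)

# The embedding matrix comes from BPEmb, so its shape follows from the
# vocabulary size and embedding dimension in the configuration.
print(decoder.embedding.weight.shape)  # expected: torch.Size([100000, 300])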
@log_init
def __init__(self,
             hdf5_path: Path,
             raw_data_group_names: Dict[str, str],
             language: str = "en",
             vocabulary_size: int = 100_000,
             embedding_dimensionality: int = 300,
             force_update: bool = False):
    self.logger = logging.getLogger(self.__class__.__name__)
    self.hdf5_path = hdf5_path
    self.raw_data_group_names = raw_data_group_names
    self.language = language
    self.vocabulary_size = vocabulary_size
    self.embedding_dimensionality = embedding_dimensionality
    # The same BPEmb model as the decoder, so the token ids assigned during
    # preprocessing match the ids the decoder embeds.
    self.tokenizer = bpemb.BPEmb(lang=self.language,
                                 vs=self.vocabulary_size,
                                 dim=self.embedding_dimensionality)
    self.force_update = force_update

@log_run
def run(self) -> None:
    # Tokenization is idempotent: skip the work when a cached result is
    # present and no refresh was requested.
    if self.cache_exists() and not self.force_update:
        self.logger.info(
            "Cached version of tokenized data already exists. "
            + "Skipping tokenization.")
        return None
    for hdf5_group_name in self.raw_data_group_names.values():
        self.tokenize(
            input_hdf5_group=hdf5_group_name,
            input_hdf5_dataset="caption_cleaned",
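# A usage sketch for this pipeline step. The listing above shows only two
# methods; the enclosing class name ("TokenizationStep"), the module path,
# and the HDF5 layout below are assumptions for illustration.
import logging
from pathlib import Path

from pipeline.tokenization import TokenizationStep  # hypothetical path

logging.basicConfig(level=logging.INFO)

step = TokenizationStep(
    hdf5_path=Path("data/dataset.hdf5"),  # assumed file location
    raw_data_group_names={"train": "train_raw",  # assumed group names
                          "validation": "validation_raw"},
    force_update=False,  # reuse the cached tokenization when available
)
step.run()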
def __init__(self, dim, vocab_size, lang='en'):
    self.bpemb = bpemb.BPEmb(lang=lang, dim=dim, vs=vocab_size)
    self.dim = dim
    # Expose the pretrained BPEmb vectors as a torch tensor for direct
    # index lookups.
    self.vectors = torch.from_numpy(self.bpemb.vectors)
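# A brief usage sketch. The constructor above is shown without its class,
# so "Embedder" is an illustrative name; everything else uses the public
# bpemb API (encode_ids) and plain tensor indexing.
import bpemb
import torch


class Embedder:
    def __init__(self, dim, vocab_size, lang='en'):
        self.bpemb = bpemb.BPEmb(lang=lang, dim=dim, vs=vocab_size)
        self.dim = dim
        self.vectors = torch.from_numpy(self.bpemb.vectors)


embedder = Embedder(dim=300, vocab_size=100_000, lang="en")

# BPEmb segments the text into subwords and maps them to vocabulary ids.
ids = embedder.bpemb.encode_ids("An example caption.")

# Indexing the pretrained matrix yields one vector per subword.
embedded = embedder.vectors[torch.tensor(ids)]
print(embedded.shape)  # expected: torch.Size([num_subwords, 300])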