Example #1
    def __init__(self, *, config: Dict[str, Any]):
        """
        Initialize AttentionLSTMDecoder.

        :param config: Configuration of AttentionLSTMDecoder.
          Example:
          {
            "hidden_dimension": 300,
            "embedding_dimension": 300,
            "vocabulary_size": 100000,
            "encoder_output_dimension": 2048,
            "number_of_lstm_layers": 1,
            "unk_token_id": 0,
            "bos_token_id": 1,
            "eos_token_id": 2,
            "language": "en"
          }
        """
        super().__init__()
        self.config = config

        self.tokenizer = bpemb.BPEmb(lang=self.config["language"],
                                     vs=self.config["vocabulary_size"],
                                     dim=self.config["embedding_dimension"])
        self.embedding = torch.nn.Embedding.from_pretrained(
            torch.tensor(self.tokenizer.vectors))
        self.lstm = LSTM(
            input_size=self.config["embedding_dimension"],
            hidden_size=self.config["hidden_dimension"],
            num_layers=self.config["number_of_lstm_layers"],
            vocabulary_size=self.config["vocabulary_size"],
            context_vector_size=self.config["encoder_output_dimension"])
        self.hidden_initializer = torch.nn.Linear(
            in_features=self.config["encoder_output_dimension"],
            out_features=self.config["hidden_dimension"])
        self.cell_initializer = torch.nn.Linear(
            in_features=self.config["encoder_output_dimension"],
            out_features=self.config["hidden_dimension"])
        self.energy_function = torch.nn.Linear(
            in_features=(self.config["encoder_output_dimension"] +
                         self.config["hidden_dimension"]),
            out_features=1)
        self.style_embedding = torch.nn.Embedding(
            num_embeddings=3, embedding_dim=self.config["embedding_dimension"])

        self.initialize_parameters()
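
The constructor only takes the keyword-only config dictionary documented in the docstring. A minimal instantiation sketch under that assumption; the import path is hypothetical and not taken from the source:

from attention_lstm_decoder import AttentionLSTMDecoder  # hypothetical module path

config = {
    "hidden_dimension": 300,
    "embedding_dimension": 300,
    "vocabulary_size": 100000,
    "encoder_output_dimension": 2048,
    "number_of_lstm_layers": 1,
    "unk_token_id": 0,
    "bos_token_id": 1,
    "eos_token_id": 2,
    "language": "en",
}

decoder = AttentionLSTMDecoder(config=config)  # config must be passed by keyword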
Example #2
    @log_init
    def __init__(self,
                 hdf5_path: Path,
                 raw_data_group_names: Dict[str, str],
                 language: str = "en",
                 vocabulary_size: int = 100_000,
                 embedding_dimensionality: int = 300,
                 force_update: bool = False):
        self.logger = logging.getLogger(self.__class__.__name__)
        self.hdf5_path = hdf5_path
        self.raw_data_group_names = raw_data_group_names
        self.language = language
        self.vocabulary_size = vocabulary_size
        self.embedding_dimensionality = embedding_dimensionality
        self.tokenizer = bpemb.BPEmb(lang=self.language,
                                     vs=self.vocabulary_size,
                                     dim=self.embedding_dimensionality)
        self.force_update = force_update

    @log_run
    def run(self) -> None:
        if self.cache_exists() and not self.force_update:
            self.logger.info(
                "Cached version of tokenized data already exists. " +
                "Skipping tokenization.")
            return None

        for hdf5_group_name in self.raw_data_group_names.values():
            self.tokenize(
                input_hdf5_group=hdf5_group_name,
                input_hdf5_dataset="caption_cleaned",
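
The tokenizer built in __init__ is a plain bpemb.BPEmb model, which downloads pretrained byte-pair-encoding subword units and embeddings for the given language. A minimal standalone sketch of its encoding behaviour (the concrete subword pieces and IDs depend on the downloaded model, so they are printed rather than assumed):

import bpemb

# Same configuration as in the constructor above.
tokenizer = bpemb.BPEmb(lang="en", vs=100_000, dim=300)

text = "a dog runs across the field"
pieces = tokenizer.encode(text)    # subword strings such as '▁a', '▁dog', ...
ids = tokenizer.encode_ids(text)   # integer IDs into the 100k-subword vocabulary
print(pieces)
print(ids)
print(tokenizer.decode_ids(ids))   # decodes back to the original lowercase text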
Example #3
    def __init__(self, dim, vocab_size, lang='en'):
        self.bpemb = bpemb.BPEmb(lang=lang, dim=dim, vs=vocab_size)
        self.dim = dim
        # Pretrained BPEmb subword vectors as a torch tensor.
        self.vectors = torch.from_numpy(self.bpemb.vectors)
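
Examples #1 and #3 both load the pretrained BPEmb vectors into a torch embedding table; the sketch below shows the usual lookup pattern with such a table (the sentence and variable names are illustrative only):

import bpemb
import torch

bp = bpemb.BPEmb(lang="en", vs=100_000, dim=300)

# Frozen embedding layer built from the pretrained subword vectors,
# mirroring torch.nn.Embedding.from_pretrained in Example #1.
embedding = torch.nn.Embedding.from_pretrained(torch.from_numpy(bp.vectors))

ids = torch.tensor(bp.encode_ids("a dog runs across the field"))
embedded = embedding(ids)  # shape: (number_of_subwords, 300)
print(embedded.shape)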