def setUpClass(cls): """ Avoid redundant, time-consuming, equivalent setups when testing across the different methods, that can use common instantiations. """ feedforward_layer = PositionWiseFeedForward( token_representation_dimension=REPRESENTATION_DIMENSION, feedforward_dimension=FEEDFORWARD_DIMENSION, dropout_prob=DROPOUT_PROB) multi_head_attention_later = MultiHeadAttention( n_attention_heads=N_ATTENTION_HEADS, token_representation_dimension=REPRESENTATION_DIMENSION, dropout_prob=DROPOUT_PROB) cls.layer = EncoderBlock(building_blocks=EncoderBlockBuildingBlocks( self_multi_head_attention_layer=deepcopy( multi_head_attention_later), fully_connected_layer=feedforward_layer), feature_dimension=REPRESENTATION_DIMENSION, dropout_prob=DROPOUT_PROB) cls.forward_propagation_kwargs = { 'src_features': torch_rand(size=(MINI_BATCH_SIZE, MAX_SEQUENCE_LENGTH, REPRESENTATION_DIMENSION), dtype=torch_float), 'src_mask': torch_rand(size=(MINI_BATCH_SIZE, 1, MAX_SEQUENCE_LENGTH), dtype=torch_float) } cls.expected_output_shapes = [(MINI_BATCH_SIZE, MAX_SEQUENCE_LENGTH, REPRESENTATION_DIMENSION)] cls.expected_output_dtypes = [torch_float]
def setUpClass(cls): """ Avoid redundant, time-consuming, equivalent setups when testing across the different methods, that can use common instantiations. """ cls.layer = PositionWiseFeedForward( token_representation_dimension=REPRESENTATION_DIMENSION, feedforward_dimension=FEEDFORWARD_DIMENSION, dropout_prob=DROPOUT_PROB) cls.forward_propagation_kwargs = { 'features': torch_rand(size=(MINI_BATCH_SIZE, MAX_SEQUENCE_LENGTH, REPRESENTATION_DIMENSION), dtype=torch_float) } cls.expected_output_shapes = [(MINI_BATCH_SIZE, MAX_SEQUENCE_LENGTH, REPRESENTATION_DIMENSION)] cls.expected_output_dtypes = [torch_float]
def setUpClass(cls): """ Avoid redundant, time-consuming, equivalent setups when testing across the different methods, that can use common instantiations. """ positional_encoding_layer = PositionalEncoding( token_representation_dimension=REPRESENTATION_DIMENSION, dropout_prob=DROPOUT_PROB, max_sequence_length=MAX_SEQUENCE_LENGTH) src_embedder = Sequential( Embedder(token_representation_dimension=REPRESENTATION_DIMENSION, vocabulary_dimension=SRC_VOCABULARY_DIMENSION), deepcopy(positional_encoding_layer)) tgt_embedder = Sequential( Embedder(token_representation_dimension=REPRESENTATION_DIMENSION, vocabulary_dimension=TGT_VOCABULARY_DIMENSION), deepcopy(positional_encoding_layer)) feedforward_layer = PositionWiseFeedForward( token_representation_dimension=REPRESENTATION_DIMENSION, feedforward_dimension=FEEDFORWARD_DIMENSION, dropout_prob=DROPOUT_PROB) multi_head_attention_later = MultiHeadAttention( n_attention_heads=N_ATTENTION_HEADS, token_representation_dimension=REPRESENTATION_DIMENSION, dropout_prob=DROPOUT_PROB) encoder = Encoder(base_block=EncoderBlock( building_blocks=EncoderBlockBuildingBlocks( self_multi_head_attention_layer=deepcopy( multi_head_attention_later), fully_connected_layer=feedforward_layer), feature_dimension=REPRESENTATION_DIMENSION, dropout_prob=DROPOUT_PROB), n_clones=N_ENCODER_BLOCKS) decoder = Decoder(base_block=DecoderBlock( building_blocks=DecoderBlockBuildingBlocks( self_multi_head_attention_layer=deepcopy( multi_head_attention_later), source_multi_head_attention_layer=deepcopy( multi_head_attention_later), fully_connected_layer=feedforward_layer), feature_dimension=REPRESENTATION_DIMENSION, dropout_prob=DROPOUT_PROB), n_clones=N_DECODER_BLOCKS) log_softmax_layer = LogSoftmax( token_representation_dimension=REPRESENTATION_DIMENSION, vocabulary_dimension=TGT_VOCABULARY_DIMENSION) building_blocks = Seq2SeqBuildingBlocks( encoder=encoder, decoder=decoder, src_embedder=src_embedder, tgt_embedder=tgt_embedder, log_softmax_layer=log_softmax_layer) cls.layer = Seq2Seq(building_blocks=building_blocks) cls.forward_propagation_kwargs = { 'src_tokens': torch_randint(low=0, high=SRC_VOCABULARY_DIMENSION, size=(MINI_BATCH_SIZE, MAX_SEQUENCE_LENGTH), dtype=torch_long), 'tgt_tokens': torch_randint(low=0, high=TGT_VOCABULARY_DIMENSION, size=(MINI_BATCH_SIZE, MAX_SEQUENCE_LENGTH - 1), dtype=torch_long), 'src_mask': torch_rand(size=(MINI_BATCH_SIZE, 1, MAX_SEQUENCE_LENGTH), dtype=torch_float), 'tgt_mask': torch_rand(size=(MINI_BATCH_SIZE, MAX_SEQUENCE_LENGTH - 1, MAX_SEQUENCE_LENGTH - 1), dtype=torch_float) } cls.expected_output_shapes = [ (MINI_BATCH_SIZE, MAX_SEQUENCE_LENGTH - 1, REPRESENTATION_DIMENSION) ] cls.expected_output_dtypes = [torch_float]
def _build_model_architecture(self) -> None:
    """
    Initialize the Transformer model object with the architecture
    specified by the input hyperparameters and with newly initialized
    weights.
    """
    # building the architecture:

    # instantiating (some of) the base layers/blocks of the architecture:
    positional_encoding_layer = PositionalEncoding(
        token_representation_dimension=self.representation_dimension,
        dropout_prob=self.dropout_prob,
        max_sequence_length=self.max_sequence_length
    )
    multi_head_attention_layer = MultiHeadAttention(
        n_attention_heads=self.n_attention_heads,
        token_representation_dimension=self.representation_dimension,
        dropout_prob=self.dropout_prob
    )
    feedforward_layer = PositionWiseFeedForward(
        token_representation_dimension=self.representation_dimension,
        feedforward_dimension=self.feedforward_dimension,
        dropout_prob=self.dropout_prob
    )
    log_softmax_layer = LogSoftmax(
        token_representation_dimension=self.representation_dimension,
        vocabulary_dimension=self.tgt_vocabulary_dimension
    )

    # composing some of the base layers to build the more complex ones:
    src_embedder = Sequential(
        Embedder(
            token_representation_dimension=self.representation_dimension,
            vocabulary_dimension=self.src_vocabulary_dimension
        ),
        deepcopy(positional_encoding_layer)
    )
    tgt_embedder = Sequential(
        Embedder(
            token_representation_dimension=self.representation_dimension,
            vocabulary_dimension=self.tgt_vocabulary_dimension
        ),
        deepcopy(positional_encoding_layer)
    )
    base_encoder_block = EncoderBlock(
        building_blocks=EncoderBlockBuildingBlocks(
            self_multi_head_attention_layer=deepcopy(
                multi_head_attention_layer),
            fully_connected_layer=deepcopy(feedforward_layer)
        ),
        feature_dimension=self.representation_dimension,
        dropout_prob=self.dropout_prob
    )
    encoder = Encoder(
        base_block=base_encoder_block,
        n_clones=self.n_encoder_blocks
    )
    base_decoder_block = DecoderBlock(
        building_blocks=DecoderBlockBuildingBlocks(
            self_multi_head_attention_layer=deepcopy(
                multi_head_attention_layer),
            source_multi_head_attention_layer=deepcopy(
                multi_head_attention_layer),
            fully_connected_layer=deepcopy(feedforward_layer)
        ),
        feature_dimension=self.representation_dimension,
        dropout_prob=self.dropout_prob
    )
    decoder = Decoder(
        base_block=base_decoder_block,
        n_clones=self.n_decoder_blocks
    )

    # instantiating the whole seq2seq encoder-decoder model:
    building_blocks = Seq2SeqBuildingBlocks(
        encoder=encoder,
        decoder=decoder,
        src_embedder=src_embedder,
        tgt_embedder=tgt_embedder,
        log_softmax_layer=log_softmax_layer
    )
    model = Seq2Seq(building_blocks=building_blocks)

    # initializing the parameters:
    for parameter in model.parameters():
        # only matrix-shaped parameters (e.g. linear and embedding weight
        # matrices) are re-initialized with Xavier (Glorot) uniform
        # initialization, which scales the sampling range based on the
        # layer's fan-in and fan-out and thus requires at least 2
        # dimensions; 1-D parameters such as biases and layer-normalization
        # gains keep their default initialization:
        if parameter.dim() > 1:
            xavier_uniform_(parameter)

    self.model = model
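# Illustrative check (not part of the class above) of which parameters the
# ``dim() > 1`` filter selects: on a hypothetical toy block made of a linear
# layer and a layer normalization, only the 2-D linear weight matrix gets
# Xavier-uniform re-initialization, while the 1-D linear bias and layer-norm
# gain/offset keep their defaults.
from torch import nn
from torch.nn.init import xavier_uniform_

if __name__ == '__main__':
    toy_block = nn.Sequential(nn.Linear(512, 2048), nn.LayerNorm(2048))
    for name, parameter in toy_block.named_parameters():
        if parameter.dim() > 1:
            xavier_uniform_(parameter)
            print('Xavier-initialized:', name, tuple(parameter.shape))
        else:
            print('left as default:   ', name, tuple(parameter.shape))
    # expected: only '0.weight' with shape (2048, 512) is Xavier-initialized;
    # '0.bias', '1.weight' and '1.bias' (all 1-D) are left as default.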