def __init__(self,
             input_dim: int,
             hidden_dim: int,
             projection_dim: int,
             feedforward_hidden_dim: int,
             num_layers: int,
             num_attention_heads: int,
             use_positional_encoding: bool = True,
             dropout_prob: float = 0.1,
             residual_dropout_prob: float = 0.2,
             attention_dropout_prob: float = 0.1) -> None:
    super(StackedSelfAttentionEncoder, self).__init__()

    self._use_positional_encoding = use_positional_encoding
    self._attention_layers: List[MultiHeadSelfAttention] = []
    self._feedfoward_layers: List[FeedForward] = []
    self._layer_norm_layers: List[LayerNorm] = []
    self._feed_forward_layer_norm_layers: List[LayerNorm] = []

    feedfoward_input_dim = input_dim
    for i in range(num_layers):
        feedfoward = FeedForward(feedfoward_input_dim,
                                 activations=[Activation.by_name('relu')(),
                                              Activation.by_name('linear')()],
                                 hidden_dims=[feedforward_hidden_dim, hidden_dim],
                                 num_layers=2,
                                 dropout=dropout_prob)

        # Note: Please use `ModuleList` in new code. It provides better
        # support for running on multiple GPUs. We've kept `add_module` here
        # solely for backwards compatibility with existing serialized models.
        self.add_module(f"feedforward_{i}", feedfoward)
        self._feedfoward_layers.append(feedfoward)

        feedforward_layer_norm = LayerNorm(feedfoward.get_output_dim())
        self.add_module(f"feedforward_layer_norm_{i}", feedforward_layer_norm)
        self._feed_forward_layer_norm_layers.append(feedforward_layer_norm)

        self_attention = MultiHeadSelfAttention(num_heads=num_attention_heads,
                                                input_dim=hidden_dim,
                                                attention_dim=projection_dim,
                                                values_dim=projection_dim,
                                                attention_dropout_prob=attention_dropout_prob)
        self.add_module(f"self_attention_{i}", self_attention)
        self._attention_layers.append(self_attention)

        layer_norm = LayerNorm(self_attention.get_output_dim())
        self.add_module(f"layer_norm_{i}", layer_norm)
        self._layer_norm_layers.append(layer_norm)

        feedfoward_input_dim = hidden_dim

    self.dropout = Dropout(residual_dropout_prob)
    self._input_dim = input_dim
    self._output_dim = self._attention_layers[-1].get_output_dim()
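# --- Usage sketch (not from the original source) ----------------------------
# A minimal, hedged example of driving the encoder built by the constructor
# above. It assumes an AllenNLP 0.x-era install where StackedSelfAttentionEncoder
# is importable from allennlp.modules.seq2seq_encoders; the dimensions and the
# variable names (`encoder`, `token_reprs`, `mask`) are illustrative only.
import torch
from allennlp.modules.seq2seq_encoders import StackedSelfAttentionEncoder

encoder = StackedSelfAttentionEncoder(input_dim=64,
                                      hidden_dim=64,
                                      projection_dim=32,
                                      feedforward_hidden_dim=128,
                                      num_layers=2,
                                      num_attention_heads=4)
token_reprs = torch.randn(3, 10, 64)   # (batch_size, num_tokens, input_dim)
mask = torch.ones(3, 10)               # 1 for real tokens, 0 for padding
encoded = encoder(token_reprs, mask)   # (batch_size, num_tokens, hidden_dim)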
def __init__(
    self,
    input_dim: int,
    hidden_dim: int,
    attention_projection_dim: int,
    feedforward_hidden_dim: int,
    num_convs: int,
    conv_kernel_size: int,
    num_attention_heads: int,
    use_positional_encoding: bool = True,
    dropout_prob: float = 0.1,
    layer_dropout_undecayed_prob: float = 0.1,
    attention_dropout_prob: float = 0,
) -> None:
    super().__init__()

    check_dimensions_match(input_dim, hidden_dim, "input_dim", "hidden_dim")
    self._use_positional_encoding = use_positional_encoding

    self._conv_norm_layers = torch.nn.ModuleList(
        [LayerNorm(hidden_dim) for _ in range(num_convs)]
    )
    self._conv_layers = torch.nn.ModuleList()
    for _ in range(num_convs):
        padding = torch.nn.ConstantPad1d(
            (conv_kernel_size // 2, (conv_kernel_size - 1) // 2), 0
        )
        depthwise_conv = torch.nn.Conv1d(
            hidden_dim, hidden_dim, conv_kernel_size, groups=hidden_dim
        )
        pointwise_conv = torch.nn.Conv1d(hidden_dim, hidden_dim, 1)
        self._conv_layers.append(
            torch.nn.Sequential(
                padding, depthwise_conv, pointwise_conv, Activation.by_name("relu")()
            )
        )

    self.attention_norm_layer = LayerNorm(hidden_dim)
    self.attention_layer = MultiHeadSelfAttention(
        num_heads=num_attention_heads,
        input_dim=hidden_dim,
        attention_dim=attention_projection_dim,
        values_dim=attention_projection_dim,
        attention_dropout_prob=attention_dropout_prob,
    )
    self.feedforward_norm_layer = LayerNorm(hidden_dim)
    self.feedforward = FeedForward(
        hidden_dim,
        activations=[Activation.by_name("relu")(), Activation.by_name("linear")()],
        hidden_dims=[feedforward_hidden_dim, hidden_dim],
        num_layers=2,
        dropout=dropout_prob,
    )
    self.dropout = Dropout(dropout_prob)
    self.residual_with_layer_dropout = ResidualWithLayerDropout(layer_dropout_undecayed_prob)

    self._input_dim = input_dim
    self._output_dim = hidden_dim
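# --- Convolution sub-block sketch (not from the original source) ------------
# A standalone illustration of the depthwise-separable convolution assembled in
# the loop above, using plain torch.nn only. hidden_dim=8, conv_kernel_size=5,
# and the tensor shapes are illustrative assumptions; note that Conv1d expects
# (batch_size, channels, num_tokens), so a (batch, tokens, hidden_dim) sequence
# must be transposed before entering the block and transposed back afterwards.
import torch

hidden_dim, conv_kernel_size = 8, 5
conv_block = torch.nn.Sequential(
    # Pad so the output length matches the input length for any kernel size.
    torch.nn.ConstantPad1d((conv_kernel_size // 2, (conv_kernel_size - 1) // 2), 0),
    # Depthwise: one filter per channel (groups=hidden_dim), mixing only along time.
    torch.nn.Conv1d(hidden_dim, hidden_dim, conv_kernel_size, groups=hidden_dim),
    # Pointwise: 1x1 convolution that mixes information across channels.
    torch.nn.Conv1d(hidden_dim, hidden_dim, 1),
    torch.nn.ReLU(),
)

sequence = torch.randn(3, 10, hidden_dim)         # (batch, tokens, hidden_dim)
convolved = conv_block(sequence.transpose(1, 2))  # (batch, hidden_dim, tokens)
convolved = convolved.transpose(1, 2)             # back to (batch, tokens, hidden_dim)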
def __init__(self, input_dim: int, dropout_prob: float = 0.0) -> None:
    super(MultiHeadAttentionEncoder, self).__init__()
    self._self_attention = MultiHeadSelfAttention(1, input_dim, input_dim, input_dim, 1)
    self._dropout = torch.nn.Dropout(dropout_prob)
    self._output_dim = input_dim
    self.input_dim = input_dim
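# --- MultiHeadSelfAttention call sketch (not from the original source) ------
# A hedged sketch of calling the wrapped attention layer directly, assuming the
# AllenNLP 0.x module path below and an input_dim of 16; the keyword form makes
# the projection sizes explicit instead of relying on positional arguments as in
# the constructor above.
import torch
from allennlp.modules.seq2seq_encoders.multi_head_self_attention import MultiHeadSelfAttention

attention = MultiHeadSelfAttention(num_heads=1,
                                   input_dim=16,
                                   attention_dim=16,
                                   values_dim=16,
                                   attention_dropout_prob=0.1)
inputs = torch.randn(4, 12, 16)       # (batch_size, num_tokens, input_dim)
mask = torch.ones(4, 12)              # 1 for real tokens, 0 for padding
attended = attention(inputs, mask)    # (batch_size, num_tokens, input_dim)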
def __init__(self, input_field_name: str, output_dim: int, embeddings: Embeddings,
             dropout: float, use_cuda: bool):
    super(LstmTokenEmbedder, self).__init__(input_field_name)
    self.embeddings = embeddings
    self.output_dim = output_dim
    self.use_cuda = use_cuda
    self.encoder_ = torch.nn.LSTM(embeddings.get_embed_dim(),
                                  embeddings.get_embed_dim(),
                                  num_layers=1,
                                  bidirectional=False,
                                  batch_first=True,
                                  dropout=dropout)
    self.attention = MultiHeadSelfAttention(num_heads=1,
                                            input_dim=embeddings.get_embed_dim(),
                                            attention_dim=embeddings.get_embed_dim(),
                                            values_dim=embeddings.get_embed_dim(),
                                            attention_dropout_prob=dropout)
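# --- LSTM + self-attention flow sketch (not from the original source) -------
# A hedged sketch of the composition set up above: run padded token embeddings
# through a unidirectional LSTM and then let a single-head self-attention layer
# re-weight the LSTM states. It assumes embed_dim=32, plain torch tensors in
# place of the project's Embeddings class, and the AllenNLP 0.x import path below.
import torch
from allennlp.modules.seq2seq_encoders.multi_head_self_attention import MultiHeadSelfAttention

embed_dim = 32
lstm = torch.nn.LSTM(embed_dim, embed_dim, num_layers=1,
                     bidirectional=False, batch_first=True)
attention = MultiHeadSelfAttention(num_heads=1,
                                   input_dim=embed_dim,
                                   attention_dim=embed_dim,
                                   values_dim=embed_dim,
                                   attention_dropout_prob=0.1)

embedded_tokens = torch.randn(2, 7, embed_dim)   # (batch_size, num_tokens, embed_dim)
mask = torch.ones(2, 7)                          # 1 for real tokens, 0 for padding
lstm_states, _ = lstm(embedded_tokens)           # (batch_size, num_tokens, embed_dim)
contextualized = attention(lstm_states, mask)    # (batch_size, num_tokens, embed_dim)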
def from_params(
    cls, vocab: Vocabulary, params: Params
) -> 'DialogueContextHierarchicalCoherenceAttentionClassifier':
    embedder_params = params.pop("text_field_embedder")
    text_field_embedder = TextFieldEmbedder.from_params(vocab, embedder_params)
    utterance_encoder = Seq2VecEncoder.from_params(params.pop("utterance_encoder"))
    attend_feedforward = FeedForward.from_params(params.pop('attend_feedforward'))
    # similarity_function = SimilarityFunction.from_params(params.pop("similarity_function"))
    compare_feedforward = FeedForward.from_params(params.pop('compare_feedforward'))
    classifier_feedforward = FeedForward.from_params(params.pop("classifier_feedforward"))
    final_classifier_feedforward = FeedForward.from_params(params.pop("final_classifier_feedforward"))
    initializer = InitializerApplicator.from_params(params.pop("initializer", []))
    regularizer = RegularizerApplicator.from_params(params.pop("regularizer", []))
    # matrix_attention = MatrixAttention().from_params(params.pop("similarity_function"))
    matrix_attention = MultiHeadSelfAttention.from_params(params.pop("similarity_function"))
    return cls(vocab=vocab,
               text_field_embedder=text_field_embedder,
               attend_feedforward=attend_feedforward,
               matrix_attention=matrix_attention,
               compare_feedforward=compare_feedforward,
               classifier_feedforward=classifier_feedforward,
               final_classifier_feedforward=final_classifier_feedforward,
               utterance_encoder=utterance_encoder,
               initializer=initializer,
               regularizer=regularizer)
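# --- Params skeleton sketch (not from the original source) ------------------
# A hedged sketch of the top-level structure that this from_params pops, written
# as a plain Python dict. Only the key names come from the method above; every
# nested value is a placeholder that would have to be replaced by a full AllenNLP
# sub-configuration (embedder, encoder, feedforward, etc.) before wrapping it in
# Params(config) and handing it to from_params.
config = {
    "text_field_embedder": {},           # -> TextFieldEmbedder.from_params
    "utterance_encoder": {},             # -> Seq2VecEncoder.from_params
    "attend_feedforward": {},            # -> FeedForward.from_params
    "compare_feedforward": {},           # -> FeedForward.from_params
    "classifier_feedforward": {},        # -> FeedForward.from_params
    "final_classifier_feedforward": {},  # -> FeedForward.from_params
    "similarity_function": {},           # -> MultiHeadSelfAttention.from_params
    "initializer": [],                   # optional; defaults to []
    "regularizer": [],                   # optional; defaults to []
}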