Example #1
    def __init__(
        self,
        dictionary,
        char_embed_dim=32,
        word_embed_dim=512,
        convolutions=((128, 3), (128, 5)),
        dropout=0.1,
        num_highway_layers=0,
        preserve_word=True,
    ):
        super().__init__()
        self.dictionary = dictionary
        vocab_size = len(self.dictionary)
        self.embed_char_tokens = nn.Embedding(vocab_size, char_embed_dim)
        in_channels = convolutions[0][0]
        self.dropout = dropout
        self.convolutions = nn.ModuleList([
            ConvTBC(in_channels, out_channels * 2, kernel_size)
            for (out_channels, kernel_size) in convolutions
        ])

        self.fc_input = common_layers.Linear(char_embed_dim, in_channels)
        conv_output_dim = sum(out_dim for (out_dim, _) in convolutions)
        self.fc_output = common_layers.Linear(conv_output_dim, word_embed_dim)

        # build each highway layer as a separate instance; multiplying a
        # one-element list would reuse a single module and share its parameters
        self.highway_layers = nn.ModuleList(
            [HighwayLayer(conv_output_dim) for _ in range(num_highway_layers)]
        )

        self.preserve_word = preserve_word
Example #2
    def __init__(self,
                 dictionary,
                 embed_dim=512,
                 max_positions=1024,
                 convolutions=((512, 3), ) * 20,
                 dropout=0.1):
        super().__init__(dictionary)
        self.dropout = dropout
        self.num_attention_layers = None

        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        self.embed_positions = PositionalEmbedding(
            max_positions,
            embed_dim,
            padding_idx,
            left_pad=LanguagePairDataset.LEFT_PAD_SOURCE,
        )

        in_channels = convolutions[0][0]
        self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
        self.projections = nn.ModuleList()
        self.convolutions = nn.ModuleList()
        for (out_channels, kernel_size) in convolutions:
            self.projections.append(
                Linear(in_channels, out_channels
                       ) if in_channels != out_channels else None)
            self.convolutions.append(
                ConvTBC(in_channels,
                        out_channels * 2,
                        kernel_size,
                        dropout=dropout))
            in_channels = out_channels
        self.fc2 = Linear(in_channels, embed_dim)
Example #3
    def __init__(self,
                 num_embeddings,
                 embed_dim=512,
                 max_positions=1024,
                 convolutions=((512, 3), ) * 20,
                 dropout=0.1,
                 padding_idx=1):
        super(Encoder, self).__init__()
        self.dropout = dropout
        self.num_attention_layers = None
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        self.embed_positions = Embedding(max_positions, embed_dim, padding_idx)

        in_channels = convolutions[0][0]
        self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
        self.projections = nn.ModuleList()
        self.convolutions = nn.ModuleList()
        for (out_channels, kernel_size) in convolutions:
            pad = (kernel_size - 1) // 2
            self.projections.append(
                Linear(in_channels, out_channels
                       ) if in_channels != out_channels else None)
            self.convolutions.append(
                ConvTBC(in_channels,
                        out_channels * 2,
                        kernel_size,
                        padding=pad,
                        dropout=dropout))
            in_channels = out_channels
        self.fc2 = Linear(in_channels, embed_dim)
Example #4
    def test_convtbc(self):
        # ksz, in_channels, out_channels
        conv_tbc = ConvTBC(4, 5, kernel_size=3, padding=1)
        # out_channels, in_channels, ksz
        conv1d = nn.Conv1d(4, 5, kernel_size=3, padding=1)

        conv_tbc.weight.data.copy_(conv1d.weight.data.transpose(0, 2))
        conv_tbc.bias.data.copy_(conv1d.bias.data)

        input_tbc = torch.randn(7, 2, 4, requires_grad=True)
        input1d = input_tbc.data.transpose(0, 1).transpose(1, 2)
        input1d.requires_grad = True

        output_tbc = conv_tbc(input_tbc)
        output1d = conv1d(input1d)

        self.assertAlmostEqual(
            output_tbc.data.transpose(0, 1).transpose(1, 2), output1d.data)

        grad_tbc = torch.randn(output_tbc.size())
        grad1d = grad_tbc.transpose(0, 1).transpose(1, 2).contiguous()

        output_tbc.backward(grad_tbc)
        output1d.backward(grad1d)

        self.assertAlmostEqual(conv_tbc.weight.grad.data.transpose(0, 2),
                               conv1d.weight.grad.data)
        self.assertAlmostEqual(conv_tbc.bias.grad.data, conv1d.bias.grad.data)
        self.assertAlmostEqual(
            input_tbc.grad.data.transpose(0, 1).transpose(1, 2),
            input1d.grad.data)
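
The test above relies on the layout mapping between nn.Conv1d's (batch, channels, time) tensors and ConvTBC's (time, batch, channels) tensors. Below is a minimal sketch of that conversion in plain PyTorch; the helper names are illustrative and not part of fairseq.

import torch

def bct_to_tbc(x):
    # (batch, channels, time) -> (time, batch, channels)
    return x.permute(2, 0, 1).contiguous()

def tbc_to_bct(x):
    # (time, batch, channels) -> (batch, channels, time)
    return x.permute(1, 2, 0).contiguous()

x = torch.randn(2, 4, 7)  # batch=2, channels=4, time=7
assert bct_to_tbc(x).shape == (7, 2, 4)
assert torch.equal(tbc_to_bct(bct_to_tbc(x)), x)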
Example #5
    def __init__(
        self,
        dictionary,
        embed_dim=512,
        embed_dict=None,
        max_positions=1024,
        convolutions=((512, 3), ) * 20,
        dropout=0.1,
    ):
        super().__init__(dictionary)
        self.dropout = dropout
        self.num_attention_layers = None

        num_embeddings = len(dictionary)
        self.padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim,
                                      self.padding_idx)
        if embed_dict:
            self.embed_tokens = utils.load_embedding(embed_dict,
                                                     self.dictionary,
                                                     self.embed_tokens)

        self.embed_positions = PositionalEmbedding(
            max_positions,
            embed_dim,
            self.padding_idx,
        )

        convolutions = extend_conv_spec(convolutions)
        in_channels = convolutions[0][0]
        self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
        self.projections = nn.ModuleList()
        self.convolutions = nn.ModuleList()
        self.residuals = []

        layer_in_channels = [in_channels]
        for _, (out_channels, kernel_size,
                residual) in enumerate(convolutions):
            if residual == 0:
                residual_dim = out_channels
            else:
                residual_dim = layer_in_channels[-residual]
            self.projections.append(
                Linear(residual_dim, out_channels
                       ) if residual_dim != out_channels else None)
            if kernel_size % 2 == 1:
                padding = kernel_size // 2
            else:
                padding = 0
            self.convolutions.append(
                ConvTBC(in_channels,
                        out_channels * 2,
                        kernel_size,
                        dropout=dropout,
                        padding=padding))
            self.residuals.append(residual)
            in_channels = out_channels
            layer_in_channels.append(out_channels)
        self.fc2 = Linear(in_channels, embed_dim)
Example #6
def ConvTBC(in_channels, out_channels, kernel_size, dropout=0, **kwargs):
    """Weight-normalized Conv1d layer"""
    from fairseq.modules import ConvTBC
    m = ConvTBC(in_channels, out_channels, kernel_size, **kwargs)
    std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels))
    nn.init.normal_(m.weight, mean=0, std=std)
    nn.init.constant_(m.bias, 0)
    return nn.utils.weight_norm(m, dim=2)
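
A minimal usage sketch for a factory like the one above, assuming fairseq is installed. The input follows the (time, batch, channels) convention exercised by the test in Example #4; the concrete sizes are made up for illustration.

import torch

conv = ConvTBC(32, 64 * 2, kernel_size=3, dropout=0.1, padding=1)
x = torch.randn(7, 2, 32)  # (time, batch, channels)
y = conv(x)                # -> (7, 2, 128); fconv-style models follow with a GLU that halves this to 64
print(y.shape)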
Example #7
def ConvTBC(in_channels, out_channels, kernel_size, dropout=0, **kwargs):
    """Weight-normalized Conv1d layer"""
    from fairseq.modules import ConvTBC
    m = ConvTBC(in_channels, out_channels, kernel_size, **kwargs)
    std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels))
    m.weight.data.normal_(mean=0, std=std)
    m.bias.data.zero_()
    return m
Example #8
    def __init__(
        self,
        dictionary,
        embed_dim=512,
        max_positions=1024,
        convolutions=((512, 3), ) * 20,
        dropout=0.1,
        attention=False,
        attention_nheads=1,
        left_pad=True,
    ):
        super().__init__(dictionary)
        self.dropout = dropout
        self.num_attention_layers = None
        self.left_pad = left_pad

        num_embeddings = len(dictionary)
        self.padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim,
                                      self.padding_idx)
        self.embed_positions = PositionalEmbedding(
            max_positions,
            embed_dim,
            self.padding_idx,
            left_pad=self.left_pad,
        )

        def expand_bool_array(val):
            if isinstance(val, bool):
                # expand True into [True, True, ...] and do the same with False
                return [val] * len(convolutions)
            return val

        attention = expand_bool_array(attention)

        in_channels = convolutions[0][0]
        self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
        self.projections = nn.ModuleList()
        self.convolutions = nn.ModuleList()
        self.attention = nn.ModuleList()
        self.attproj = nn.ModuleList()
        for i, (out_channels, kernel_size) in enumerate(convolutions):
            self.projections.append(
                Linear(in_channels, out_channels
                       ) if in_channels != out_channels else None)
            self.convolutions.append(
                ConvTBC(in_channels,
                        out_channels * 2,
                        kernel_size,
                        dropout=dropout))

            self.attention.append(
                SelfAttention(out_channels, embed_dim, attention_nheads
                              ) if attention[i] else None)
            in_channels = out_channels

        self.fc2 = Linear(in_channels, embed_dim)
Example #9
def ConvTBC(in_channels,
            out_channels,
            kernel_size,
            dilation=(1, ),
            dropout=0,
            **kwargs):
    """Weight-normalized Conv1d layer"""
    from fairseq.modules import ConvTBC
    assert dilation[0] == 1
    m = ConvTBC(in_channels, out_channels, kernel_size, **kwargs)
    std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels))
    m.weight.data.normal_(mean=0, std=std)
    m.bias.data.zero_()
    return nn.utils.weight_norm(m, dim=2)
Example #10
    def __init__(self, dictionary, embed_dim=512, max_positions=1024,
                 convolutions=((512, 3),) * 20, dropout=0.1):
        super().__init__(dictionary)
        # vector_dict: pre-trained embedding table, defined outside this snippet
        embed_dim = vector_dict.embedding_dim
        convolutions = ((vector_dict.embedding_dim, 3),) * 20
        self.dropout = dropout
        self.num_attention_layers = None
        self.embed_dim = embed_dim
        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = nn.Embedding.from_pretrained(torch.FloatTensor(vector_dict.embedding[:,:vector_dict.embedding_dim]), freeze=False)  # load pre-trained vector
        #self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        #self.embed_tokens.weight.data.copy_(torch.from_numpy(vector_dict.embedding))
        #self.embed_tokens.weight.requires_grad = True
        self.embed_positions = PositionalEmbedding(
            max_positions,
            embed_dim,
            padding_idx,
            left_pad=LanguagePairDataset.LEFT_PAD_SOURCE,
        )

        in_channels = convolutions[0][0]
        # Shashi
        self.fc1 = Linear(embed_dim + 512, in_channels, dropout=dropout)
        self.projections = nn.ModuleList()
        self.convolutions = nn.ModuleList()
        for (out_channels, kernel_size) in convolutions:
            self.projections.append(Linear(in_channels, out_channels)
                                    if in_channels != out_channels else None)
            self.convolutions.append(
                ConvTBC(in_channels, out_channels * 2, kernel_size,
                        dropout=dropout)
            )
            in_channels = out_channels
        self.fc2 = Linear(in_channels, embed_dim + 512)
        self.lay_norm = nn.LayerNorm(embed_dim)  # layer normalization in NGTU
Example #11
    def __init__(self,
                 dictionary,
                 embed_dim=512,
                 max_positions=1024,
                 convolutions=((512, 3), ) * 20,
                 dropout=0.1):  # each (512, 3) spec is one filter: 512 channels, kernel size 3
        super().__init__()
        self.dictionary = dictionary
        self.dropout = dropout
        self.num_attention_layers = None

        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        self.embed_positions = Embedding(max_positions, embed_dim, padding_idx)

        in_channels = convolutions[0][0]
        self.fc1 = Linear(
            embed_dim, in_channels, dropout=dropout
        )  # projects in case the word-vector dim differs from the filter (channel) dim
        # text is treated as 1-D, so the feature width is called "channels"
        self.projections = nn.ModuleList()
        self.convolutions = nn.ModuleList()
        for (out_channels, kernel_size) in convolutions:
            pad = (kernel_size - 1) // 2
            self.projections.append(
                Linear(in_channels, out_channels
                       ) if in_channels != out_channels else None)
            self.convolutions.append(
                ConvTBC(in_channels,
                        out_channels * 2,
                        kernel_size,
                        padding=pad,
                        dropout=dropout))
            in_channels = out_channels
        self.fc2 = Linear(in_channels, embed_dim)
Example #12
    def __init__(self,
                 dictionary,
                 embed_dim=512,
                 embed_dict=None,
                 max_positions=1024,
                 convolutions=((512, 3), ) * 20,
                 dropout=0.1,
                 batch_norm=False,
                 use_linear_se=False):
        super().__init__(dictionary)
        self.dropout = dropout
        self.num_attention_layers = None
        self.batch_norm = batch_norm

        num_embeddings = len(dictionary)
        self.padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim,
                                      self.padding_idx)
        if embed_dict:
            self.embed_tokens = utils.load_embedding(embed_dict,
                                                     self.dictionary,
                                                     self.embed_tokens)

        self.embed_positions = PositionalEmbedding(
            max_positions,
            embed_dim,
            self.padding_idx,
        )

        convolutions = extend_conv_spec_extended(convolutions)
        in_channels = convolutions[0][0]
        if use_linear_se:
            self.fc1 = LinearSE(embed_dim, in_channels, dropout=dropout)
        else:
            self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
        self.projections = nn.ModuleList()
        self.convolutions = nn.ModuleList()
        self.inner_convolutions = nn.ModuleList()
        #self.se_layers = nn.ModuleList()
        self.residuals = []
        self.kernel_sizes = 0

        layer_in_channels = [in_channels]
        for idx, (out_channels, kernel_sizes,
                  residual) in enumerate(convolutions):
            self.kernel_sizes = len(kernel_sizes)
            self.inner_convolutions.append(nn.ModuleList())
            if residual == 0:
                residual_dim = out_channels
            else:
                residual_dim = layer_in_channels[-residual]
            if use_linear_se:
                self.projections.append(
                    LinearSE(residual_dim, out_channels
                             ) if residual_dim != out_channels else None)
            else:
                self.projections.append(
                    Linear(residual_dim, out_channels
                           ) if residual_dim != out_channels else None)
            for kernel_size in kernel_sizes:
                if kernel_size % 2 == 1:
                    padding = kernel_size // 2
                else:
                    padding = 0
                self.inner_convolutions[idx].append(
                    ConvTBC(in_channels,
                            out_channels * 2,
                            kernel_size,
                            dropout=dropout,
                            padding=padding))
            # TODO(naetherm): Combine the outputs of the convolution to one single instance max_pooling
            #self.convolutions.append(torch.stack(self.inner_convolutions[idx], dim=0).sum(dim=0))
            #self.se_layers.append(SqueezeExcitationLayer(n_features=16))
            self.residuals.append(residual)
            in_channels = out_channels
            layer_in_channels.append(out_channels)
        self.mp2d = torch.nn.MaxPool2d(kernel_size=(self.kernel_sizes, 1))
        if use_linear_se:
            self.fc2 = LinearSE(in_channels, embed_dim)
        else:
            self.fc2 = Linear(in_channels, embed_dim)
Example #13
    def __init__(self,
                 dictionary,
                 args,
                 encoder_embed_dim=512,
                 embed_dict=None,
                 max_positions=1024,
                 convolutions=((512, 3), ) * 20,
                 dropout=0.1,
                 left_pad=True):
        super().__init__(dictionary)
        self.elmo = Elmo(options_file,
                         weight_file,
                         args.num_output_repr,
                         dropout=args.elmo_dropout,
                         do_layer_norm=args.elmo_do_layer_norm)
        self.args = args
        if self.args.merge_mode == 'sum':
            # only used in `sum` mode
            self.elmo_projection = Linear(args.elmo_repr_dim,
                                          encoder_embed_dim)
        self.id2token = {v: k for k, v in dictionary.indices.items()}
        self.dropout = dropout
        self.left_pad = left_pad
        self.num_attention_layers = None

        num_embeddings = len(dictionary)
        self.padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, args.token_embed_dim,
                                      self.padding_idx)
        if embed_dict:
            self.embed_tokens = utils.load_embedding(embed_dict,
                                                     self.dictionary,
                                                     self.embed_tokens)

        self.embed_positions = PositionalEmbedding(
            max_positions,
            args.token_embed_dim,
            self.padding_idx,
            left_pad=self.left_pad,
        )

        convolutions = extend_conv_spec(convolutions)
        in_channels = convolutions[0][0]
        self.fc1 = Linear(encoder_embed_dim, in_channels, dropout=dropout)
        self.projections = nn.ModuleList()
        self.convolutions = nn.ModuleList()
        self.residuals = []

        layer_in_channels = [in_channels]
        for _, (out_channels, kernel_size,
                residual) in enumerate(convolutions):
            if residual == 0:
                residual_dim = out_channels
            else:
                residual_dim = layer_in_channels[-residual]
            self.projections.append(
                Linear(residual_dim, out_channels
                       ) if residual_dim != out_channels else None)
            if kernel_size % 2 == 1:
                padding = kernel_size // 2
            else:
                padding = 0
            self.convolutions.append(
                ConvTBC(in_channels,
                        out_channels * 2,
                        kernel_size,
                        dropout=dropout,
                        padding=padding))
            self.residuals.append(residual)
            in_channels = out_channels
            layer_in_channels.append(out_channels)
        if args.num_output_repr == 2 and args.merge_mode == 'concat':
            self.fc2 = Linear(in_channels + args.elmo_repr_dim,
                              encoder_embed_dim)
        else:
            self.fc2 = Linear(in_channels, encoder_embed_dim)