Example #1
    def __init__(self,
                 args,
                 dictionary,
                 left_pad=True,
                 convolutions=((512, 3), ) * 20,
                 stride=2,
                 audio_features=40):
        super().__init__(dictionary)
        self.dropout = args.dropout
        self.max_source_positions = args.max_source_positions

        self.padding_idx = dictionary.pad()

        # Parse the convolution spec from the command-line string if given,
        # otherwise keep the default passed to the constructor.
        convolutions = eval(
            args.encoder_convolutions
        ) if args.encoder_convolutions is not None else convolutions

        convolutions = extend_conv_spec(convolutions)
        self.convolutions = nn.ModuleList()
        in_channels = 1  # the audio input is treated as a single-channel 2D feature map
        for i, (out_channels, kernel_size,
                kernel_width) in enumerate(convolutions):
            # "same"-style padding for odd kernels, no padding for even ones
            if kernel_size % 2 == 1:
                padding = kernel_size // 2
            else:
                padding = 0
            self.convolutions.append(
                Conv2D(in_channels,
                       out_channels,
                       kernel_size,
                       dropout=self.dropout,
                       padding=padding,
                       stride=2))  # stride is hard-coded; the constructor's stride argument is unused here
            in_channels = out_channels
        self.relu = nn.ReLU()
        # if args.attn_2d:
        #     self.attn_2d = nn.ModuleList([ConvAttention2D(out_channels, 4,
        #                                                   dropout=self.dropout) for _ in range(2)])
        self.bn = nn.ModuleList(
            [BatchNorm(out_channels) for _ in range(len(convolutions))])

        # Flattened conv output size: assumes the feature axis is halved twice
        # (rounding up) by stride-2 convolutions, times the last layer's channel count.
        flat_dim = math.ceil(math.ceil(audio_features / 2) / 2) * out_channels
        self.layers = nn.ModuleList([])

        self.fc3 = Linear(flat_dim, args.encoder_embed_dim)
        self.layers.extend([
            TransformerEncoderLayer(args) for _ in range(args.encoder_layers)
        ])
        self.embed_positions = PositionalEmbeddingAudio(
            args.max_source_positions,
            args.encoder_embed_dim,
            self.padding_idx,
            left_pad=left_pad,
            learned=args.encoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None
        self.register_buffer('version', torch.Tensor([2]))
        self.normalize = args.encoder_normalize_before
        if self.normalize:
            self.layer_norm = LayerNorm(args.encoder_embed_dim)
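
The flat_dim arithmetic above assumes the feature axis is halved twice by the stride-2 convolutions. Below is a minimal, self-contained sketch of that shape bookkeeping using plain torch.nn.Conv2d; the channel count 64 and the 3x3 kernels are illustrative assumptions, not values taken from the snippet.

import math

import torch
import torch.nn as nn

audio_features = 40
out_channels = 64  # illustrative; the snippet's default spec uses 512

# Two stride-2 convolutions over a (batch, channel, time, features) input,
# with padding = kernel_size // 2 as in the odd-kernel branch above.
convs = nn.Sequential(
    nn.Conv2d(1, out_channels, kernel_size=3, stride=2, padding=1),
    nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=2, padding=1),
)
x = torch.randn(8, 1, 100, audio_features)
y = convs(x)
print(y.shape)  # torch.Size([8, 64, 25, 10]): the feature axis went 40 -> 20 -> 10

# Matches the flat_dim computed in the constructor above.
flat_dim = math.ceil(math.ceil(audio_features / 2) / 2) * out_channels
print(flat_dim)  # 640 == 10 * 64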
Example #2
    def __init__(self, args, dictionary, left_pad=True, convolutions=((512, 3),) * 20, stride=2,
                 audio_features=40, ):
        super().__init__(dictionary)
        self.dropout = args.dropout
        embed_dim = args.encoder_embed_dim
        self.max_source_positions = args.max_source_positions

        self.padding_idx = dictionary.pad()

        convolutions = eval(args.encoder_convolutions) if args.encoder_convolutions is not None else convolutions

        convolutions = extend_conv_spec(convolutions)
        self.convolutions = nn.ModuleList()
        in_channels = 1
        for i, (out_channels, kernel_size, kernel_width) in enumerate(convolutions):
            if kernel_size % 2 == 1:
                padding = kernel_size // 2
            else:
                padding = 0
            self.convolutions.append(
                Conv2D(in_channels, out_channels, kernel_size,
                       dropout=self.dropout, padding=padding, stride=2)
            )
            in_channels = out_channels
        self.relu = nn.ReLU()

        self.fc1 = Linear(audio_features, 2*embed_dim)
        self.fc2 = Linear(2*embed_dim, embed_dim)
        self.embed_scale = math.sqrt(embed_dim)

        # Model dimension used for the transformer layers, positional embeddings,
        # and layer norm built below.
        args.encoder_dim = embed_dim * (in_channels // (2 ** len(convolutions))) // 2

        self.layers = nn.ModuleList([])

        # Temporarily override encoder_embed_dim so the transformer layers below are
        # built with encoder_dim; the original value is restored after construction.
        encoder_embed_dim = args.encoder_embed_dim
        args.encoder_embed_dim = args.encoder_dim
        self.fc3 = Linear(args.encoder_dim*2, embed_dim*2)
        self.layers.extend([
            TransformerEncoderLayer(args)
            for _ in range(args.encoder_layers)
        ])
        self.embed_positions = PositionalEmbeddingAudio(
            args.max_source_positions, args.encoder_dim, 0,
            left_pad=left_pad,
            learned=args.encoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None
        args.encoder_embed_dim = encoder_embed_dim  # restore the original value
        self.register_buffer('version', torch.Tensor([2]))
        self.normalize = args.encoder_normalize_before
        if self.normalize:
            self.layer_norm = LayerNorm(args.encoder_dim)
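
Example #2 builds its TransformerEncoderLayer instances by temporarily pointing args.encoder_embed_dim at args.encoder_dim and restoring it afterwards. Here is a small sketch of that swap, with a hypothetical Namespace standing in for the fairseq args object and the sizes chosen only for illustration.

from argparse import Namespace

# Hypothetical args; only the two fields involved in the swap are shown.
args = Namespace(encoder_embed_dim=512, encoder_dim=256)

saved = args.encoder_embed_dim
args.encoder_embed_dim = args.encoder_dim
# ... build TransformerEncoderLayer(args) instances here, so they pick up
# the conv output size (256) as their model dimension ...
args.encoder_embed_dim = saved

print(args.encoder_embed_dim)  # 512 again for the rest of the model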
Example #3
def PositionalEmbeddingAudioLayer(num_embeddings,
                                  embedding_dim,
                                  padding_idx,
                                  left_pad,
                                  learned=True):
    m = PositionalEmbeddingAudio(num_embeddings,
                                 embedding_dim,
                                 padding_idx,
                                 left_pad,
                                 learned=learned)
    if learned:
        # small Gaussian init, with the padding row zeroed out
        nn.init.normal_(m.weight, 0, 0.1)
        nn.init.constant_(m.weight[padding_idx], 0)
    return m
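
The same init pattern can be reproduced on a plain nn.Embedding standing in for PositionalEmbeddingAudio; the sizes below are illustrative assumptions, not values from the snippet.

import torch.nn as nn

num_embeddings, embedding_dim, padding_idx = 1024, 256, 0  # illustrative sizes
m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)

# Small Gaussian weights, with the padding row forced to zero so that padded
# positions contribute nothing to the encoder input.
nn.init.normal_(m.weight, 0, 0.1)
nn.init.constant_(m.weight[padding_idx], 0)

print(m.weight[padding_idx].abs().sum().item())  # 0.0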