def __init__(self, args, dictionary, left_pad=True,
             convolutions=((512, 3),) * 20, stride=2, audio_features=40):
    super().__init__(dictionary)
    self.dropout = args.dropout
    self.max_source_positions = args.max_source_positions
    self.padding_idx = dictionary.pad()

    # Convolutional front-end; the spec can be overridden from the command line.
    convolutions = eval(args.encoder_convolutions) \
        if args.encoder_convolutions is not None else convolutions
    convolutions = extend_conv_spec(convolutions)
    self.convolutions = nn.ModuleList()
    in_channels = 1
    for i, (out_channels, kernel_size, kernel_width) in enumerate(convolutions):
        if kernel_size % 2 == 1:
            padding = kernel_size // 2
        else:
            padding = 0
        self.convolutions.append(
            Conv2D(in_channels, out_channels, kernel_size,
                   dropout=self.dropout, padding=padding, stride=2))
        in_channels = out_channels
    self.relu = nn.ReLU()
    # if args.attn_2d:
    #     self.attn_2d = nn.ModuleList(
    #         [ConvAttention2D(out_channels, 4, dropout=self.dropout)
    #          for _ in range(2)])
    self.bn = nn.ModuleList(
        [BatchNorm(out_channels) for _ in range(len(convolutions))])

    # Flattened size after the stride-2 convolutions halve the frequency axis
    # twice; projected to the transformer embedding dimension.
    flat_dim = math.ceil(math.ceil(audio_features / 2) / 2) * out_channels
    self.layers = nn.ModuleList([])
    self.fc3 = Linear(flat_dim, args.encoder_embed_dim)
    self.layers.extend([
        TransformerEncoderLayer(args) for _ in range(args.encoder_layers)
    ])
    self.embed_positions = PositionalEmbeddingAudio(
        args.max_source_positions, args.encoder_embed_dim, self.padding_idx,
        left_pad=left_pad,
        learned=args.encoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None

    self.register_buffer('version', torch.Tensor([2]))
    self.normalize = args.encoder_normalize_before
    if self.normalize:
        self.layer_norm = LayerNorm(args.encoder_embed_dim)
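The `flat_dim` computation above relies on each stride-2 convolution roughly halving the frequency axis (with ceiling rounding when the odd kernel gets "same" padding), so the flattened feature size is the channel count times the remaining frequency bins. A minimal sketch of that bookkeeping with plain `torch.nn.Conv2d`, using hypothetical shapes (40 input features, 16 channels, kernel 3) rather than the encoder's actual configuration:

import math
import torch
import torch.nn as nn

# Hypothetical toy configuration, for illustration only.
audio_features, out_channels = 40, 16
x = torch.randn(8, 1, 100, audio_features)  # (batch, channel, time, freq)

conv1 = nn.Conv2d(1, out_channels, kernel_size=3, stride=2, padding=1)
conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=2, padding=1)
y = conv2(conv1(x))  # time and freq axes are ceil-halved by each layer

flat_dim = math.ceil(math.ceil(audio_features / 2) / 2) * out_channels
assert y.shape[1] * y.shape[3] == flat_dim  # 16 channels * 10 freq bins == 160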
def __init__(self, args, dictionary, left_pad=True,
             convolutions=((512, 3),) * 20, stride=2, audio_features=40):
    super().__init__(dictionary)
    self.dropout = args.dropout
    embed_dim = args.encoder_embed_dim
    self.max_source_positions = args.max_source_positions
    self.padding_idx = dictionary.pad()

    convolutions = eval(args.encoder_convolutions) \
        if args.encoder_convolutions is not None else convolutions
    convolutions = extend_conv_spec(convolutions)
    self.convolutions = nn.ModuleList()
    in_channels = 1
    for i, (out_channels, kernel_size, kernel_width) in enumerate(convolutions):
        if kernel_size % 2 == 1:
            padding = kernel_size // 2
        else:
            padding = 0
        self.convolutions.append(
            Conv2D(in_channels, out_channels, kernel_size,
                   dropout=self.dropout, padding=padding, stride=2))
        in_channels = out_channels
    self.relu = nn.ReLU()

    self.fc1 = Linear(audio_features, 2 * embed_dim)
    self.fc2 = Linear(2 * embed_dim, embed_dim)
    self.embed_scale = math.sqrt(embed_dim)

    # Dimension of the encoder states after the convolutional front-end.
    args.encoder_dim = embed_dim * (in_channels // (2 ** len(convolutions))) // 2
    self.layers = nn.ModuleList([])
    # Temporarily override encoder_embed_dim so the transformer layers are
    # built with the post-convolution dimension; restored below.
    encoder_embed_dim = args.encoder_embed_dim
    args.encoder_embed_dim = args.encoder_dim
    self.fc3 = Linear(args.encoder_dim * 2, embed_dim * 2)
    self.layers.extend([
        TransformerEncoderLayer(args) for _ in range(args.encoder_layers)
    ])
    self.embed_positions = PositionalEmbeddingAudio(
        args.max_source_positions, args.encoder_dim, 0,
        left_pad=left_pad,
        learned=args.encoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None
    args.encoder_embed_dim = encoder_embed_dim

    self.register_buffer('version', torch.Tensor([2]))
    self.normalize = args.encoder_normalize_before
    if self.normalize:
        self.layer_norm = LayerNorm(args.encoder_dim)
def PositionalEmbeddingAudioLayer(num_embeddings, embedding_dim, padding_idx, left_pad, learned=True):
    m = PositionalEmbeddingAudio(num_embeddings, embedding_dim, padding_idx, left_pad, learned=learned)
    if learned:
        nn.init.normal_(m.weight, 0, 0.1)
        nn.init.constant_(m.weight[padding_idx], 0)
    return m