def __init__(self, args, dictionary, audio_features=40):
    """Build a 2-D convolutional front-end followed by a Transformer encoder stack.

    Args:
        args: parsed model/training arguments (fairseq-style namespace).
        dictionary: target dictionary, forwarded to the base encoder; also
            sizes the optional CTC output projection.
        audio_features: number of input audio feature channels (e.g. filterbank
            bins) used to compute the flattened dimension after the conv stack.
    """
    super().__init__(dictionary)
    # SECURITY NOTE(review): eval() of a config string — acceptable only if
    # args come from a trusted CLI/config file, never from untrusted input.
    convolutions = (
        eval(args.encoder_convolutions)
        if args.encoder_convolutions is not None
        else ((512, 3),) * 2
    )
    stride = 2
    self.dropout = args.dropout
    self.activation_fn = utils.get_activation_fn(
        activation=getattr(args, "activation_fn", "relu")
    )
    # Normalize conv specs to (out_channels, kernel_size, residual) triples;
    # the third element is unused by this 2-D front-end.
    convolutions = fconv.extend_conv_spec(convolutions)
    self.convolutions = nn.ModuleList()
    in_channels = 1
    for out_channels, kernel_size, _unused in convolutions:
        # "Same"-style padding only for odd kernels; even kernels get none.
        padding = kernel_size // 2 if kernel_size % 2 == 1 else 0
        self.convolutions.append(
            Conv2D(in_channels, out_channels, kernel_size,
                   dropout=self.dropout, padding=padding, stride=stride))
        in_channels = out_channels
    if args.attn_2d:
        # Two 2-D attention layers over the last conv's channel count.
        self.attn_2d = nn.ModuleList(
            [ConvAttention2D(out_channels, 4, dropout=self.dropout)
             for _ in range(2)])
    self.bn = nn.ModuleList(
        [BatchNorm(out_channels) for _ in range(len(convolutions))])
    # NOTE(review): kept as `== True` (not `is True`) to preserve exact
    # behavior if a truthy non-bool (e.g. 1) is ever passed — PEP 8 E712
    # would prefer an identity check here.
    if args.distance_penalty == True:
        args.distance_penalty = 'log'
    # Each strided conv halves (ceil) the feature axis; flatten channels in.
    flat_dim = audio_features
    for _ in range(len(self.convolutions)):
        flat_dim = math.ceil(flat_dim / stride)
    flat_dim *= out_channels
    self.fc3 = Linear(flat_dim, args.encoder_embed_dim)
    self.embed_positions = PositionalEmbeddingAudio(
        args.max_source_positions,
        args.encoder_embed_dim,
        0,
        learned=args.encoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None
    self.layer_wise_attention = getattr(args, "layer_wise_attention", False)
    self.encoder_layerdrop = args.encoder_layerdrop
    self.layers = nn.ModuleList([])
    self.layers.extend(
        [ConvTransformerEncoderLayer(args) for _ in range(args.encoder_layers)]
    )
    self.num_layers = len(self.layers)
    if args.encoder_normalize_before:
        self.layer_norm = LayerNorm(args.encoder_embed_dim)
    else:
        self.layer_norm = None
    if getattr(args, "layernorm_embedding", False):
        self.layernorm_embedding = LayerNorm(args.encoder_embed_dim)
    else:
        self.layernorm_embedding = None
    # Optional CTC-based compression of encoder states; only valid with the
    # ctc_multi_loss criterion, which supplies the auxiliary CTC targets.
    self.ctc_compress_out = args.ctc_compress_out
    if self.ctc_compress_out:
        self.ctc_fc = nn.Linear(args.encoder_embed_dim, len(dictionary))
        assert args.criterion == "ctc_multi_loss"
        self.ctc_layer = args.ctc_encoder_layer
        self.ctc_compress_method = getattr(
            CTCCompressStrategy, args.ctc_compress_strategy)
def __init__(
    self,
    conv_layers_before=None,
    input_size=83,
    embed_dim=512,
    convolutions=((512, 3), ) * 20,
    dropout=0.1,
):
    """Build the FConv (convolutional seq2seq) encoder layer stack.

    Args:
        conv_layers_before: optional front-end module applied before fc0.
        input_size: dimensionality of the raw input features.
        embed_dim: embedding dimension fed to the first conv block.
        convolutions: iterable of (out_channels, kernel_size[, residual])
            specs, normalized by ``extend_conv_spec``.
        dropout: dropout probability used throughout.
    """
    super().__init__(None)  # no src dictionary
    self.dropout = dropout
    self.num_attention_layers = None
    self.conv_layers_before = conv_layers_before
    # Project raw features to embed_dim only when the sizes differ.
    self.fc0 = (Linear(input_size, embed_dim, dropout=dropout)
                if input_size != embed_dim else None)
    convolutions = extend_conv_spec(convolutions)
    in_channels = convolutions[0][0]
    self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
    self.projections = nn.ModuleList()
    self.convolutions = nn.ModuleList()
    self.residuals = []
    # Track every layer's output width so residual connections can look
    # back `residual` layers and project if widths mismatch.
    layer_in_channels = [in_channels]
    for out_channels, kernel_size, residual in convolutions:
        if residual == 0:
            residual_dim = out_channels
        else:
            residual_dim = layer_in_channels[-residual]
        self.projections.append(
            Linear(residual_dim, out_channels)
            if residual_dim != out_channels else None)
        # "Same"-style padding only for odd kernels; even kernels get none.
        padding = kernel_size // 2 if kernel_size % 2 == 1 else 0
        # Doubled output channels feed the GLU inside the conv block.
        self.convolutions.append(
            ConvTBC(in_channels, out_channels * 2, kernel_size,
                    dropout=dropout, padding=padding))
        self.residuals.append(residual)
        in_channels = out_channels
        layer_in_channels.append(out_channels)
    self.fc2 = Linear(in_channels, embed_dim)