def __init__( self, feat_in: int, num_classes: int, init_mode: Optional[str] = "xavier_uniform", return_logits: bool = True, pooling_type='avg', ): super().__init__() self._feat_in = feat_in self._return_logits = return_logits self._num_classes = num_classes if pooling_type == 'avg': self.pooling = torch.nn.AdaptiveAvgPool1d(1) elif pooling_type == 'max': self.pooling = torch.nn.AdaptiveMaxPool1d(1) else: raise ValueError( 'Pooling type chosen is not valid. Must be either `avg` or `max`' ) self.decoder_layers = torch.nn.Sequential( torch.nn.Linear(self._feat_in, self._num_classes, bias=True)) self.apply(lambda x: init_weights(x, mode=init_mode))
def __init__(
    self,
    feat_in: int,
    filters: list,
    kernel_sizes: list,
    dilations: list,
    scale: int = 8,
    init_mode: str = 'xavier_uniform',
):
    super().__init__()

    self.layers = nn.ModuleList()
    self.layers.append(TDNNModule(feat_in, filters[0], kernel_size=kernel_sizes[0], dilation=dilations[0]))

    for i in range(len(filters) - 2):
        self.layers.append(
            TDNNSEModule(
                filters[i],
                filters[i + 1],
                group_scale=scale,
                se_channels=128,
                kernel_size=kernel_sizes[i + 1],
                dilation=dilations[i + 1],
            )
        )
    self.feature_agg = TDNNModule(filters[-1], filters[-1], kernel_sizes[-1], dilations[-1])
    self.apply(lambda x: init_weights(x, mode=init_mode))
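# Illustrative sketch (hypothetical values, not a shipped config): how the three ECAPA-style
# lists line up. The first TDNNModule consumes feat_in -> filters[0]; each loop iteration i
# builds a TDNNSEModule from filters[i] -> filters[i + 1] using kernel_sizes[i + 1] and
# dilations[i + 1]; feature_agg then maps filters[-1] -> filters[-1].
filters = [512, 512, 512, 512, 1536]
kernel_sizes = [5, 3, 3, 3, 1]
dilations = [1, 2, 3, 4, 1]

for i in range(len(filters) - 2):
    print(
        f"TDNNSEModule {i}: {filters[i]} -> {filters[i + 1]}, "
        f"kernel={kernel_sizes[i + 1]}, dilation={dilations[i + 1]}"
    )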
def __init__(self, feat_in, num_classes, init_mode="xavier_uniform", vocabulary=None): super().__init__() if vocabulary is None and num_classes < 0: raise ValueError( f"Neither of the vocabulary and num_classes are set! At least one of them need to be set." ) if num_classes <= 0: num_classes = len(vocabulary) logging.info(f"num_classes of ConvASRDecoder is set to the size of the vocabulary: {num_classes}.") if vocabulary is not None: if num_classes != len(vocabulary): raise ValueError( f"If vocabulary is specified, it's length should be equal to the num_classes. Instead got: num_classes={num_classes} and len(vocabulary)={len(vocabulary)}" ) self.__vocabulary = vocabulary self._feat_in = feat_in # Add 1 for blank char self._num_classes = num_classes + 1 self.decoder_layers = torch.nn.Sequential( torch.nn.Conv1d(self._feat_in, self._num_classes, kernel_size=1, bias=True) ) self.apply(lambda x: init_weights(x, mode=init_mode))
def __init__(
    self,
    inp_filters: int,
    out_filters: int,
    group_scale: int = 8,
    se_channels: int = 128,
    kernel_size: int = 1,
    dilation: int = 1,
    init_mode: str = 'xavier_uniform',
):
    super().__init__()
    self.out_filters = out_filters
    padding_val = get_same_padding(kernel_size=kernel_size, dilation=dilation, stride=1)

    group_conv = nn.Conv1d(
        out_filters,
        out_filters,
        kernel_size=kernel_size,
        dilation=dilation,
        padding=padding_val,
        groups=group_scale,
    )
    self.group_tdnn_block = nn.Sequential(
        TDNNModule(inp_filters, out_filters, kernel_size=1, dilation=1),
        group_conv,
        nn.ReLU(),
        nn.BatchNorm1d(out_filters),
        TDNNModule(out_filters, out_filters, kernel_size=1, dilation=1),
    )

    self.se_layer = MaskedSEModule(out_filters, se_channels, out_filters)
    self.apply(lambda x: init_weights(x, mode=init_mode))
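# Illustrative sketch (the padding formula below is an assumption, since get_same_padding is
# not shown here): for stride 1, padding = dilation * (kernel_size - 1) // 2 keeps the time
# dimension unchanged for odd kernels, which is what the grouped convolution above relies on.
import torch

def same_padding(kernel_size: int, dilation: int) -> int:
    # Assumed equivalent of get_same_padding(kernel_size=..., dilation=..., stride=1).
    return dilation * (kernel_size - 1) // 2

kernel_size, dilation, channels, time_steps = 3, 2, 8, 50
group_conv = torch.nn.Conv1d(
    channels,
    channels,
    kernel_size=kernel_size,
    dilation=dilation,
    padding=same_padding(kernel_size, dilation),
    groups=4,                                                # grouped, as in group_conv above
)
x = torch.randn(2, channels, time_steps)
print(group_conv(x).shape)                                   # torch.Size([2, 8, 50]) -- length preserved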
def __init__(
    self,
    feat_in,
    feat_out,
    feat_hidden,
    stride_layers,
    kernel_size=11,
    init_mode="xavier_uniform",
    activation="relu",
):
    super().__init__()

    if stride_layers > 0 and (kernel_size < 3 or kernel_size % 2 == 0):
        raise ValueError(
            "Kernel size in this decoder needs to be >= 3 and odd when using at least 1 stride layer."
        )

    activation = jasper_activations[activation]()

    self.feat_in = feat_in
    self.feat_out = feat_out
    self.feat_hidden = feat_hidden

    self.decoder_layers = [nn.Conv1d(self.feat_in, self.feat_hidden, kernel_size=1, bias=True)]
    for i in range(stride_layers):
        self.decoder_layers.append(activation)
        self.decoder_layers.append(
            nn.ConvTranspose1d(
                self.feat_hidden,
                self.feat_hidden,
                kernel_size,
                stride=2,
                padding=(kernel_size - 3) // 2 + 1,
                output_padding=1,
                bias=True,
            )
        )
        self.decoder_layers.append(nn.Conv1d(self.feat_hidden, self.feat_hidden, kernel_size=1, bias=True))
        self.decoder_layers.append(nn.BatchNorm1d(self.feat_hidden, eps=1e-3, momentum=0.1))
    self.decoder_layers.append(activation)
    self.decoder_layers.append(nn.Conv1d(self.feat_hidden, self.feat_out, kernel_size=1, bias=True))
    self.decoder_layers = nn.Sequential(*self.decoder_layers)

    self.apply(lambda x: init_weights(x, mode=init_mode))
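# Illustrative sketch (hypothetical sizes): with stride=2, padding=(kernel_size - 3) // 2 + 1
# and output_padding=1, each ConvTranspose1d above exactly doubles the time dimension, so
# stacking stride_layers of them upsamples by a factor of 2 ** stride_layers.
import torch

kernel_size, feat_hidden, time_steps = 11, 64, 40
upsample = torch.nn.ConvTranspose1d(
    feat_hidden,
    feat_hidden,
    kernel_size,
    stride=2,
    padding=(kernel_size - 3) // 2 + 1,
    output_padding=1,
    bias=True,
)
x = torch.randn(2, feat_hidden, time_steps)
print(upsample(x).shape)                                     # torch.Size([2, 64, 80]) -- T doubled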
def __init__(self, feat_in, num_classes, init_mode="xavier_uniform", vocabulary=None): super().__init__() if vocabulary is not None: if num_classes != len(vocabulary): raise ValueError( f"If vocabulary is specified, it's length should be equal to the num_classes. Instead got: num_classes={num_classes} and len(vocabulary)={len(vocabulary)}" ) self.__vocabulary = vocabulary self._feat_in = feat_in # Add 1 for blank char self._num_classes = num_classes + 1 self.decoder_layers = torch.nn.Sequential( torch.nn.Conv1d(self._feat_in, self._num_classes, kernel_size=1, bias=True) ) self.apply(lambda x: init_weights(x, mode=init_mode))
def __init__(
    self,
    feat_in: int,
    num_classes: int,
    emb_sizes: Optional[Union[int, list]] = 256,
    pool_mode: str = 'xvector',
    angular: bool = False,
    attention_channels: int = 128,
    init_mode: str = "xavier_uniform",
):
    super().__init__()
    self.angular = angular
    self.emb_id = 2
    bias = False if self.angular else True
    emb_sizes = [emb_sizes] if type(emb_sizes) is int else emb_sizes

    self._num_classes = num_classes
    self.pool_mode = pool_mode.lower()
    if self.pool_mode == 'xvector' or self.pool_mode == 'tap':
        self._pooling = StatsPoolLayer(feat_in=feat_in, pool_mode=self.pool_mode)
        affine_type = 'linear'
    elif self.pool_mode == 'attention':
        self._pooling = AttentivePoolLayer(inp_filters=feat_in, attention_channels=attention_channels)
        affine_type = 'conv'

    shapes = [self._pooling.feat_in]
    for size in emb_sizes:
        shapes.append(int(size))

    emb_layers = []
    for shape_in, shape_out in zip(shapes[:-1], shapes[1:]):
        layer = self.affine_layer(shape_in, shape_out, learn_mean=False, affine_type=affine_type)
        emb_layers.append(layer)

    self.emb_layers = nn.ModuleList(emb_layers)

    self.final = nn.Linear(shapes[-1], self._num_classes, bias=bias)

    self.apply(lambda x: init_weights(x, mode=init_mode))
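# Illustrative sketch (hypothetical numbers): how the embedding-layer shapes are chained. The
# first entry is whatever self._pooling.feat_in reports (stats pooling is assumed here to yield
# 2 * feat_in for concatenated mean + std), and each emb_sizes entry adds one affine layer.
pooling_feat_in = 2 * 1024                                   # assumed StatsPoolLayer output size
emb_sizes = [512, 192]

shapes = [pooling_feat_in] + [int(size) for size in emb_sizes]
layer_dims = list(zip(shapes[:-1], shapes[1:]))
print(layer_dims)                                            # [(2048, 512), (512, 192)]
# The final classification layer would then be nn.Linear(shapes[-1], num_classes).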
def __init__(
    self,
    feat_in,
    num_classes,
    emb_sizes=None,
    pool_mode='xvector',
    angular=False,
    init_mode="xavier_uniform",
):
    super().__init__()
    self.angular = angular
    self.emb_id = 2
    if self.angular:
        bias = False
    else:
        bias = True

    if type(emb_sizes) is str:
        emb_sizes = emb_sizes.split(',')
    elif type(emb_sizes) is int:
        emb_sizes = [emb_sizes]
    else:
        emb_sizes = [512, 512]

    self.input_feat_in = feat_in
    self._num_classes = num_classes
    self._pooling = StatsPoolLayer(feat_in=feat_in, pool_mode=pool_mode)
    self._feat_in = self._pooling.feat_in

    shapes = [self._feat_in]
    for size in emb_sizes:
        shapes.append(int(size))

    emb_layers = []
    for shape_in, shape_out in zip(shapes[:-1], shapes[1:]):
        layer = self.affineLayer(shape_in, shape_out, learn_mean=False)
        emb_layers.append(layer)

    self.emb_layers = nn.ModuleList(emb_layers)

    self.final = nn.Linear(shapes[-1], self._num_classes, bias=bias)

    self.apply(lambda x: init_weights(x, mode=init_mode))
def __init__( self, jasper, activation: str, feat_in: int, normalization_mode: str = "batch", residual_mode: str = "add", norm_groups: int = -1, conv_mask: bool = True, frame_splicing: int = 1, init_mode: Optional[str] = 'xavier_uniform', aggregation_mode: Optional[str] = None, quantize: bool = False, ): super().__init__() if isinstance(jasper, ListConfig): jasper = OmegaConf.to_container(jasper) activation = jasper_activations[activation]() feat_in = feat_in * frame_splicing self._feat_in = feat_in residual_panes = [] encoder_layers = [] self.dense_residual = False for lcfg in jasper: dense_res = [] if lcfg.get('residual_dense', False): residual_panes.append(feat_in) dense_res = residual_panes self.dense_residual = True groups = lcfg.get('groups', 1) separable = lcfg.get('separable', False) heads = lcfg.get('heads', -1) residual_mode = lcfg.get('residual_mode', residual_mode) se = lcfg.get('se', False) se_reduction_ratio = lcfg.get('se_reduction_ratio', 8) se_context_window = lcfg.get('se_context_size', -1) se_interpolation_mode = lcfg.get('se_interpolation_mode', 'nearest') kernel_size_factor = lcfg.get('kernel_size_factor', 1.0) stride_last = lcfg.get('stride_last', False) aggregation_mode = lcfg.get('aggregation_mode', 'sum') block_dropout = lcfg.get('block_dropout', 0.0) parallel_residual_mode = lcfg.get('parallel_residual_mode', 'sum') parallel_blocks = [] for kernel_size in lcfg['kernel']: parallel_blocks.append( JasperBlock( feat_in, lcfg['filters'], repeat=lcfg['repeat'], kernel_size=[kernel_size], stride=lcfg['stride'], dilation=lcfg['dilation'], dropout=lcfg['dropout'], residual=lcfg['residual'], groups=groups, separable=separable, heads=heads, residual_mode=residual_mode, normalization=normalization_mode, norm_groups=norm_groups, activation=activation, residual_panes=dense_res, conv_mask=conv_mask, se=se, se_reduction_ratio=se_reduction_ratio, se_context_window=se_context_window, se_interpolation_mode=se_interpolation_mode, kernel_size_factor=kernel_size_factor, stride_last=stride_last, quantize=quantize, )) if len(parallel_blocks) == 1: encoder_layers.append(parallel_blocks[0]) else: encoder_layers.append( ParallelBlock( parallel_blocks, aggregation_mode=aggregation_mode, block_dropout_prob=block_dropout, residual_mode=parallel_residual_mode, in_filters=feat_in, out_filters=lcfg['filters'], )) feat_in = lcfg['filters'] self._feat_out = feat_in self.encoder = torch.nn.Sequential(*encoder_layers) self.apply(lambda x: init_weights(x, mode=init_mode))
def __init__( self, jasper, activation: str, feat_in: int, normalization_mode: str = "batch", residual_mode: str = "add", norm_groups: int = -1, conv_mask: bool = True, frame_splicing: int = 1, init_mode: Optional[str] = 'xavier_uniform', quantize: bool = False, ): super().__init__() if isinstance(jasper, ListConfig): jasper = OmegaConf.to_container(jasper) activation = jasper_activations[activation]() # If the activation can be executed in place, do so. if hasattr(activation, 'inplace'): activation.inplace = True feat_in = feat_in * frame_splicing self._feat_in = feat_in residual_panes = [] encoder_layers = [] self.dense_residual = False for lcfg in jasper: dense_res = [] if lcfg.get('residual_dense', False): residual_panes.append(feat_in) dense_res = residual_panes self.dense_residual = True groups = lcfg.get('groups', 1) separable = lcfg.get('separable', False) heads = lcfg.get('heads', -1) residual_mode = lcfg.get('residual_mode', residual_mode) se = lcfg.get('se', False) se_reduction_ratio = lcfg.get('se_reduction_ratio', 8) se_context_window = lcfg.get('se_context_size', -1) se_interpolation_mode = lcfg.get('se_interpolation_mode', 'nearest') kernel_size_factor = lcfg.get('kernel_size_factor', 1.0) stride_last = lcfg.get('stride_last', False) future_context = lcfg.get('future_context', -1) encoder_layers.append( JasperBlock( feat_in, lcfg['filters'], repeat=lcfg['repeat'], kernel_size=lcfg['kernel'], stride=lcfg['stride'], dilation=lcfg['dilation'], dropout=lcfg['dropout'], residual=lcfg['residual'], groups=groups, separable=separable, heads=heads, residual_mode=residual_mode, normalization=normalization_mode, norm_groups=norm_groups, activation=activation, residual_panes=dense_res, conv_mask=conv_mask, se=se, se_reduction_ratio=se_reduction_ratio, se_context_window=se_context_window, se_interpolation_mode=se_interpolation_mode, kernel_size_factor=kernel_size_factor, stride_last=stride_last, future_context=future_context, quantize=quantize, )) feat_in = lcfg['filters'] self._feat_out = feat_in self.encoder = torch.nn.Sequential(*encoder_layers) self.apply(lambda x: init_weights(x, mode=init_mode)) # Flag needed for RNNT export support self._rnnt_export = False