def __init__(self, kernel_size, in_chan, n_src, bn_chan, chunk_size, hop_size=None, mask_act="relu"):
    super(SingleDecoder, self).__init__()
    self.kernel_size = kernel_size
    self.in_chan = in_chan
    self.bn_chan = bn_chan
    self.chunk_size = chunk_size
    hop_size = hop_size if hop_size is not None else chunk_size // 2
    self.hop_size = hop_size
    self.n_src = n_src
    self.mask_act = mask_act

    # Masking in 3D space
    net_out_conv = nn.Conv2d(bn_chan, n_src * bn_chan, 1)
    self.first_out = nn.Sequential(nn.PReLU(), net_out_conv)
    # Gating and masking in 2D space (after fold)
    self.net_out = nn.Sequential(nn.Conv1d(bn_chan, bn_chan, 1), nn.Tanh())
    self.net_gate = nn.Sequential(nn.Conv1d(bn_chan, bn_chan, 1), nn.Sigmoid())
    self.mask_net = nn.Conv1d(bn_chan, in_chan, 1, bias=False)

    # Get activation function.
    mask_nl_class = activations.get(mask_act)
    # For softmax, feed the source dimension.
    if has_arg(mask_nl_class, "dim"):
        self.output_act = mask_nl_class(dim=1)
    else:
        self.output_act = mask_nl_class()

    _, self.trans_conv = make_enc_dec("free", kernel_size=kernel_size, n_filters=in_chan)
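# Usage sketch (hedged): `nn`, `activations`, `has_arg` and `make_enc_dec` are
# assumed to be the torch / asteroid-style helpers imported elsewhere in this
# file, and the argument values below are illustrative only.
#
#   dec = SingleDecoder(kernel_size=16, in_chan=64, n_src=2,
#                       bn_chan=64, chunk_size=100)
#   # The forward pass (not shown here) presumably maps chunked bottleneck
#   # features of shape (batch, bn_chan, chunk_size, n_chunks) to per-source
#   # outputs via `first_out`, the gated 1x1 convs, `mask_net` and `trans_conv`.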
def __init__(
    self,
    in_chan,
    n_src,
    n_heads=4,
    ff_hid=256,
    chunk_size=100,
    hop_size=None,
    n_repeats=6,
    norm_type="gLN",
    ff_activation="relu",
    mask_act="relu",
    bidirectional=True,
    dropout=0,
):
    super(DPTransformer, self).__init__()
    self.in_chan = in_chan
    self.n_src = n_src
    self.n_heads = n_heads
    self.ff_hid = ff_hid
    self.chunk_size = chunk_size
    hop_size = hop_size if hop_size is not None else chunk_size // 2
    self.hop_size = hop_size
    self.n_repeats = n_repeats
    self.norm_type = norm_type
    self.ff_activation = ff_activation
    self.mask_act = mask_act
    self.bidirectional = bidirectional
    self.dropout = dropout

    # Round the embedding dimension up to a multiple of the number of heads.
    self.mha_in_dim = ceil(self.in_chan / self.n_heads) * self.n_heads
    if self.in_chan % self.n_heads != 0:
        warnings.warn(
            f"DPTransformer input dim ({self.in_chan}) is not a multiple of the number of "
            f"heads ({self.n_heads}). Adding extra linear layer at input to accommodate "
            f"(size [{self.in_chan} x {self.mha_in_dim}])")
        self.input_layer = nn.Linear(self.in_chan, self.mha_in_dim)
    else:
        self.input_layer = None

    self.in_norm = norms.get(norm_type)(self.mha_in_dim)
    self.ola = DualPathProcessing(self.chunk_size, self.hop_size)

    # Succession of dual-path transformer blocks (intra- then inter-chunk).
    self.layers = nn.ModuleList([])
    for x in range(self.n_repeats):
        self.layers.append(
            nn.ModuleList([
                ImprovedTransformedLayer(
                    self.mha_in_dim,
                    self.n_heads,
                    self.ff_hid,
                    self.dropout,
                    self.ff_activation,
                    True,
                    self.norm_type,
                ),
                ImprovedTransformedLayer(
                    self.mha_in_dim,
                    self.n_heads,
                    self.ff_hid,
                    self.dropout,
                    self.ff_activation,
                    self.bidirectional,
                    self.norm_type,
                ),
            ]))

    net_out_conv = nn.Conv2d(self.mha_in_dim, n_src * self.in_chan, 1)
    self.first_out = nn.Sequential(nn.PReLU(), net_out_conv)
    # Gating and masking in 2D space (after fold)
    self.net_out = nn.Sequential(nn.Conv1d(self.in_chan, self.in_chan, 1), nn.Tanh())
    self.net_gate = nn.Sequential(nn.Conv1d(self.in_chan, self.in_chan, 1), nn.Sigmoid())

    # Get activation function.
    mask_nl_class = activations.get(mask_act)
    # For softmax, feed the source dimension.
    if has_arg(mask_nl_class, "dim"):
        self.output_act = mask_nl_class(dim=1)
    else:
        self.output_act = mask_nl_class()
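# Usage sketch (assumption: this is an asteroid-style DPTransformer masker and
# torch, `norms`, `activations`, `has_arg`, `DualPathProcessing` and
# `ImprovedTransformedLayer` are imported elsewhere in this file):
#
#   masker = DPTransformer(in_chan=64, n_src=2, n_repeats=2, chunk_size=100)
#   tf_rep = torch.randn(4, 64, 1000)   # (batch, in_chan, n_frames)
#   est_masks = masker(tf_rep)          # expected shape: (batch, n_src, in_chan, n_frames)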
def __init__(
    self,
    in_chan,
    n_src,
    n_heads=4,
    ff_hid=256,
    chunk_size=100,
    hop_size=None,
    n_repeats=6,
    norm_type="gLN",
    ff_activation="relu",
    mask_act="relu",
    bidirectional=True,
    dropout=0,
):
    super(DPTransformer, self).__init__()
    self.in_chan = in_chan
    self.n_src = n_src
    self.n_heads = n_heads
    self.ff_hid = ff_hid
    self.chunk_size = chunk_size
    hop_size = hop_size if hop_size is not None else chunk_size // 2
    self.hop_size = hop_size
    self.n_repeats = n_repeats
    self.norm_type = norm_type
    self.ff_activation = ff_activation
    self.mask_act = mask_act
    self.bidirectional = bidirectional
    self.dropout = dropout

    self.in_norm = norms.get(norm_type)(in_chan)

    # Succession of dual-path transformer blocks (intra- then inter-chunk).
    self.layers = nn.ModuleList([])
    for x in range(self.n_repeats):
        self.layers.append(
            nn.ModuleList([
                ImprovedTransformedLayer(
                    self.in_chan,
                    self.n_heads,
                    self.ff_hid,
                    self.dropout,
                    self.ff_activation,
                    True,
                    self.norm_type,
                ),
                ImprovedTransformedLayer(
                    self.in_chan,
                    self.n_heads,
                    self.ff_hid,
                    self.dropout,
                    self.ff_activation,
                    self.bidirectional,
                    self.norm_type,
                ),
            ]))

    net_out_conv = nn.Conv2d(self.in_chan, n_src * self.in_chan, 1)
    self.first_out = nn.Sequential(nn.PReLU(), net_out_conv)
    # Gating and masking in 2D space (after fold)
    self.net_out = nn.Sequential(nn.Conv1d(self.in_chan, self.in_chan, 1), nn.Tanh())
    self.net_gate = nn.Sequential(nn.Conv1d(self.in_chan, self.in_chan, 1), nn.Sigmoid())

    # Get activation function.
    mask_nl_class = activations.get(mask_act)
    # For softmax, feed the source dimension.
    if has_arg(mask_nl_class, "dim"):
        self.output_act = mask_nl_class(dim=1)
    else:
        self.output_act = mask_nl_class()
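# Note on the output activation selection used by all of these constructors:
# `has_arg(mask_nl_class, "dim")` decides whether the activation is built with
# the source axis. A minimal stand-in for that helper (assumption: the real
# `has_arg` utility used here may differ) could be:
#
#   import inspect
#
#   def has_arg(fn, name):
#       # True if `name` is an explicit parameter of `fn`'s signature.
#       return name in inspect.signature(fn).parameters
#
# so e.g. activations.get("softmax") -> nn.Softmax is built as nn.Softmax(dim=1)
# (normalizing over the n_src axis), while nn.ReLU is built with no arguments.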
def __init__(
    self,
    in_chan,  # encoder output channels, e.g. 64
    n_src,
    out_chan=None,
    bn_chan=64,
    n_heads=4,
    ff_hid=256,
    rnn_hid=128,
    rnn_layers=1,
    pe_conv_k=3,
    chunk_size=100,
    hop_size=None,  # e.g. 50
    n_repeats=6,  # e.g. 2
    norm_type="gLN",
    ff_activation="relu",
    mask_act="relu",  # e.g. "sigmoid"
    bidirectional=True,
    dropout=0,
):
    super(DualTransformer, self).__init__()
    self.in_chan = in_chan
    out_chan = out_chan if out_chan is not None else in_chan
    self.out_chan = out_chan
    self.bn_chan = bn_chan
    self.n_src = n_src
    self.n_heads = n_heads
    self.ff_hid = ff_hid
    self.rnn_hid = rnn_hid
    self.rnn_layers = rnn_layers
    self.chunk_size = chunk_size
    hop_size = hop_size if hop_size is not None else chunk_size // 2
    self.hop_size = hop_size
    self.n_repeats = n_repeats
    self.norm_type = norm_type
    self.ff_activation = ff_activation
    self.mask_act = mask_act
    self.bidirectional = bidirectional
    self.dropout = dropout

    # gLN vs cLN: mean/var are computed over the whole sequence and all channels
    # (gLN) or per frame (cLN); gamma/beta stay per-channel in both cases.
    # self.in_norm = norms.get(norm_type)(in_chan)
    layer_norm = norms.get(norm_type)(in_chan)
    bottleneck_conv = nn.Conv1d(in_chan, bn_chan, 1)
    self.bottleneck = nn.Sequential(layer_norm, bottleneck_conv)

    pe_conv_list = []
    for i in range(pe_conv_k):
        pe_conv_list.append(
            nn.Conv2d(bn_chan, bn_chan, kernel_size=3, stride=1, padding=1, bias=False))
        pe_conv_list.append(norms.get(norm_type)(bn_chan))
        pe_conv_list.append(activations.get(ff_activation)())
    self.pe_conv = nn.Sequential(*pe_conv_list)

    d_model = self.bn_chan
    # Alternative: double the model dim for a positional embedding.
    # self.pe = PositionalEmbedding(in_chan)
    # d_model = self.in_chan * 2

    # Succession of dual-path blocks (RNN + transformer per repeat).
    self.layers = nn.ModuleList([])
    for x in range(self.n_repeats):
        self.layers.append(
            nn.ModuleList([
                # Alternatives kept for reference:
                # ImprovedTransformedLayer(d_model, self.n_heads, self.ff_hid, self.dropout,
                #                          self.ff_activation, True, self.norm_type),
                # ImprovedTransformedLayer(d_model, self.n_heads, self.ff_hid, self.dropout,
                #                          self.ff_activation, self.bidirectional, self.norm_type),
                SingleRNNBlock(
                    in_chan=d_model,
                    hid_size=self.rnn_hid,
                    norm_type=self.norm_type,
                    bidirectional=self.bidirectional,
                    rnn_type='LSTM',
                    num_layers=1,
                    dropout=self.dropout,
                ),
                # DualTransformedLayer(d_model, self.n_heads, self.ff_hid, self.dropout,
                #                      self.ff_activation, self.norm_type),
                AcousticTransformerLayer(
                    d_model,
                    self.n_heads,
                    self.ff_hid,
                    self.dropout,
                    self.ff_activation,
                    self.norm_type,
                ),
            ]))
    # Alternative: two DualTransformedLayer blocks per repeat.
    # self.layers.append(nn.ModuleList([
    #     DualTransformedLayer(d_model, self.n_heads, self.ff_hid, self.dropout,
    #                          self.ff_activation, self.norm_type),
    #     DualTransformedLayer(d_model, self.n_heads, self.ff_hid, self.dropout,
    #                          self.ff_activation, self.norm_type),
    # ]))

    # 1x1 convs for the output path (dims would be *2 with the positional embedding).
    self.strnn_norm_out = norms.get(norm_type)(self.bn_chan)
    net_out_conv = nn.Conv2d(d_model, n_src * self.bn_chan, 1)
    self.first_out = nn.Sequential(nn.PReLU(), net_out_conv)
    # Gating and masking in 2D space (after fold)
    self.net_out = nn.Sequential(nn.Conv1d(self.bn_chan, self.bn_chan, 1), nn.Tanh())
    self.net_gate = nn.Sequential(nn.Conv1d(self.bn_chan, self.bn_chan, 1), nn.Sigmoid())
    self.mask_net = nn.Conv1d(bn_chan, out_chan, 1, bias=False)

    # Get activation function.
    mask_nl_class = activations.get(mask_act)
    # For softmax, feed the source dimension.
    if has_arg(mask_nl_class, "dim"):
        self.output_act = mask_nl_class(dim=1)
    else:
        self.output_act = mask_nl_class()
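# Usage sketch (hedged): the values mirror the inline parameter comments above
# (bn_chan=64, hop_size=50, n_repeats=2, mask_act="sigmoid"); torch and the
# local `norms`, `activations`, `SingleRNNBlock` and `AcousticTransformerLayer`
# helpers are assumed to be imported elsewhere in this file.
#
#   masker = DualTransformer(in_chan=64, n_src=2, bn_chan=64, chunk_size=100,
#                            hop_size=50, n_repeats=2, mask_act="sigmoid")
#   tf_rep = torch.randn(4, 64, 1000)   # (batch, in_chan, n_frames)
#   est_masks = masker(tf_rep)          # expected shape: (batch, n_src, out_chan, n_frames)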