def __init__(self, feat_dim, output_dim):
    """Build a deep TDNN-F acoustic model with separate chain and
    cross-entropy (xent) output heads.

    Args:
        feat_dim: dimension of the input acoustic features.
        output_dim: number of output units (shared by both heads).
    """
    super().__init__()

    def get_tdnnf_layer(input_dim, layer_dim):
        # One "plain" hidden block: TDNNF (context 3, orthonormal
        # constraint -1.0, 0.75 bypass, 160-dim bottleneck) + dropout.
        return nn.Sequential(
            TDNNFBatchNorm(
                input_dim,
                layer_dim,
                context_len=3,
                orthonormal_constraint=-1.0,
                bypass_scale=0.75,
                bottleneck_dim=160,
            ),
            nn.Dropout(0.1),
        )

    def make_head():
        # The chain and xent heads were duplicated verbatim in the
        # original; both are 256 -> 1536 -> 256 orthonormal bottlenecks
        # followed by the final affine output.
        head = nn.Sequential(
            OrthonormalLinear(256, 1536, scale=-1.0),
            nn.Dropout(0.1),
            OrthonormalLinear(1536, 256, scale=-1.0),
            nn.Dropout(0.1),
            NaturalAffineTransform(256, output_dim),
        )
        # Zero-init the final affine so training starts from a neutral output.
        head[-1].weight.data.zero_()
        head[-1].bias.data.zero_()
        return head

    self.tdnnf_layers = nn.Sequential(
        # First layer maps features to the 1536-dim hidden space
        # (no bypass_scale override here, matching the original config).
        TDNNFBatchNorm(feat_dim, 1536, context_len=3,
                       orthonormal_constraint=-1.0, bottleneck_dim=160),
        nn.Dropout(0.1),
        # Layers 2-3.
        *[get_tdnnf_layer(1536, 1536) for _ in range(2, 4)],
        # Layer 4 subsamples the time axis by 3.
        TDNNFBatchNorm(1536, 1536, context_len=3, subsampling_factor=3,
                       orthonormal_constraint=-1.0, bottleneck_dim=160),
        nn.Dropout(0.1),
        # Layers 5-17.
        *[get_tdnnf_layer(1536, 1536) for _ in range(5, 18)],
        # Project down to the 256-dim embedding shared by both heads.
        OrthonormalLinear(1536, 256, scale=-1.0),
        nn.Dropout(0.1),
    )

    # Heads are created chain-first, exactly as in the original, so any
    # RNG-dependent parameter initialization order is preserved.
    self.chain_layers = make_head()
    self.xent_layers = make_head()
    self.output_dim = output_dim
def __init__(self, feat_dim, output_dim):
    """TDNN-F + LSTM hybrid model: six TDNNF layers (one with 3x time
    subsampling), a 2-layer unidirectional LSTM, and separate chain /
    xent output heads.

    Args:
        feat_dim: input feature dimension.
        output_dim: number of output units for both heads.
    """
    super().__init__()
    self.input_dim = feat_dim
    self.output_dim = output_dim
    # BUGFIX: the keyword was misspelled "orthornomal_constraint" in all
    # eight calls below. TDNNFBatchNorm (as used elsewhere in this file)
    # takes "orthonormal_constraint"; the misspelling would raise a
    # TypeError, or be silently ignored if the constructor accepts
    # **kwargs — either way the constraint was not applied as intended.
    self.tdnn = nn.Sequential(
        TDNNFBatchNorm(feat_dim, 512, 160, context_len=5, orthonormal_constraint=-1.0),
        TDNNFBatchNorm(512, 512, 160, context_len=3, orthonormal_constraint=-1.0),
        # This layer subsamples the time axis by 3.
        TDNNFBatchNorm(512, 512, 160, context_len=3, subsampling_factor=3, orthonormal_constraint=-1.0),
        TDNNFBatchNorm(512, 512, 160, context_len=3, orthonormal_constraint=-1.0),
        TDNNFBatchNorm(512, 512, 160, context_len=3, orthonormal_constraint=-1.0),
        TDNNFBatchNorm(512, 512, 160, context_len=3, orthonormal_constraint=-1.0),
    )
    self.lstm = nn.LSTM(512, 256, 2, batch_first=True)
    self.chain = nn.Sequential(
        TDNNFBatchNorm(256, 256, 160, context_len=1, orthonormal_constraint=-1.0),
        NaturalAffineTransform(256, output_dim),
    )
    self.xent = nn.Sequential(
        TDNNFBatchNorm(256, 256, 160, context_len=1, orthonormal_constraint=-1.0),
        NaturalAffineTransform(256, output_dim),
    )
    # Zero-init the final affines so both heads start neutral.
    self.chain[-1].weight.data.zero_()
    self.chain[-1].bias.data.zero_()
    self.xent[-1].weight.data.zero_()
    self.xent[-1].bias.data.zero_()
def __init__(self, feat_dim, output_dim):
    """Bidirectional-LSTM model: one BLSTM over raw features, a 4-layer
    BLSTM stack, and separate chain / xent output heads.

    Args:
        feat_dim: input feature dimension.
        output_dim: number of output units for both heads.
    """
    super(Net, self).__init__()
    self.input_dim = feat_dim
    self.output_dim = output_dim
    # Both BLSTMs use 256 units per direction, so each emits 512-dim frames.
    self.init_blstm = nn.LSTM(feat_dim, 256, 1,
                              batch_first=True, bidirectional=True)
    self.final_blstm = nn.LSTM(512, 256, 4,
                               batch_first=True, bidirectional=True)
    # The two heads are structurally identical; build them in a loop
    # (chain first, then xent, preserving parameter-creation order).
    for head_name in ("chain", "xent"):
        head = nn.Sequential(
            TDNNFBatchNorm(512, 256, 160, context_len=1,
                           orthonormal_constraint=-1.0),
            NaturalAffineTransform(256, output_dim),
        )
        # Final affine starts zeroed for a neutral initial output.
        head[-1].weight.data.zero_()
        head[-1].bias.data.zero_()
        setattr(self, head_name, head)
def get_tdnnf_layer(input_dim, layer_dim):
    """Return one hidden TDNNF block followed by 10% dropout.

    The block uses a context length of 3, orthonormal constraint -1.0,
    a 0.75 bypass scale, and a 160-dim bottleneck.

    Args:
        input_dim: dimension of the incoming frames.
        layer_dim: output dimension of the TDNNF layer.
    """
    tdnnf = TDNNFBatchNorm(
        input_dim,
        layer_dim,
        context_len=3,
        orthonormal_constraint=-1.0,
        bypass_scale=0.75,
        bottleneck_dim=160,
    )
    dropout = nn.Dropout(0.1)
    return nn.Sequential(tdnnf, dropout)
def __init__(self, feat_dim, output_dim, hidden_dim=1024, bottleneck_dim=128,
             prefinal_bottleneck_dim=256,
             kernel_size_list=(3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3),
             subsampling_factor_list=(1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1),
             frame_subsampling_factor=3, p_dropout=0.1):
    """Factored TDNN (TDNN-F) chain model with separate chain/xent heads.

    Args:
        feat_dim: input feature dimension.
        output_dim: number of output units for both output heads.
        hidden_dim: width of every TDNNF layer.
        bottleneck_dim: bottleneck dimension inside each hidden TDNNF layer.
        prefinal_bottleneck_dim: bottleneck of the two prefinal layers.
        kernel_size_list: per-layer context length (one entry per layer).
        subsampling_factor_list: per-layer time subsampling factor.
        frame_subsampling_factor: overall output subsampling; only 3 is
            supported at present.
        p_dropout: dropout probability after each TDNNF layer.

    Raises:
        ValueError: if frame_subsampling_factor != 3, or the kernel and
            subsampling lists differ in length.
    """
    super().__init__()
    # BUGFIX: the list defaults were mutable ([...]); tuples are safe
    # shared defaults and behave identically here, since the sequences
    # are only read (len() and indexing). Explicit raises replace the
    # original asserts, which disappear under `python -O`.
    if frame_subsampling_factor != 3:
        raise ValueError("only frame_subsampling_factor == 3 is supported")
    if len(kernel_size_list) != len(subsampling_factor_list):
        raise ValueError(
            "kernel_size_list and subsampling_factor_list must have the same length"
        )
    num_layers = len(kernel_size_list)
    input_dim = feat_dim
    # input_dim = feat_dim * 3 + ivector_dim
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.output_subsampling = frame_subsampling_factor
    # Manually calculated padding for the full context of the stack.
    self.padding = 27
    self.frame_subsampling_factor = frame_subsampling_factor
    self.tdnn1 = TDNNFBatchNorm(
        input_dim,
        hidden_dim,
        bottleneck_dim=bottleneck_dim,
        context_len=kernel_size_list[0],
        subsampling_factor=subsampling_factor_list[0],
        orthonormal_constraint=-1.0,
    )
    self.dropout1 = nn.Dropout(p_dropout)
    tdnnfs = []
    for i in range(1, num_layers):
        layer = TDNNFBatchNorm(
            hidden_dim,
            hidden_dim,
            bottleneck_dim=bottleneck_dim,
            context_len=kernel_size_list[i],
            subsampling_factor=subsampling_factor_list[i],
            orthonormal_constraint=-1.0,
        )
        tdnnfs.append(layer)
        tdnnfs.append(nn.Dropout(p_dropout))
    # tdnnfs requires [N, C, T]
    self.tdnnfs = nn.ModuleList(tdnnfs)
    # prefinal_l affine requires [N, C, T]
    self.prefinal_chain = TDNNFBatchNorm(
        hidden_dim,
        hidden_dim,
        bottleneck_dim=prefinal_bottleneck_dim,
        context_len=1,
        orthonormal_constraint=-1.0,
    )
    self.prefinal_xent = TDNNFBatchNorm(
        hidden_dim,
        hidden_dim,
        bottleneck_dim=prefinal_bottleneck_dim,
        context_len=1,
        orthonormal_constraint=-1.0,
    )
    # Zero-init both output affines so training starts from neutral outputs.
    self.chain_output = pkwrap.nn.NaturalAffineTransform(hidden_dim, output_dim)
    self.chain_output.weight.data.zero_()
    self.chain_output.bias.data.zero_()
    self.xent_output = pkwrap.nn.NaturalAffineTransform(hidden_dim, output_dim)
    self.xent_output.weight.data.zero_()
    self.xent_output.bias.data.zero_()
    self.validate_model()