Example #1
    def __init__(self,
                 num_classes,
                 embedding_size,
                 input_dim=24,
                 dropout_p=0.0,
                 **kwargs):
        super(ASTDNN, self).__init__()
        self.num_classes = num_classes
        self.dropout_p = dropout_p
        self.input_dim = input_dim

        self.frame1 = TimeDelayLayer_v2(input_dim=self.input_dim,
                                        output_dim=512,
                                        context_size=5,
                                        dilation=1,
                                        dropout_p=dropout_p)
        self.frame2 = TimeDelayLayer_v2(input_dim=512,
                                        output_dim=512,
                                        context_size=3,
                                        dilation=2,
                                        dropout_p=dropout_p)
        self.frame3 = TimeDelayLayer_v2(input_dim=512,
                                        output_dim=512,
                                        context_size=3,
                                        dilation=3,
                                        dropout_p=dropout_p)
        self.frame4 = TimeDelayLayer_v2(input_dim=512,
                                        output_dim=512,
                                        context_size=1,
                                        dilation=1,
                                        dropout_p=dropout_p)
        self.frame5 = TimeDelayLayer_v2(input_dim=512,
                                        output_dim=1500,
                                        context_size=1,
                                        dilation=1,
                                        dropout_p=dropout_p)

        self.attention_statistic = AttentionStatisticPooling(input_dim=1500,
                                                             hidden_dim=64)

        self.segment6 = nn.Sequential(nn.Linear(3000, 512), nn.ReLU(),
                                      nn.BatchNorm1d(512))

        self.segment7 = nn.Sequential(nn.Linear(512,
                                                embedding_size), nn.ReLU(),
                                      nn.BatchNorm1d(embedding_size))

        self.classifier = nn.Linear(embedding_size, num_classes)
        self.drop = nn.Dropout(p=self.dropout_p)

        # self.out_act = nn.Sigmoid()
        # self.relu = nn.LeakyReLU()
        for m in self.modules():  # initialize the parameters of each layer
            if isinstance(m, nn.BatchNorm1d):  # set weight to 1 and bias to 0
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, TimeDelayLayer_v2):
                nn.init.kaiming_normal_(m.kernel.weight,
                                        mode='fan_out',
                                        nonlinearity='relu')
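
A minimal usage sketch for the ASTDNN example above, assuming the class and its layers are importable from the surrounding module; the argument values and the (batch, frames, features) input layout are illustrative assumptions, not taken from the source.

# Hypothetical usage sketch; ASTDNN is the class whose __init__ is shown above.
import torch

model = ASTDNN(num_classes=1211, embedding_size=512, input_dim=24, dropout_p=0.1)
model.eval()

# Assumed input layout: (batch, frames, feature_dim) with 24-dim acoustic features.
x = torch.randn(4, 300, 24)
# embedding = model(x)  # assuming the class defines forward() over this layout

# The five TimeDelayLayer_v2 layers map 24 -> 512 -> ... -> 1500 channels,
# AttentionStatisticPooling concatenates mean and std (1500 * 2 = 3000), and
# segment6/segment7 project down to the embedding_size-dim speaker embedding.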
Example #2
    def __init__(self,
                 num_classes,
                 embedding_size,
                 input_dim,
                 alpha=0.,
                 input_norm='',
                 channels=[512, 512, 512, 512, 512, 1536],
                 context=[5, 3, 3, 5],
                 downsample=None,
                 resnet_size=17,
                 stride=[1],
                 dropout_p=0.0,
                 dropout_layer=False,
                 encoder_type='STAP',
                 block_type='Basic',
                 mask='None',
                 mask_len=20,
                 **kwargs):
        super(RET_v2, self).__init__()
        self.num_classes = num_classes
        self.dropout_p = dropout_p
        self.dropout_layer = dropout_layer
        self.input_dim = input_dim
        self.alpha = alpha
        self.mask = mask
        self.channels = channels
        self.context = context
        self.stride = stride
        if len(self.stride) == 1:
            while len(self.stride) < 4:
                self.stride.append(self.stride[0])

        self.tdnn_size = resnet_size
        tdnn_type = {14: [1, 1, 1, 0], 17: [1, 1, 1, 1]}
        self.layers = tdnn_type.get(resnet_size, tdnn_type[17])

        if input_norm == 'Instance':
            self.inst_layer = nn.InstanceNorm1d(input_dim)
        elif input_norm == 'Mean':
            self.inst_layer = Mean_Norm()
        else:
            self.inst_layer = None

        if self.mask == "time":
            self.mask_layer = TimeMaskLayer(mask_len=mask_len)
        elif self.mask == "freq":
            self.mask_layer = FreqMaskLayer(mask_len=mask_len)
        elif self.mask == "time_freq":
            self.mask_layer = nn.Sequential(TimeMaskLayer(mask_len=mask_len),
                                            FreqMaskLayer(mask_len=mask_len))
        else:
            self.mask_layer = None

        TDNN_layer = TimeDelayLayer_v5
        if block_type == 'Basic':
            Blocks = TDNNBlock
        elif block_type == 'Basic_v6':
            Blocks = TDNNBlock_v6
            TDNN_layer = TimeDelayLayer_v6
        elif block_type == 'Agg':
            Blocks = TDNNBottleBlock
        elif block_type == 'cbam':
            Blocks = TDNNCBAMBlock
        else:
            raise ValueError(block_type)

        self.frame1 = TDNN_layer(input_dim=self.input_dim,
                                 output_dim=self.channels[0],
                                 context_size=5,
                                 dilation=1,
                                 stride=self.stride[0])
        self.frame2 = self._make_block(block=Blocks,
                                       inplanes=self.channels[0],
                                       planes=self.channels[0],
                                       downsample=downsample,
                                       dilation=1,
                                       blocks=self.layers[0])

        self.frame4 = TDNN_layer(input_dim=self.channels[0],
                                 output_dim=self.channels[1],
                                 context_size=3,
                                 dilation=1,
                                 stride=self.stride[1])
        self.frame5 = self._make_block(block=Blocks,
                                       inplanes=self.channels[1],
                                       planes=self.channels[1],
                                       downsample=downsample,
                                       dilation=1,
                                       blocks=self.layers[1])

        self.frame7 = TDNN_layer(input_dim=self.channels[1],
                                 output_dim=self.channels[2],
                                 context_size=3,
                                 dilation=1,
                                 stride=self.stride[2])
        self.frame8 = self._make_block(block=Blocks,
                                       inplanes=self.channels[2],
                                       planes=self.channels[2],
                                       downsample=downsample,
                                       dilation=1,
                                       blocks=self.layers[2])

        if self.layers[3] != 0:
            self.frame10 = TDNN_layer(input_dim=self.channels[2],
                                      output_dim=self.channels[3],
                                      context_size=5,
                                      dilation=1,
                                      stride=self.stride[3])
            self.frame11 = self._make_block(block=Blocks,
                                            inplanes=self.channels[3],
                                            planes=self.channels[3],
                                            downsample=downsample,
                                            dilation=1,
                                            blocks=self.layers[3])

        self.frame13 = TDNN_layer(input_dim=self.channels[3],
                                  output_dim=self.channels[4],
                                  context_size=1,
                                  dilation=1)
        self.frame14 = TDNN_layer(input_dim=self.channels[4],
                                  output_dim=self.channels[5],
                                  context_size=1,
                                  dilation=1)

        self.drop = nn.Dropout(p=self.dropout_p)

        if encoder_type == 'STAP':
            self.encoder = StatisticPooling(input_dim=self.channels[5])
        elif encoder_type == 'SASP':
            self.encoder = AttentionStatisticPooling(
                input_dim=self.channels[5], hidden_dim=512)
        else:
            raise ValueError(encoder_type)

        self.segment1 = nn.Sequential(nn.Linear(self.channels[5] * 2, 512),
                                      nn.ReLU(), nn.BatchNorm1d(512))

        self.segment2 = nn.Sequential(nn.Linear(512,
                                                embedding_size), nn.ReLU(),
                                      nn.BatchNorm1d(embedding_size))

        if self.alpha:
            self.l2_norm = L2_Norm(self.alpha)

        self.classifier = nn.Linear(embedding_size, num_classes)
        # self.bn = nn.BatchNorm1d(num_classes)

        for m in self.modules():  # initialize the parameters of each layer
            if isinstance(m, nn.BatchNorm1d):  # set weight to 1 and bias to 0
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, TimeDelayLayer_v5):
                # nn.init.normal(m.kernel.weight, mean=0., std=1.)
                nn.init.kaiming_normal_(m.kernel.weight,
                                        mode='fan_out',
                                        nonlinearity='relu')
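
For the RET_v2 example, a hedged configuration sketch showing how the options handled in __init__ above map to concrete arguments; the specific values (80-dim input, 'Basic' blocks, STAP pooling, 5994 classes) are assumptions chosen only for illustration.

# Hypothetical configuration; the option strings are exactly those branched on above.
model = RET_v2(num_classes=5994,
               embedding_size=256,
               input_dim=80,
               input_norm='Mean',    # selects Mean_Norm() as the input normalizer
               resnet_size=17,       # -> layers = [1, 1, 1, 1], so frame10/frame11 are built
               block_type='Basic',   # -> TDNNBlock built from TimeDelayLayer_v5
               encoder_type='STAP',  # -> StatisticPooling over channels[5] = 1536
               stride=[1],           # a single value is repeated to [1, 1, 1, 1]
               dropout_p=0.0)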
Example #3
    def __init__(self,
                 num_classes,
                 embedding_size,
                 input_dim,
                 alpha=0.,
                 input_norm='',
                 filter=None,
                 sr=16000,
                 feat_dim=64,
                 exp=False,
                 filter_fix=False,
                 dropout_p=0.0,
                 dropout_layer=False,
                 encoder_type='STAP',
                 num_classes_b=0,
                 block_type='basic',
                 first_2d=False,
                 stride=[1],
                 mask='None',
                 mask_len=20,
                 channels=[512, 512, 512, 512, 1500],
                 **kwargs):
        super(TDNN_v5, self).__init__()
        self.num_classes = num_classes
        self.num_classes_b = num_classes_b
        self.dropout_p = dropout_p
        self.dropout_layer = dropout_layer
        self.input_dim = input_dim
        self.channels = channels
        self.alpha = alpha
        self.mask = mask
        self.filter = filter
        self.feat_dim = feat_dim
        self.block_type = block_type.lower()
        self.stride = stride
        if len(self.stride) == 1:
            while len(self.stride) < 4:
                self.stride.append(self.stride[0])
        if np.sum(self.stride) > 4:
            print('The strides for the tdnn layers are:', self.stride)

        if self.filter == 'fDLR':
            self.filter_layer = fDLR(input_dim=input_dim,
                                     sr=sr,
                                     num_filter=feat_dim,
                                     exp=exp,
                                     filter_fix=filter_fix)
        elif self.filter == 'fBLayer':
            self.filter_layer = fBLayer(input_dim=input_dim,
                                        sr=sr,
                                        num_filter=feat_dim,
                                        exp=exp,
                                        filter_fix=filter_fix)
        elif self.filter == 'fBPLayer':
            self.filter_layer = fBPLayer(input_dim=input_dim,
                                         sr=sr,
                                         num_filter=feat_dim,
                                         exp=exp,
                                         filter_fix=filter_fix)
        elif self.filter == 'fLLayer':
            self.filter_layer = fLLayer(input_dim=input_dim,
                                        num_filter=feat_dim,
                                        exp=exp)
        elif self.filter == 'Avg':
            self.filter_layer = nn.AvgPool2d(kernel_size=(1, 7), stride=(1, 3))
        else:
            self.filter_layer = None

        if input_norm == 'Instance':
            self.inst_layer = nn.InstanceNorm1d(input_dim)
        elif input_norm == 'Mean':
            self.inst_layer = Mean_Norm()
        else:
            self.inst_layer = None

        if self.mask == "time":
            self.mask_layer = TimeMaskLayer(mask_len=mask_len)
        elif self.mask == "freq":
            self.mask_layer = FreqMaskLayer(mask_len=mask_len)
        elif self.mask == "time_freq":
            self.mask_layer = nn.Sequential(TimeMaskLayer(mask_len=mask_len),
                                            FreqMaskLayer(mask_len=mask_len))
        else:
            self.mask_layer = None

        if self.filter_layer is not None:
            self.input_dim = feat_dim
        if self.block_type == 'basic':
            TDlayer = TimeDelayLayer_v5
        elif self.block_type == 'basic_v6':
            TDlayer = TimeDelayLayer_v6
        elif self.block_type == 'shuffle':
            TDlayer = ShuffleTDLayer
        else:
            raise ValueError(self.block_type)

        if not first_2d:
            self.frame1 = TimeDelayLayer_v5(input_dim=self.input_dim,
                                            output_dim=self.channels[0],
                                            context_size=5,
                                            stride=self.stride[0],
                                            dilation=1)
        else:
            self.frame1 = Conv2DLayer(input_dim=self.input_dim,
                                      output_dim=self.channels[0],
                                      stride=self.stride[0])
        self.frame2 = TDlayer(input_dim=self.channels[0],
                              output_dim=self.channels[1],
                              context_size=3,
                              stride=self.stride[1],
                              dilation=2)
        self.frame3 = TDlayer(input_dim=self.channels[1],
                              output_dim=self.channels[2],
                              context_size=3,
                              stride=self.stride[2],
                              dilation=3)
        self.frame4 = TDlayer(input_dim=self.channels[2],
                              output_dim=self.channels[3],
                              context_size=1,
                              stride=self.stride[0],
                              dilation=1)
        self.frame5 = TimeDelayLayer_v5(input_dim=self.channels[3],
                                        output_dim=self.channels[4],
                                        context_size=1,
                                        stride=self.stride[3],
                                        dilation=1)

        self.drop = nn.Dropout(p=self.dropout_p)

        if encoder_type == 'STAP':
            self.encoder = StatisticPooling(input_dim=self.channels[4])
            self.encoder_output = self.channels[4] * 2
        elif encoder_type == 'ASP':
            self.encoder = AttentionStatisticPooling(
                input_dim=self.channels[4], hidden_dim=self.channels[4])
            self.encoder_output = self.channels[4] * 2
        elif encoder_type == 'SAP':
            self.encoder = SelfAttentionPooling(input_dim=self.channels[4],
                                                hidden_dim=self.channels[4])
            self.encoder_output = self.channels[4]
        elif encoder_type == 'Ghos_v3':
            self.encoder = GhostVLAD_v3(num_clusters=self.num_classes_b,
                                        gost=1,
                                        dim=self.channels[4])
            self.encoder_output = self.channels[4] * 2
        else:
            raise ValueError(encoder_type)

        self.segment6 = nn.Sequential(nn.Linear(self.encoder_output, 512),
                                      nn.ReLU(), nn.BatchNorm1d(512))

        self.segment7 = nn.Sequential(nn.Linear(512,
                                                embedding_size), nn.ReLU(),
                                      nn.BatchNorm1d(embedding_size))

        if self.alpha:
            self.l2_norm = L2_Norm(self.alpha)

        self.classifier = nn.Linear(embedding_size, num_classes)
        # self.bn = nn.BatchNorm1d(num_classes)

        for m in self.modules():  # initialize the parameters of each layer
            if isinstance(m, nn.BatchNorm1d):  # set weight to 1 and bias to 0
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, TimeDelayLayer_v5):
                # nn.init.normal(m.kernel.weight, mean=0., std=1.)
                nn.init.kaiming_normal_(m.kernel.weight,
                                        mode='fan_out',
                                        nonlinearity='relu')
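
The TDNN_v5 example differs mainly in its optional learnable filter front-ends (fDLR, fBLayer, fBPLayer, fLLayer, Avg) placed before the TDNN stack. A hedged sketch of one such configuration; the 161-bin input and the other values are assumptions for illustration.

# Hypothetical configuration; the filter/encoder names are those handled in __init__ above.
model = TDNN_v5(num_classes=1211,
                embedding_size=512,
                input_dim=161,        # e.g. linear-spectrogram bins (assumed)
                filter='fDLR',        # learnable front-end; feat_dim then becomes
                feat_dim=64,          #   the effective input_dim of frame1
                sr=16000,
                input_norm='Mean',
                block_type='basic',   # TimeDelayLayer_v5 for frame2..frame4
                encoder_type='STAP',  # StatisticPooling -> encoder_output = channels[4] * 2
                channels=[512, 512, 512, 512, 1500])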
Example #4
    def __init__(self,
                 num_classes,
                 embedding_size,
                 input_dim,
                 alpha=0.,
                 input_norm='',
                 dropout_p=0.0,
                 encoder_type='STAP',
                 **kwargs):
        super(TDNN_v4, self).__init__()
        self.num_classes = num_classes
        self.dropout_p = dropout_p
        self.input_dim = input_dim
        self.alpha = alpha

        if input_norm == 'Instance':
            self.inst_layer = nn.InstanceNorm1d(input_dim)
        elif input_norm == 'Mean':
            self.inst_layer = Mean_Norm()
        else:
            self.inst_layer = None

        self.frame1 = TimeDelayLayer_v4(input_dim=self.input_dim,
                                        output_dim=512,
                                        context_size=5,
                                        dilation=1)
        self.frame2 = TimeDelayLayer_v4(input_dim=512,
                                        output_dim=512,
                                        context_size=3,
                                        dilation=2)
        self.frame3 = TimeDelayLayer_v4(input_dim=512,
                                        output_dim=512,
                                        context_size=3,
                                        dilation=3)
        self.frame4 = TimeDelayLayer_v4(input_dim=512,
                                        output_dim=512,
                                        context_size=1,
                                        dilation=1)
        self.frame5 = TimeDelayLayer_v4(input_dim=512,
                                        output_dim=1500,
                                        context_size=1,
                                        dilation=1)
        self.drop = nn.Dropout(p=self.dropout_p)
        if encoder_type == 'STAP':
            self.encoder = StatisticPooling(input_dim=1500)
        elif encoder_type == 'SASP':
            self.encoder = AttentionStatisticPooling(input_dim=1500,
                                                     hidden_dim=512)
        else:
            raise ValueError(encoder_type)

        self.segment6 = nn.Sequential(nn.Linear(3000, 512), nn.ReLU(),
                                      nn.BatchNorm1d(512))

        self.segment7 = nn.Sequential(nn.Linear(512,
                                                embedding_size), nn.ReLU(),
                                      nn.BatchNorm1d(embedding_size))

        if self.alpha:
            self.l2_norm = L2_Norm(self.alpha)

        self.classifier = nn.Linear(embedding_size, num_classes)
        # self.bn = nn.BatchNorm1d(num_classes)

        for m in self.modules():  # initialize the parameters of each layer
            if isinstance(m, nn.BatchNorm1d):  # set weight to 1 and bias to 0
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, TimeDelayLayer_v4):
                # nn.init.normal(m.kernel.weight, mean=0., std=1.)
                nn.init.kaiming_normal_(m.kernel.weight,
                                        mode='fan_out',
                                        nonlinearity='relu')
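
TDNN_v4 is the plainest of the four examples: a fixed 512/1500-channel x-vector-style stack with either statistics pooling or attentive statistics pooling. A minimal hedged instantiation; the values below are illustrative assumptions.

# Hypothetical usage; TDNN_v4 is the class whose __init__ is shown above.
model = TDNN_v4(num_classes=1211,
                embedding_size=512,
                input_dim=40,         # e.g. 40-dim filterbanks (assumed)
                input_norm='Mean',
                encoder_type='SASP',  # AttentionStatisticPooling(input_dim=1500, hidden_dim=512)
                dropout_p=0.0)
# segment6 maps the pooled 3000-dim statistics (mean + std of 1500 channels) to 512,
# segment7 projects to the embedding, and self.classifier produces num_classes logits.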