def __init__(self, enc_dim, resnet_type='18', nclasses=2):
        """Create the convolutional stem, residual stages and output layers.

        Args:
            enc_dim: output size of the ``fc`` layer, which is also the
                input size of ``fc_mu``.
            resnet_type: key into ``RESNET_CONFIGS``; the entry supplies the
                per-stage block counts and the residual block class.
            nclasses: output size of ``fc_mu``. Any value below 2 yields a
                single output unit.
        """
        # Presumably consumed by _make_layer as the running input channel
        # count; 16 equals the output channels of conv1 -- TODO confirm.
        self.in_planes = 16
        super(ResNet, self).__init__()

        stage_depths, block = RESNET_CONFIGS[resnet_type]

        self._norm_layer = torch_nn.BatchNorm2d

        # layer 1: stem convolution on a single-channel input
        self.conv1 = torch_nn.Conv2d(
            1, 16,
            kernel_size=(9, 3), stride=(3, 1), padding=(1, 1), bias=False)
        self.bn1 = torch_nn.BatchNorm2d(16)
        self.activation = torch_nn.ReLU()

        # four residual stages; every stage after the first uses stride 2
        self.layer1 = self._make_layer(block, 64, stage_depths[0], stride=1)
        self.layer2 = self._make_layer(block, 128, stage_depths[1], stride=2)
        self.layer3 = self._make_layer(block, 256, stage_depths[2], stride=2)
        self.layer4 = self._make_layer(block, 512, stage_depths[3], stride=2)

        # final convolution: reduces the last stage's channels to 256
        self.conv5 = torch_nn.Conv2d(
            512 * block.expansion, 256,
            kernel_size=(3, 3), stride=(1, 1), padding=(0, 1), bias=False)
        self.bn5 = torch_nn.BatchNorm2d(256)

        # fc expects 256 * 2 inputs -- presumably two pooled statistics per
        # channel coming from the attention pooling; TODO confirm
        self.fc = torch_nn.Linear(256 * 2, enc_dim)

        # at least two classes -> one output per class, otherwise one unit
        n_outputs = nclasses if nclasses >= 2 else 1
        self.fc_mu = torch_nn.Linear(enc_dim, n_outputs)

        # NOTE: the pooling layer is created only after initialize_params()
        # has run, so it is not part of the module tree at that point.
        self.initialize_params()
        self.attention = nii_nn.SelfWeightedPooling(256)
    def __init__(self, in_dim, out_dim, args, mean_std=None):
        """Set up normalization statistics, front-end(s) and classifier(s).

        One front-end / transform / pooling / output / angle module is built
        per front-end configuration; by default there is a single one.

        Args:
            in_dim: forwarded to ``prepare_mean_std``.
            out_dim: forwarded to ``prepare_mean_std``.
            args: forwarded to ``prepare_mean_std``.
            mean_std: optional value forwarded to ``prepare_mean_std``.
        """
        super(Model, self).__init__()

        ##### required part, no need to change #####
        # mean / std of input and output, kept as non-trainable parameters
        in_m, in_s, out_m, out_s = self.prepare_mean_std(
            in_dim, out_dim, args, mean_std)
        self.input_mean = torch_nn.Parameter(in_m, requires_grad=False)
        self.input_std = torch_nn.Parameter(in_s, requires_grad=False)
        self.output_mean = torch_nn.Parameter(out_m, requires_grad=False)
        self.output_std = torch_nn.Parameter(out_s, requires_grad=False)
        #####

        ####
        # input waveform and output target
        ####
        # The protocol file is the first optional argument of the project
        # configuration; the parsed protocol is kept for preparing targets.
        protocol_file = prj_conf.optional_argument[0]
        self.protocol_parser = protocol_parse(protocol_file)

        # working sampling rate
        # (optional re-sampling / VAD through torchaudio is not enabled here)
        self.m_target_sr = 16000

        ####
        # front-end configuration
        #  one list entry per front-end; a single front-end by default
        ####
        # frame shift (number of waveform points)
        self.frame_hops = [160]
        # frame length
        self.frame_lens = [320]
        # FFT length
        self.fft_n = [512]

        # LFB dim (base component), optionally extended with deltas
        self.lfb_dim = [60]
        self.lfb_with_delta = False

        # window type
        self.win = torch.hann_window
        # floor in log-spectrum-amplitude calculating (not used)
        self.amp_floor = 0.00001

        # number of frames to be kept for each trial: None = no truncation
        self.v_truncate_lens = [None] * len(self.frame_hops)

        # number of sub-models (by default, a single model)
        self.v_submodels = len(self.frame_lens)

        # dimension of embedding vectors
        self.v_emd_dim = 64

        # output classes
        self.v_out_class = 2

        ####
        # create network
        ####
        frontends = []       # front-end feature extractors
        transforms = []      # 1st part of the classifier
        poolings = []        # pooling layers
        output_layers = []   # 2nd part of the classifier
        angle_layers = []    # final part for the output layer

        mfm = nii_nn.MaxFeatureMap2D

        # The truncation length and FFT size are zipped in (so they still
        # bound the number of iterations) but are not needed in the loop body.
        front_end_configs = zip(self.v_truncate_lens, self.fft_n, self.lfb_dim)
        for idx, (_, _, feat_dim) in enumerate(front_end_configs):

            if self.lfb_with_delta:
                feat_dim = feat_dim * 3

            # Four 2x2 max-poolings and 32 channels after the last
            # MaxFeatureMap2D -- presumably the flattened per-frame size of
            # the transform output; TODO confirm.
            pooled_dim = (feat_dim // 16) * 32

            # Every convolution is followed by MaxFeatureMap2D, and the layer
            # after it takes half of that convolution's output channels.
            lcnn_layers = [
                torch_nn.Conv2d(1, 64, [5, 5], 1, padding=[2, 2]),
                mfm(),
                torch_nn.MaxPool2d([2, 2], [2, 2]),

                torch_nn.Conv2d(32, 64, [1, 1], 1, padding=[0, 0]),
                mfm(),
                torch_nn.BatchNorm2d(32, affine=False),
                torch_nn.Conv2d(32, 96, [3, 3], 1, padding=[1, 1]),
                mfm(),
                torch_nn.MaxPool2d([2, 2], [2, 2]),
                torch_nn.BatchNorm2d(48, affine=False),

                torch_nn.Conv2d(48, 96, [1, 1], 1, padding=[0, 0]),
                mfm(),
                torch_nn.BatchNorm2d(48, affine=False),
                torch_nn.Conv2d(48, 128, [3, 3], 1, padding=[1, 1]),
                mfm(),
                torch_nn.MaxPool2d([2, 2], [2, 2]),

                torch_nn.Conv2d(64, 128, [1, 1], 1, padding=[0, 0]),
                mfm(),
                torch_nn.BatchNorm2d(64, affine=False),
                torch_nn.Conv2d(64, 64, [3, 3], 1, padding=[1, 1]),
                mfm(),
                torch_nn.BatchNorm2d(32, affine=False),

                torch_nn.Conv2d(32, 64, [1, 1], 1, padding=[0, 0]),
                mfm(),
                torch_nn.BatchNorm2d(32, affine=False),
                torch_nn.Conv2d(32, 64, [3, 3], 1, padding=[1, 1]),
                mfm(),
                torch_nn.MaxPool2d([2, 2], [2, 2]),

                torch_nn.Dropout(0.7),
            ]
            transforms.append(torch_nn.Sequential(*lcnn_layers))

            poolings.append(nii_nn.SelfWeightedPooling(pooled_dim))

            # the linear layer takes twice the pooled dimension as input
            output_layers.append(
                torch_nn.Linear(pooled_dim * 2, self.v_emd_dim))

            angle_layers.append(
                nii_amsoftmax.AMAngleLayer(
                    self.v_emd_dim, self.v_out_class, s=10, m=0.35))

            # the front-end receives the base LFB dim; with_delta is passed
            # separately
            frontends.append(
                nii_front_end.LFB(
                    self.frame_lens[idx],
                    self.frame_hops[idx],
                    self.fft_n[idx],
                    self.m_target_sr,
                    self.lfb_dim[idx],
                    with_energy=False,
                    with_emphasis=True,
                    with_delta=self.lfb_with_delta))

        # register the sub-modules (registration order is kept on purpose)
        self.m_frontend = torch_nn.ModuleList(frontends)
        self.m_transform = torch_nn.ModuleList(transforms)
        self.m_output_act = torch_nn.ModuleList(output_layers)
        self.m_pooling = torch_nn.ModuleList(poolings)
        self.m_angle = torch_nn.ModuleList(angle_layers)