def __init__(self, enc_dim, resnet_type='18', nclasses=2):
    """Create the layers of the network.

    Args:
        enc_dim: number of output features of ``self.fc`` and number of
            input features of ``self.fc_mu``.
        resnet_type: key into ``RESNET_CONFIGS``. The entry is unpacked
            as a ``(layers, block)`` pair: ``layers[i]`` is forwarded to
            ``self._make_layer`` for stage ``i + 1`` and
            ``block.expansion`` scales the channel count fed to
            ``self.conv5``.
        nclasses: number of output features of ``self.fc_mu``; any
            value below 2 yields a single output unit instead.
    """
    # Assigned ahead of the parent constructor, as a plain attribute.
    # NOTE(review): presumably read by self._make_layer -- confirm.
    self.in_planes = 16
    super(ResNet, self).__init__()

    stage_cfg, block_cls = RESNET_CONFIGS[resnet_type]
    self._norm_layer = torch_nn.BatchNorm2d

    # layer 1: first convolution, takes a single input channel
    self.conv1 = torch_nn.Conv2d(
        1, 16,
        kernel_size=(9, 3), stride=(3, 1), padding=(1, 1), bias=False)
    self.bn1 = torch_nn.BatchNorm2d(16)
    self.activation = torch_nn.ReLU()

    # four stages built by self._make_layer; stages 2-4 use stride 2
    self.layer1 = self._make_layer(block_cls, 64, stage_cfg[0], stride=1)
    self.layer2 = self._make_layer(block_cls, 128, stage_cfg[1], stride=2)
    self.layer3 = self._make_layer(block_cls, 256, stage_cfg[2], stride=2)
    self.layer4 = self._make_layer(block_cls, 512, stage_cfg[3], stride=2)

    # convolution applied after the last stage, reducing to 256 channels
    stage4_channels = 512 * block_cls.expansion
    self.conv5 = torch_nn.Conv2d(
        stage4_channels, 256,
        kernel_size=(3, 3), stride=(1, 1), padding=(0, 1), bias=False)
    self.bn5 = torch_nn.BatchNorm2d(256)

    # NOTE(review): the 256 * 2 input size presumably matches the output
    # of the attention pooling created below -- confirm against forward().
    self.fc = torch_nn.Linear(256 * 2, enc_dim)

    # output layer: nclasses units, or one unit when nclasses < 2
    n_outputs = nclasses if nclasses >= 2 else 1
    self.fc_mu = torch_nn.Linear(enc_dim, n_outputs)

    self.initialize_params()

    # created only after initialize_params() has been called
    self.attention = nii_nn.SelfWeightedPooling(256)
def __init__(self, in_dim, out_dim, args, mean_std=None):
    """Store normalization constants and settings, then build sub-modules.

    For every front-end configuration (one by default) this creates a
    feature extractor, a convolutional transform, a pooling layer, a
    linear embedding layer and an angle-based output layer, each kept in
    its own ``torch_nn.ModuleList``.

    Args:
        in_dim: forwarded to ``self.prepare_mean_std``.
        out_dim: forwarded to ``self.prepare_mean_std``.
        args: forwarded to ``self.prepare_mean_std``.
        mean_std: forwarded to ``self.prepare_mean_std``; defaults to
            None.
    """
    super(Model, self).__init__()

    ##### required part, no need to change #####
    # mean / std of input and output, kept as non-trainable parameters
    mean_in, std_in, mean_out, std_out = self.prepare_mean_std(
        in_dim, out_dim, args, mean_std)
    self.input_mean = torch_nn.Parameter(mean_in, requires_grad=False)
    self.input_std = torch_nn.Parameter(std_in, requires_grad=False)
    self.output_mean = torch_nn.Parameter(mean_out, requires_grad=False)
    self.output_std = torch_nn.Parameter(std_out, requires_grad=False)

    # debugging / validation flags are left disabled:
    # self.model_debug = False
    # self.flag_validation = False
    #####

    ####
    # on input waveform and output target
    ####
    # The protocol file is the first optional argument of the project
    # configuration; its parsed form supplies the training targets.
    self.protocol_parser = protocol_parse(prj_conf.optional_argument[0])

    # Working sampling rate (torchaudio may be used to change it)
    self.m_target_sr = 16000

    ####
    # optional configs (not used)
    ####
    # re-sampling (optional)
    # self.m_resampler = torchaudio.transforms.Resample(
    #     prj_conf.wav_samp_rate, self.m_target_sr)
    # vad (optional)
    # self.m_vad = torchaudio.transforms.Vad(sample_rate=self.m_target_sr)
    # flag for balanced class (temporary use)
    # self.v_flag = 1

    ####
    # front-end configuration
    #  one list entry per front-end; a single front-end by default
    ####
    # frame shift (number of waveform points)
    self.frame_hops = [160]
    # frame length
    self.frame_lens = [320]
    # FFT length
    self.fft_n = [512]
    # LFB dim (base component)
    self.lfb_dim = [60]
    self.lfb_with_delta = False
    # window type
    self.win = torch.hann_window
    # floor for the log-spectrum-amplitude computation (not used)
    self.amp_floor = 0.00001
    # number of frames kept per trial; None means no truncation
    self.v_truncate_lens = [None] * len(self.frame_hops)
    # number of sub-models (by default, a single model)
    self.v_submodels = len(self.frame_lens)
    # dimension of embedding vectors
    self.v_emd_dim = 64
    # output classes
    self.v_out_class = 2

    ####
    # create network
    ####
    transforms = []   # 1st part of the classifier
    poolings = []     # pooling layer
    embedders = []    # 2nd part of the classifier
    angles = []       # final part for output layer
    frontends = []    # front-end feature extractors

    # The zip fixes the number of sub-models; only the LFB dimension of
    # each configuration is needed to size the layers.
    front_end_cfgs = zip(self.v_truncate_lens, self.fft_n, self.lfb_dim)
    for idx, (_, _, base_dim) in enumerate(front_end_cfgs):
        # feature dimension is tripled when delta features are enabled
        feat_dim = base_dim * 3 if self.lfb_with_delta else base_dim

        # NOTE(review): presumably the four 2x2 max-pools shrink the
        # feature axis by 16 and the last block leaves 32 channels --
        # confirm against forward().
        pooled_dim = (feat_dim // 16) * 32

        # NOTE(review): each MaxFeatureMap2D is followed by a layer that
        # expects half of the preceding convolution's output channels,
        # so it presumably halves the channel count -- confirm.
        conv_stack = [
            torch_nn.Conv2d(1, 64, [5, 5], 1, padding=[2, 2]),
            nii_nn.MaxFeatureMap2D(),
            torch.nn.MaxPool2d([2, 2], [2, 2]),

            torch_nn.Conv2d(32, 64, [1, 1], 1, padding=[0, 0]),
            nii_nn.MaxFeatureMap2D(),
            torch_nn.BatchNorm2d(32, affine=False),
            torch_nn.Conv2d(32, 96, [3, 3], 1, padding=[1, 1]),
            nii_nn.MaxFeatureMap2D(),

            torch.nn.MaxPool2d([2, 2], [2, 2]),
            torch_nn.BatchNorm2d(48, affine=False),

            torch_nn.Conv2d(48, 96, [1, 1], 1, padding=[0, 0]),
            nii_nn.MaxFeatureMap2D(),
            torch_nn.BatchNorm2d(48, affine=False),
            torch_nn.Conv2d(48, 128, [3, 3], 1, padding=[1, 1]),
            nii_nn.MaxFeatureMap2D(),

            torch.nn.MaxPool2d([2, 2], [2, 2]),

            torch_nn.Conv2d(64, 128, [1, 1], 1, padding=[0, 0]),
            nii_nn.MaxFeatureMap2D(),
            torch_nn.BatchNorm2d(64, affine=False),

            torch_nn.Conv2d(64, 64, [3, 3], 1, padding=[1, 1]),
            nii_nn.MaxFeatureMap2D(),
            torch_nn.BatchNorm2d(32, affine=False),

            torch_nn.Conv2d(32, 64, [1, 1], 1, padding=[0, 0]),
            nii_nn.MaxFeatureMap2D(),
            torch_nn.BatchNorm2d(32, affine=False),
            torch_nn.Conv2d(32, 64, [3, 3], 1, padding=[1, 1]),
            nii_nn.MaxFeatureMap2D(),
            torch_nn.MaxPool2d([2, 2], [2, 2]),

            torch_nn.Dropout(0.7),
        ]
        transforms.append(torch_nn.Sequential(*conv_stack))

        poolings.append(nii_nn.SelfWeightedPooling(pooled_dim))

        # the linear layer takes twice the pooling dimension as input
        embedders.append(torch_nn.Linear(pooled_dim * 2, self.v_emd_dim))

        angles.append(
            nii_amsoftmax.AMAngleLayer(
                self.v_emd_dim, self.v_out_class, s=10, m=0.35))

        # the front-end receives the base LFB dimension plus the delta flag
        frontends.append(
            nii_front_end.LFB(
                self.frame_lens[idx],
                self.frame_hops[idx],
                self.fft_n[idx],
                self.m_target_sr,
                self.lfb_dim[idx],
                with_energy=False,
                with_emphasis=True,
                with_delta=self.lfb_with_delta))

    # Register the collected sub-modules on the model.
    self.m_frontend = torch_nn.ModuleList(frontends)
    self.m_transform = torch_nn.ModuleList(transforms)
    self.m_output_act = torch_nn.ModuleList(embedders)
    self.m_pooling = torch_nn.ModuleList(poolings)
    self.m_angle = torch_nn.ModuleList(angles)
    # done