def __model(self, tf_mix, tf_target, tf_lr): # define model flow # stft stft_module = STFT_Module( frame_length=self.stft_params["frame_length"], frame_step=self.stft_params["frame_step"], fft_length=self.stft_params["fft_length"], epsilon=self.epsilon, pad_end=self.stft_params["pad_end"]) mr_stft_module = STFT_Module( frame_length=self.mr_stft_params["frame_length"], frame_step=self.mr_stft_params["frame_step"], fft_length=self.mr_stft_params["fft_length"], epsilon=self.epsilon, pad_end=self.mr_stft_params["pad_end"]) # mix data transform tf_spec_mix = stft_module.STFT(tf_mix) tf_amp_spec_mix = stft_module.to_amp_spec(tf_spec_mix, normalize=False) tf_mag_spec_mix = tf.log(tf_amp_spec_mix + self.epsilon) tf_mag_spec_mix = tf.expand_dims(tf_mag_spec_mix, -1) # (Batch, Time, Freq, Channel)) tf_amp_spec_mix = tf.expand_dims(tf_amp_spec_mix, -1) tf_f_512_mag_spec_mix = stft_module.to_F_512(tf_mag_spec_mix) #mr mix data transform tf_mr_spec_mix = mr_stft_module.STFT(tf_mix) tf_mr_spec_mix = tf_mr_spec_mix[:, 1:513, :] tf_mr_amp_spec_mix = stft_module.to_amp_spec(tf_mr_spec_mix, normalize=False) tf_mr_mag_spec_mix = tf.log(tf_mr_amp_spec_mix + self.epsilon) tf_mr_mag_spec_mix = tf.expand_dims( tf_mr_mag_spec_mix, -1) # (Batch, Time, Freq, Channel)) tf_mr_f_256_mag_spec_mix = tf_mr_mag_spec_mix[:, :, :256] # target data transform tf_spec_target = stft_module.STFT(tf_target) tf_amp_spec_target = stft_module.to_amp_spec(tf_spec_target, normalize=False) tf_amp_spec_target = tf.expand_dims(tf_amp_spec_target, -1) mr_u_net_ver2 = MRUNet_ver2( input_shape=(tf_f_512_mag_spec_mix.shape[1:]), mr_input_shape=(tf_mr_f_256_mag_spec_mix.shape[1:])) tf_est_masks = mr_u_net_ver2(tf_f_512_mag_spec_mix, tf_mr_f_256_mag_spec_mix) #F: 512 → 513 zero_pad = tf.zeros_like(tf_mag_spec_mix) zero_pad = tf.expand_dims(zero_pad[:, :, 1, :], -1) tf_est_masks = tf.concat([tf_est_masks, zero_pad], 2) tf_ora_masks = Masks.iaf(tf_amp_spec_mix, tf_amp_spec_target, self.epsilon) tf_loss = 10 * Loss.mean_square_error(tf_est_masks, tf_ora_masks) tf_train_step = Trainer.Adam(tf_loss, tf_lr) return tf_train_step, tf_loss, tf_amp_spec_target, tf_mag_spec_mix, tf_spec_mix, tf_est_masks, tf_ora_masks
def __model(self, tf_mix, tf_target, tf_lr): # define model flow # stft stft_module = STFT_Module( frame_length=self.stft_params["frame_length"], frame_step=self.stft_params["frame_step"], fft_length=self.stft_params["fft_length"], epsilon=self.epsilon, pad_end=self.stft_params["pad_end"]) mr2_stft_module = STFT_Module( frame_length=self.mr2_stft_params["frame_length"], frame_step=self.mr2_stft_params["frame_step"], fft_length=self.mr2_stft_params["fft_length"], epsilon=self.epsilon, pad_end=self.mr2_stft_params["pad_end"]) # mix data transform tf_spec_mix = stft_module.STFT(tf_mix) tf_amp_spec_mix = stft_module.to_amp_spec(tf_spec_mix, normalize=False) tf_mag_spec_mix = tf.log(tf_amp_spec_mix + self.epsilon) tf_mag_spec_mix = tf.expand_dims(tf_mag_spec_mix, -1) # (Batch, Time, Freq, Channel)) tf_amp_spec_mix = tf.expand_dims(tf_amp_spec_mix, -1) tf_f_512_mag_spec_mix = stft_module.to_F_512(tf_mag_spec_mix) #mr2 mix data transform #zero pad to fit stft time length 128 mr2_zero_pad = tf.zeros_like(tf_mix) tf_mr2_mix = tf.concat( [mr2_zero_pad[:, :384], tf_mix, mr2_zero_pad[:, :384]], axis=1) tf_mr2_spec_mix = mr2_stft_module.STFT(tf_mr2_mix) tf_mr2_amp_spec_mix = stft_module.to_amp_spec(tf_mr2_spec_mix, normalize=False) tf_mr2_mag_spec_mix = tf.log(tf_mr2_amp_spec_mix + self.epsilon) tf_mr2_mag_spec_mix = tf.expand_dims(tf_mr2_mag_spec_mix, -1) tf_mr2_mag_spec_mix = tf_mr2_mag_spec_mix[:, :, :1024, :] # target data transform tf_spec_target = stft_module.STFT(tf_target) tf_amp_spec_target = stft_module.to_amp_spec(tf_spec_target, normalize=False) tf_amp_spec_target = tf.expand_dims(tf_amp_spec_target, -1) mini_u_net_ver4 = mini_UNet_ver4( input_shape=(tf_f_512_mag_spec_mix.shape[1:]), mr2_input_shape=(tf_mr2_mag_spec_mix.shape[1:])) tf_est_masks, _, _, _, _, _ = mini_u_net_ver4(tf_f_512_mag_spec_mix, tf_mr2_mag_spec_mix) #F: 512 → 513 zero_pad = tf.zeros_like(tf_mag_spec_mix) zero_pad = tf.expand_dims(zero_pad[:, :, 1, :], -1) tf_est_masks = tf.concat([tf_est_masks, zero_pad], 2) tf_est_spec = tf.math.multiply(tf_est_masks, tf_amp_spec_mix) tf_loss = 10 * Loss.mean_square_error(tf_est_spec, tf_amp_spec_target) tf_train_step = Trainer.Adam(tf_loss, tf_lr) return tf_train_step, tf_loss, tf_amp_spec_target, tf_mag_spec_mix, tf_spec_mix, tf_est_masks, tf_est_spec
def __model(self, tf_mix): # define model flow # stft stft_module = STFT_Module( frame_length = self.stft_params["frame_length"], frame_step= self.stft_params["frame_step"], fft_length = self.stft_params["fft_length"], epsilon = self.epsilon, pad_end = self.stft_params["pad_end"] ) mr_stft_module = STFT_Module( frame_length = self.mr_stft_params["frame_length"], frame_step= self.mr_stft_params["frame_step"], fft_length = self.mr_stft_params["fft_length"], epsilon = self.epsilon, pad_end = self.stft_params["pad_end"] ) # mix data transform tf_spec_mix = stft_module.STFT(tf_mix) tf_phase_mix = tf.sign(tf_spec_mix) tf_phase_mix = self.expand_channel(tf_phase_mix) tf_amp_spec_mix = stft_module.to_amp_spec(tf_spec_mix, normalize =False) tf_mag_spec_mix = tf.log(tf_amp_spec_mix + self.epsilon) tf_mag_spec_mix = tf.expand_dims(tf_mag_spec_mix, -1)# (Batch, Time, Freq, Channel)) tf_amp_spec_mix = tf.expand_dims(tf_amp_spec_mix, -1) tf_f_512_mag_spec_mix = stft_module.to_F_512(tf_mag_spec_mix) #mr mix data transform tf_mr_spec_mix = mr_stft_module.STFT(tf_mix) tf_mr_spec_mix = tf_mr_spec_mix[:, 1:513, :] tf_mr_amp_spec_mix = stft_module.to_amp_spec(tf_mr_spec_mix, normalize =False) tf_mr_mag_spec_mix = tf.log(tf_mr_amp_spec_mix + self.epsilon) tf_mr_mag_spec_mix = tf.expand_dims(tf_mr_mag_spec_mix, -1)# (Batch, Time, Freq, Channel)) tf_mr_f_256_mag_spec_mix = tf_mr_mag_spec_mix[:, :, :256] mini_u_net_ver2 = mini_UNet_ver2( input_shape = ( tf_f_512_mag_spec_mix.shape[1:] ), mr1_input_shape = ( tf_mr_f_256_mag_spec_mix.shape[1:] ) ) tf_est_masks,_,_,_,_,_ = mini_u_net_ver2(tf_f_512_mag_spec_mix, tf_mr_f_256_mag_spec_mix) #F: 512 → 513 zero_pad = tf.zeros_like(tf_mag_spec_mix) zero_pad = tf.expand_dims(zero_pad[:,:,1,:], -1) tf_est_masks = tf.concat( [tf_est_masks, zero_pad], 2) tf_est_spec = tf.math.multiply(tf_est_masks, tf_amp_spec_mix) tf_est_source_spec = tf.math.multiply(tf.complex(tf_est_spec, 0.), tf_phase_mix) tf_est_source_spec = tf.squeeze(tf_est_source_spec, axis=-1) est_source = stft_module.ISTFT(tf_est_source_spec) return est_source
def __model(self, tf_mix): # define model flow # stft stft_module = STFT_Module( frame_length=self.stft_params["frame_length"], frame_step=self.stft_params["frame_step"], fft_length=self.stft_params["fft_length"], pad_end=self.stft_params["pad_end"], epsilon=self.epsilon) # mix data transform tf_spec_mix = stft_module.STFT(tf_mix) print("spec mix", tf_spec_mix.dtype) tf_spec_mix = stft_module.to_T_256( tf_spec_mix) # cut time dimension to 256 for u-net architecture tf_phase_mix = tf.sign(tf_spec_mix) tf_phase_mix = self.expand_channel(tf_phase_mix) # tf_mag_spec_mix = stft_module.to_magnitude_spec(tf_spec_mix, normalize=False) tf_amp_spec_mix = stft_module.to_amp_spec(tf_spec_mix, normalize=False) tf_mag_spec_mix = tf.log(tf_amp_spec_mix + self.epsilon) tf_mag_spec_mix = tf.expand_dims(tf_mag_spec_mix, -1) # (Batch, Time, Freq, Channel)) tf_amp_spec_mix = tf.expand_dims(tf_amp_spec_mix, -1) tf_f_512_mag_spec_mix = stft_module.to_F_512(tf_mag_spec_mix) # target data transform # tf_spec_target = stft_module.STFT(tf_target) # tf_spec_target = stft_module.to_T_256(tf_spec_target) # cut time dimensiton to 256 for u-net architecture # tf_amp_spec_target = stft_module.to_amp_spec(tf_spec_target, normalize=False) # tf_amp_spec_target = tf.expand_dims(tf_amp_spec_target, -1) conv_ffn = Conv_FFN( input_shape=(tf_f_512_mag_spec_mix.shape[1:]), out_dim=512, h_dim=512, ) tf_est_masks = conv_ffn(tf_f_512_mag_spec_mix) #F: 512 → 513 zero_pad = tf.zeros_like(tf_mag_spec_mix) zero_pad = tf.expand_dims(zero_pad[:, :, 1, :], -1) tf_est_masks = tf.concat([tf_est_masks, zero_pad], 2) tf_est_spec = tf.math.multiply(tf_est_masks, tf_amp_spec_mix) tf_est_source_spec = tf.math.multiply(tf.complex(tf_est_spec, 0.), tf_phase_mix) tf_est_source_spec = tf.squeeze(tf_est_source_spec, axis=-1) est_source = stft_module.ISTFT(tf_est_source_spec) return est_source
def __model(self, tf_mix, tf_target, tf_lr): # define model flow # stft stft_module = STFT_Module( frame_length=self.stft_params["frame_length"], frame_step=self.stft_params["frame_step"], fft_length=self.stft_params["fft_length"], epsilon=self.epsilon, pad_end=self.stft_params["pad_end"]) # mix data transform tf_spec_mix = stft_module.STFT(tf_mix) # tf_mag_spec_mix = stft_module.to_magnitude_spec(tf_spec_mix, normalize=False) tf_amp_spec_mix = stft_module.to_amp_spec(tf_spec_mix, normalize=False) tf_mag_spec_mix = tf.log(tf_amp_spec_mix + self.epsilon) tf_mag_spec_mix = tf.expand_dims(tf_mag_spec_mix, -1) # (Batch, Time, Freq, Channel)) tf_amp_spec_mix = tf.expand_dims(tf_amp_spec_mix, -1) tf_f_512_mag_spec_mix = stft_module.to_F_512(tf_mag_spec_mix) # target data transform tf_spec_target = stft_module.STFT(tf_target) tf_amp_spec_target = stft_module.to_amp_spec(tf_spec_target, normalize=False) tf_amp_spec_target = tf.expand_dims(tf_amp_spec_target, -1) conv_ffn = Conv_FFN( input_shape=(tf_f_512_mag_spec_mix.shape[1:]), out_dim=512, h_dim=512, ) tf_est_masks = conv_ffn(tf_f_512_mag_spec_mix) #F: 512 → 513 zero_pad = tf.zeros_like(tf_mag_spec_mix) zero_pad = tf.expand_dims(zero_pad[:, :, 1, :], -1) tf_est_masks = tf.concat([tf_est_masks, zero_pad], 2) tf_est_spec = tf.math.multiply(tf_est_masks, tf_amp_spec_mix) tf_loss = 10 * Loss.mean_square_error(tf_est_spec, tf_amp_spec_target) tf_train_step = Trainer.Adam(tf_loss, tf_lr) return tf_train_step, tf_loss, tf_amp_spec_target, tf_mag_spec_mix, tf_spec_mix, tf_est_masks, tf_est_spec
def __model(self, tf_mix): # define model flow # stft stft_module = STFT_Module( frame_length=self.stft_params["frame_length"], frame_step=self.stft_params["frame_step"], fft_length=self.stft_params["fft_length"], epsilon=self.epsilon, pad_end=self.stft_params["pad_end"]) mr1_stft_module = STFT_Module( frame_length=self.mr1_stft_params["frame_length"], frame_step=self.mr1_stft_params["frame_step"], fft_length=self.mr1_stft_params["fft_length"], epsilon=self.epsilon, pad_end=self.mr1_stft_params["pad_end"]) mr2_stft_module = STFT_Module( frame_length=self.mr2_stft_params["frame_length"], frame_step=self.mr2_stft_params["frame_step"], fft_length=self.mr2_stft_params["fft_length"], epsilon=self.epsilon, pad_end=self.mr2_stft_params["pad_end"]) # mix data transform tf_spec_mix = stft_module.STFT(tf_mix) tf_phase_mix = tf.sign(tf_spec_mix) # tf_phase_mix = self.expand_channel(tf_phase_mix) tf_amp_spec_mix = stft_module.to_amp_spec(tf_spec_mix, normalize=False) tf_mag_spec_mix = tf.log(tf_amp_spec_mix + self.epsilon) # tf_mag_spec_mix = tf.expand_dims(tf_mag_spec_mix, -1)# (Batch, Time, Freq, Channel)) # tf_amp_spec_mix = tf.expand_dims(tf_amp_spec_mix, -1) tf_f_512_mag_spec_mix = stft_module.to_F_512(tf_mag_spec_mix) #mr 1mix data transform tf_mr1_spec_mix = mr1_stft_module.STFT(tf_mix) tf_mr1_spec_mix = tf_mr1_spec_mix[:, 1:513, :] tf_mr1_amp_spec_mix = stft_module.to_amp_spec(tf_mr1_spec_mix, normalize=False) tf_mr1_mag_spec_mix = tf.log(tf_mr1_amp_spec_mix + self.epsilon) # tf_mr1_mag_spec_mix = tf.expand_dims(tf_mr1_mag_spec_mix, -1)# (Batch, Time, Freq, Channel)) tf_mr1_f_256_mag_spec_mix = tf_mr1_mag_spec_mix[:, :, :256] #mr2 mix data transform #zero pad to fit stft time length 128 mr2_zero_pad = tf.zeros_like(tf_mix) tf_mr2_mix = tf.concat( [mr2_zero_pad[:, :384], tf_mix, mr2_zero_pad[:, :384]], axis=1) tf_mr2_spec_mix = mr2_stft_module.STFT(tf_mr2_mix) tf_mr2_amp_spec_mix = stft_module.to_amp_spec(tf_mr2_spec_mix, normalize=False) tf_mr2_mag_spec_mix = tf.log(tf_mr2_amp_spec_mix + self.epsilon) # tf_mr2_mag_spec_mix = tf.expand_dims(tf_mr2_mag_spec_mix, -1) tf_mr2_mag_spec_mix = tf_mr2_mag_spec_mix[:, :, :1024] ffn_ver2 = FFN_ver2( out_dim=512, h_dim=512, ) tf_est_masks = ffn_ver2(tf_f_512_mag_spec_mix, tf_mr1_f_256_mag_spec_mix, tf_mr2_mag_spec_mix) #F: 512 → 513 zero_pad = tf.zeros_like(tf_mag_spec_mix) zero_pad = tf.expand_dims(zero_pad[:, :, 1], -1) tf_est_masks = tf.concat([tf_est_masks, zero_pad], 2) tf_est_spec = tf.math.multiply(tf_est_masks, tf_amp_spec_mix) tf_est_source_spec = tf.math.multiply(tf.complex(tf_est_spec, 0.), tf_phase_mix) tf_est_source_spec = tf.squeeze(tf_est_source_spec, axis=-1) est_source = stft_module.ISTFT(tf_est_source_spec) return est_source
def __model(self, tf_mix, tf_target, tf_lr): stft_module = STFT_Module( frame_length=self.stft_params["frame_length"], frame_step=self.stft_params["frame_step"], fft_length=self.stft_params["fft_length"], epsilon=self.epsilon, pad_end=self.stft_params["pad_end"]) mr1_stft_module = STFT_Module( frame_length=self.mr1_stft_params["frame_length"], frame_step=self.mr1_stft_params["frame_step"], fft_length=self.mr1_stft_params["fft_length"], epsilon=self.epsilon, pad_end=self.mr1_stft_params["pad_end"]) mr2_stft_module = STFT_Module( frame_length=self.mr2_stft_params["frame_length"], frame_step=self.mr2_stft_params["frame_step"], fft_length=self.mr2_stft_params["fft_length"], epsilon=self.epsilon, pad_end=self.mr2_stft_params["pad_end"]) # mix data transform tf_spec_mix = stft_module.STFT(tf_mix) tf_amp_spec_mix = stft_module.to_amp_spec(tf_spec_mix, normalize=False) tf_mag_spec_mix = tf.log(tf_amp_spec_mix + self.epsilon) # tf_mag_spec_mix = tf.expand_dims(tf_mag_spec_mix, -1)# (Batch, Time, Freq, Channel)) # tf_amp_spec_mix = tf.expand_dims(tf_amp_spec_mix, -1) tf_f_512_mag_spec_mix = stft_module.to_F_512(tf_mag_spec_mix) #mr1 mix data transform tf_mr1_spec_mix = mr1_stft_module.STFT(tf_mix) tf_mr1_spec_mix = tf_mr1_spec_mix[:, 1:513, :] tf_mr1_amp_spec_mix = stft_module.to_amp_spec(tf_mr1_spec_mix, normalize=False) tf_mr1_mag_spec_mix = tf.log(tf_mr1_amp_spec_mix + self.epsilon) # tf_mr1_mag_spec_mix = tf.expand_dims(tf_mr1_mag_spec_mix, -1)# (Batch, Time, Freq, Channel)) tf_mr1_f_256_mag_spec_mix = tf_mr1_mag_spec_mix[:, :, :256] #mr2 mix data transform #zero pad to fit stft time length 128 mr2_zero_pad = tf.zeros_like(tf_mix) tf_mr2_mix = tf.concat( [mr2_zero_pad[:, :384], tf_mix, mr2_zero_pad[:, :384]], axis=1) tf_mr2_spec_mix = mr2_stft_module.STFT(tf_mr2_mix) tf_mr2_amp_spec_mix = stft_module.to_amp_spec(tf_mr2_spec_mix, normalize=False) tf_mr2_mag_spec_mix = tf.log(tf_mr2_amp_spec_mix + self.epsilon) # tf_mr2_mag_spec_mix = tf.expand_dims(tf_mr2_mag_spec_mix, -1) tf_mr2_mag_spec_mix = tf_mr2_mag_spec_mix[:, :, :1024] # target data transform tf_spec_target = stft_module.STFT(tf_target) tf_amp_spec_target = stft_module.to_amp_spec(tf_spec_target, normalize=False) # tf_amp_spec_target = tf.expand_dims(tf_amp_spec_target, -1) ffn_ver2 = FFN_ver2( out_dim=512, h_dim=512, ) tf_est_masks = ffn_ver2(tf_f_512_mag_spec_mix, tf_mr1_f_256_mag_spec_mix, tf_mr2_mag_spec_mix) #F: 512 → 513 zero_pad = tf.zeros_like(tf_mag_spec_mix) zero_pad = tf.expand_dims(zero_pad[:, :, 1], -1) tf_est_masks = tf.concat([tf_est_masks, zero_pad], 2) print("est_mask", tf_est_masks.shape) print("amp_spec_mix", tf_amp_spec_mix.shape) tf_est_spec = tf.math.multiply(tf_est_masks, tf_amp_spec_mix) tf_loss = 10 * Loss.mean_square_error(tf_est_spec, tf_amp_spec_target) tf_train_step = Trainer.Adam(tf_loss, tf_lr) return tf_train_step, tf_loss, tf_amp_spec_target, tf_mag_spec_mix, tf_spec_mix, tf_est_masks, tf_est_spec