def plot_mel_masks(args):

    # Arguments & parameters
    workspace = args.workspace
    holdout_fold = args.holdout_fold
    scene_type = args.scene_type
    snr = args.snr
    iteration = args.iteration
    model_type = args.model_type
    cuda = args.cuda

    labels = config.labels
    classes_num = len(labels)
    sample_rate = config.sample_rate
    window_size = config.window_size
    overlap = config.overlap
    hop_size = window_size - overlap
    mel_bins = config.mel_bins
    seq_len = config.seq_len
    ix_to_lb = config.ix_to_lb

    thres = 0.1
    batch_size = 24

    # Paths
    hdf5_path = os.path.join(workspace, 'features', 'logmel',
        'scene_type={},snr={}'.format(scene_type, snr), 'development.h5')

    model_path = os.path.join(workspace, 'models', 'main_pytorch',
        'model_type={}'.format(model_type),
        'scene_type={},snr={}'.format(scene_type, snr),
        'holdout_fold{}'.format(holdout_fold),
        'md_{}_iters.tar'.format(iteration))

    yaml_path = os.path.join(workspace, 'mixture.yaml')

    audios_dir = os.path.join(workspace, 'mixed_audios',
        'scene_type={},snr={}'.format(scene_type, snr))

    sep_wavs_dir = os.path.join(workspace, 'separated_wavs', 'main_pytorch',
        'model_type={}'.format(model_type),
        'scene_type={},snr={}'.format(scene_type, snr),
        'holdout_fold{}'.format(holdout_fold))
    create_folder(sep_wavs_dir)

    # Load yaml file
    load_yaml_time = time.time()

    with open(yaml_path, 'r') as f:
        meta = yaml.load(f, Loader=yaml.FullLoader)

    print('Load yaml file time: {:.3f} s'.format(time.time() - load_yaml_time))

    feature_extractor = LogMelExtractor(
        sample_rate=sample_rate,
        window_size=window_size,
        overlap=overlap,
        mel_bins=mel_bins)

    inverse_melW = feature_extractor.get_inverse_melW()

    # Load model
    Model = get_model(model_type)
    model = Model(classes_num, seq_len, mel_bins, cuda)
    checkpoint = torch.load(model_path)
    model.load_state_dict(checkpoint['state_dict'])

    if cuda:
        model.cuda()

    # Data generator
    generator = InferenceDataGenerator(
        hdf5_path=hdf5_path,
        batch_size=batch_size,
        holdout_fold=holdout_fold)

    generate_func = generator.generate_validate(
        data_type='validate', shuffle=False, max_iteration=None)

    # Evaluate on mini-batch
    for (iteration, data) in enumerate(generate_func):

        (batch_x, batch_y, batch_audio_names) = data

        batch_x = move_data_to_gpu(batch_x, cuda)

        # Predict
        with torch.no_grad():
            model.eval()
            (batch_output, batch_bottleneck) = model(
                batch_x, return_bottleneck=True)

        batch_output = batch_output.data.cpu().numpy()
        '''(batch_size, classes_num)'''

        batch_bottleneck = batch_bottleneck.data.cpu().numpy()
        '''(batch_size, classes_num, seq_len, mel_bins)'''

        # Average the bottleneck over mel bins to obtain frame-wise scores
        batch_pred_sed = np.mean(batch_bottleneck, axis=-1)
        batch_pred_sed = np.transpose(batch_pred_sed, (0, 2, 1))
        '''(batch_size, seq_len, classes_num)'''

        batch_gt_masks = []

        for n in range(len(batch_audio_names)):
            curr_meta = search_meta_by_mixture_name(meta, batch_audio_names[n])
            curr_events = curr_meta['events']

            pred_indexes = np.where(batch_output[n] > thres)[0]
            gt_indexes = get_ground_truth_indexes(curr_events)

            gt_sed = get_sed_from_meta(curr_events)  # (seq_len, classes_num)

            pred_sed = np.zeros((seq_len, classes_num))
            pred_sed[:, pred_indexes] = batch_pred_sed[n][:, pred_indexes]  # (seq_len, classes_num)

            (events_stft, scene_stft, _) = \
                generator.get_events_scene_mixture_stft(batch_audio_names[n])

            events_stft = np.dot(events_stft, feature_extractor.melW)
            scene_stft = np.dot(scene_stft, feature_extractor.melW)

            gt_mask = ideal_binary_mask(events_stft, scene_stft)  # (seq_len, mel_bins)

            gt_masks = gt_mask[:, :, None] * gt_sed[:, None, :]  # (seq_len, mel_bins, classes_num)
            gt_masks = gt_masks.astype(np.float32)
            batch_gt_masks.append(gt_masks)

            pred_masks = batch_bottleneck[n].transpose(1, 2, 0)  # (seq_len, mel_bins, classes_num)
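            # Both gt_masks and pred_masks are (seq_len, mel_bins, classes_num):
            # one mel-domain time-frequency mask per sound event class.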
            # Save out separated audio
            if True:
                curr_audio_name = curr_meta['mixture_name']
                audio_path = os.path.join(audios_dir, curr_audio_name)
                (mixed_audio, fs) = read_audio(
                    audio_path, target_fs=sample_rate, mono=True)

                out_wav_path = os.path.join(sep_wavs_dir, curr_audio_name)
                write_audio(out_wav_path, mixed_audio, sample_rate)

                window = np.hamming(window_size)

                mixed_stft_cmplx = stft(x=mixed_audio, window_size=window_size,
                    hop_size=hop_size, window=window, mode='complex')

                mixed_stft_cmplx = mixed_stft_cmplx[0 : seq_len, :]
                mixed_stft = np.abs(mixed_stft_cmplx)

                for k in gt_indexes:
                    # Map the mel-domain mask back to the STFT domain and
                    # apply it to the mixture magnitude
                    masked_stft = np.dot(pred_masks[:, :, k], inverse_melW) * mixed_stft

                    # Resynthesise audio using the mixture phase
                    masked_stft_cmplx = real_to_complex(masked_stft, mixed_stft_cmplx)
                    frames = istft(masked_stft_cmplx)
                    cola_constant = get_cola_constant(hop_size, window)
                    sep_audio = overlap_add(frames, hop_size, cola_constant)

                    sep_wav_path = os.path.join(sep_wavs_dir, '{}_{}.wav'.format(
                        os.path.splitext(curr_audio_name)[0], ix_to_lb[k]))

                    write_audio(sep_wav_path, sep_audio, sample_rate)
                    print('Audio written to {}'.format(sep_wav_path))

        # Visualize learned representations
        if True:
            for n in range(len(batch_output)):

                # Plot segmentation masks. (00013.wav is used for the plot in the paper)
                print('audio_name: {}'.format(batch_audio_names[n]))
                print('target: {}'.format(batch_y[n]))

                target_labels = target_to_labels(batch_y[n], labels)
                print('target labels: {}'.format(target_labels))

                (events_stft, scene_stft, _) = \
                    generator.get_events_scene_mixture_stft(batch_audio_names[n])

                fig, axs = plt.subplots(7, 7, figsize=(15, 10))

                for k in range(classes_num):
                    axs[k // 6, k % 6].matshow(batch_bottleneck[n, k].T,
                        origin='lower', aspect='auto', cmap='jet')

                    if labels[k] in target_labels:
                        color = 'r'
                    else:
                        color = 'k'

                    axs[k // 6, k % 6].set_title(labels[k], color=color)
                    axs[k // 6, k % 6].xaxis.set_ticks([])
                    axs[k // 6, k % 6].yaxis.set_ticks([])
                    axs[k // 6, k % 6].set_xlabel('time')
                    axs[k // 6, k % 6].set_ylabel('mel bins')

                axs[6, 5].matshow(np.log(events_stft + 1e-8).T,
                    origin='lower', aspect='auto', cmap='jet')
                axs[6, 5].set_title('Spectrogram (in log scale)')
                axs[6, 5].xaxis.set_ticks([0, 310])
                axs[6, 5].xaxis.set_ticklabels(['0.0', '10.0 s'])
                axs[6, 5].xaxis.tick_bottom()
                axs[6, 5].yaxis.set_ticks([0, 1024])
                axs[6, 5].yaxis.set_ticklabels(['0', '1025'])
                axs[6, 5].set_xlabel('time')
                axs[6, 5].set_ylabel('FFT bins')

                axs[6, 6].matshow(
                    np.log(np.dot(events_stft, feature_extractor.melW) + 1e-8).T,
                    origin='lower', aspect='auto', cmap='jet')
                axs[6, 6].set_title('Log mel spectrogram')
                axs[6, 6].xaxis.set_ticks([0, 310])
                axs[6, 6].xaxis.set_ticklabels(['0.0', '10.0 s'])
                axs[6, 6].xaxis.tick_bottom()
                axs[6, 6].yaxis.set_ticks([0, 63])
                axs[6, 6].yaxis.set_ticklabels(['0', '64'])
                axs[6, 6].set_xlabel('time')
                axs[6, 6].set_ylabel('mel bins')

                plt.tight_layout(pad=0.5, w_pad=0.5, h_pad=0.5)
                plt.show()

                # Plot frame-wise SED
                fig, ax = plt.subplots(1, 1, figsize=(4, 4))

                score_mat = []

                for k in range(classes_num):
                    score = np.mean(batch_bottleneck[n, k], axis=-1)
                    score_mat.append(score)

                score_mat = np.array(score_mat)

                ax.matshow(score_mat, origin='lower', aspect='auto', cmap='jet')
                ax.set_title('Frame-wise predictions')
                ax.xaxis.set_ticks([0, 310])
                ax.xaxis.set_ticklabels(['0.0', '10.0 s'])
                ax.xaxis.tick_bottom()
                ax.set_xlabel('time')
                ax.yaxis.set_ticks(np.arange(classes_num))
                ax.yaxis.set_ticklabels(config.labels, fontsize='xx-small')
                ax.yaxis.grid(color='k', linestyle='solid', linewidth=0.3)

                plt.tight_layout(pad=0.5, w_pad=0.5, h_pad=0.5)
                plt.show()
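                # The event-wise plots below convert the frame-level scores
                # into an event list (onset, offset, label) and back into a
                # frame x class matrix purely for display.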
                # Plot event-wise SED
                est_event_list = get_est_event_list(batch_pred_sed[n:n+1],
                    batch_audio_names[n:n+1], labels)

                event_mat = event_list_to_matrix(est_event_list)

                fig, ax = plt.subplots(1, 1, figsize=(4, 4))
                ax.matshow(event_mat.T, origin='lower', aspect='auto', cmap='jet')
                ax.set_title('Event-wise predictions')
                ax.xaxis.set_ticks([0, 310])
                ax.xaxis.set_ticklabels(['0.0', '10.0 s'])
                ax.xaxis.tick_bottom()
                ax.set_xlabel('time')
                ax.yaxis.set_ticks(np.arange(classes_num))
                ax.yaxis.set_ticklabels(config.labels, fontsize='xx-small')
                ax.yaxis.grid(color='k', linestyle='solid', linewidth=0.3)

                plt.tight_layout(pad=0.5, w_pad=0.5, h_pad=0.5)
                plt.show()

                # Plot event-wise ground truth
                ref_event_list = get_ref_event_list(meta, batch_audio_names[n:n+1])

                event_mat = event_list_to_matrix(ref_event_list)

                fig, ax = plt.subplots(1, 1, figsize=(4, 4))
                ax.matshow(event_mat.T, origin='lower', aspect='auto', cmap='jet')
                ax.set_title('Event-wise ground truth')
                ax.xaxis.set_ticks([0, 310])
                ax.xaxis.set_ticklabels(['0.0', '10.0 s'])
                ax.xaxis.tick_bottom()
                ax.set_xlabel('time')
                ax.yaxis.set_ticks(np.arange(classes_num))
                ax.yaxis.set_ticklabels(config.labels, fontsize='xx-small')
                ax.yaxis.grid(color='k', linestyle='solid', linewidth=0.3)

                plt.tight_layout(pad=0.5, w_pad=0.5, h_pad=0.5)
                plt.show()
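
# Illustrative sketch (not part of the original pipeline): how a mel-domain
# mask can be mapped back to the STFT domain and applied to a mixture
# magnitude spectrogram, mirroring the separation step in plot_mel_masks
# above. The function name and the shapes in the docstring are assumptions
# for this example only.
def _apply_mel_mask_sketch(mel_mask, inverse_melW, mixed_stft_mag):
    """mel_mask: (seq_len, mel_bins), inverse_melW: (mel_bins, fft_bins),
    mixed_stft_mag: (seq_len, fft_bins) magnitude spectrogram."""
    stft_mask = np.dot(mel_mask, inverse_melW)  # (seq_len, fft_bins)
    return stft_mask * mixed_stft_mag  # masked magnitude, ready for ISTFT
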
def inference_wiener(args):
    workspace = args.workspace
    iteration = args.iteration
    stack_num = args.stack_num
    filename = args.filename
    mini_num = args.mini_num
    visualize = args.visualize
    cuda = args.use_cuda and torch.cuda.is_available()
    print("cuda:", cuda)

    sample_rate = cfg.sample_rate
    fft_size = cfg.fft_size
    hop_size = cfg.hop_size
    window_type = cfg.window_type

    if window_type == 'hamming':
        window = np.hamming(fft_size)
    else:
        raise Exception("Unsupported window type: %s" % window_type)

    # Audio
    audio_dir = "/vol/vssp/msos/qk/workspaces/speech_enhancement/mixed_audios/spectrogram/test/0db"
    # audio_dir = "/user/HS229/qk00006/my_code2015.5-/python/pub_speech_enhancement/mixture2clean_dnn/workspace/mixed_audios/spectrogram/test/0db"
    names = os.listdir(audio_dir)

    # Load one model per target (speech and noise).
    target_type = ['speech', 'noise']
    model_dict = {}

    for e in target_type:
        n_freq = 257
        model = DNN(stack_num, n_freq)
        model_path = os.path.join(workspace, "models", filename, e,
            "md_%d_iters.tar" % iteration)
        checkpoint = torch.load(model_path)
        model.load_state_dict(checkpoint['state_dict'])

        # Move model to GPU.
        if cuda:
            model.cuda()

        model.eval()
        model_dict[e] = model

    # Load scalar
    scalar_path = os.path.join(workspace, "scalars", filename, "scalar.p")

    with open(scalar_path, 'rb') as f:
        (mean_, std_) = cPickle.load(f)

    mean_ = move_data_to_gpu(mean_, cuda, volatile=True)
    std_ = move_data_to_gpu(std_, cuda, volatile=True)

    if mini_num > 0:
        n_every = len(names) // mini_num
    else:
        n_every = 1

    out_wav_dir = os.path.join(workspace, "enh_wavs", filename)
    pp_data.create_folder(out_wav_dir)

    for (cnt, name) in enumerate(names):
        if cnt % n_every == 0:
            audio_path = os.path.join(audio_dir, name)
            (audio, _) = pp_data.read_audio(audio_path, sample_rate)
            audio = pp_data.normalize(audio)

            cmplx_sp = pp_data.calc_sp(audio, fft_size, hop_size, window)
            x = np.abs(cmplx_sp)

            # Process data.
            n_pad = (stack_num - 1) // 2
            x = pp_data.pad_with_border(x, n_pad)
            x = pp_data.mat_2d_to_3d(x, stack_num, hop=1)

            # Predict.
            pred_dict = {}

            for e in target_type:
                pred = forward(model_dict[e], x, mean_, std_, cuda)
                pred = pred.data.cpu().numpy()
                pred_dict[e] = pred

            print(cnt, name)

            # Wiener filter.
            pred_mag_sp = pred_dict['speech'] / (
                pred_dict['speech'] + pred_dict['noise']) * np.abs(cmplx_sp)

            pred_cmplx_sp = stft.real_to_complex(pred_mag_sp, cmplx_sp)
            frames = stft.istft(pred_cmplx_sp)

            cola_constant = stft.get_cola_constant(hop_size, window)
            seq = stft.overlap_add(frames, hop_size, cola_constant)
            seq = seq[0 : len(audio)]

            # Write out wav
            out_wav_path = os.path.join(out_wav_dir, name)
            pp_data.write_audio(out_wav_path, seq, sample_rate)
            print("Write out wav to: %s" % out_wav_path)

            if visualize:
                vmin = -5.
                vmax = 5.
                fig, axs = plt.subplots(3, 1, sharex=True)
                axs[0].matshow(np.log(np.abs(cmplx_sp)).T,
                    origin='lower', aspect='auto', cmap='jet')
                axs[1].matshow(np.log(np.abs(pred_dict['speech'])).T,
                    origin='lower', aspect='auto', cmap='jet')
                axs[2].matshow(np.log(np.abs(pred_dict['noise'])).T,
                    origin='lower', aspect='auto', cmap='jet')
                plt.show()
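
# Illustrative sketch (not part of the original pipeline): the Wiener-like
# filter used in inference_wiener combines the predicted speech and noise
# magnitudes as mask = speech_hat / (speech_hat + noise_hat) and applies it
# to the mixture magnitude. The epsilon here is an assumption added to avoid
# division by zero; the original code relies on the denominator being non-zero.
def _wiener_mask_sketch(speech_hat, noise_hat, mixture_mag, eps=1e-10):
    mask = speech_hat / (speech_hat + noise_hat + eps)
    return mask * mixture_mag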