def calculate_logmel(audio_path, sample_rate, feature_extractor):
    # Read audio
    (audio, fs) = read_audio(audio_path, target_fs=sample_rate, mono=False)

    events_audio = audio[:, 0]
    scene_audio = audio[:, 1]
    mixed_audio = np.mean(audio, axis=-1)
    '''We do not divide by the maximum value of the audio here because we
    assume that low-energy audio may also carry information about a scene.'''

    # Extract feature
    mixture_logmel = feature_extractor.transform(mixed_audio)
    mixture_stft = feature_extractor.transform_stft(mixed_audio)
    events_stft = feature_extractor.transform_stft(events_audio)
    scene_stft = feature_extractor.transform_stft(scene_audio)

    feature_dict = {
        'mixture_logmel': mixture_logmel,
        'mixture_stft': mixture_stft,
        'events_stft': events_stft,
        'scene_stft': scene_stft}

    return feature_dict
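# Throughout this file, `LogMelExtractor` is assumed to wrap an STFT, a mel
# filterbank `melW` of shape (fft_bins, mel_bins), and a log compression. A
# minimal sketch of such an extractor built on librosa (the actual class in
# this repository may differ in windowing and scaling):
import librosa
import numpy as np

class SimpleLogMelExtractor(object):
    def __init__(self, sample_rate, window_size, overlap, mel_bins):
        self.window_size = window_size
        self.hop_size = window_size - overlap
        # (fft_bins, mel_bins), matching the np.dot(stft, melW) usage below
        self.melW = librosa.filters.mel(
            sr=sample_rate, n_fft=window_size, n_mels=mel_bins).T

    def transform_stft(self, audio):
        # Magnitude spectrogram of shape (frames, fft_bins)
        return np.abs(librosa.stft(
            y=audio, n_fft=self.window_size, hop_length=self.hop_size)).T

    def transform(self, audio):
        # Log mel spectrogram of shape (frames, mel_bins)
        return np.log(np.dot(self.transform_stft(audio), self.melW) + 1e-8)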
def calculate_logmel(audio_path, sample_rate, extractor):
    (audio, _) = read_audio(audio_path, target_fs=sample_rate)

    # Normalize to the maximum absolute value
    audio = audio / np.max(np.abs(audio))

    feature = extractor.transform(audio)

    return feature
def calculate_logmel(audio_path, sample_rate, feature_extractor):
    # Read audio
    (audio, fs) = read_audio(audio_path, target_fs=sample_rate)
    '''We do not divide by the maximum value of the audio here because we
    assume that low-energy audio may also carry information about a scene.'''

    # Extract feature
    feature = feature_extractor.transform(audio)

    return feature
def calculate_multi_logmel(audio_path, sample_rate, feature_extractor):
    # Read stereo audio; audio is assumed to have shape (2, samples)
    (audio, fs) = read_audio(audio_path, target_fs=sample_rate)
    '''We do not divide by the maximum value of the audio here because we
    assume that low-energy audio may also carry information about a scene.'''

    # Extract a feature for each channel
    l_feature = feature_extractor.transform(audio[0])
    r_feature = feature_extractor.transform(audio[1])

    return np.stack([l_feature, r_feature], axis=0)
def calculate_logmel(audio_path, sample_rate, feature_extractor):
    # Read audio
    (audio, fs) = read_audio(audio_path, target_fs=sample_rate)

    # Normalize energy
    audio /= np.max(np.abs(audio))

    # Extract feature
    feature = feature_extractor.transform(audio)

    return feature
def calculate_hpss_logmel(audio_path, sample_rate, feature_extractor):
    # Read audio
    (audio, fs) = read_audio(audio_path, target_fs=sample_rate)
    '''We do not divide by the maximum value of the audio here because we
    assume that low-energy audio may also carry information about a scene.'''

    # Split into harmonic and percussive components, then extract a feature
    # for each
    h, p = librosa.effects.hpss(audio)
    h_feature = feature_extractor.transform(h)
    p_feature = feature_extractor.transform(p)

    return np.stack([h_feature, p_feature], axis=0)
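# librosa's HPSS separates the STFT magnitude into harmonic (horizontal) and
# percussive (vertical) components via median filtering. A self-contained
# demonstration on a synthetic signal (illustrative only, not part of the
# original pipeline):
import librosa
import numpy as np

sr = 32000
t = np.arange(sr) / sr
tone = 0.5 * np.sin(2 * np.pi * 440. * t)  # harmonic content
clicks = np.zeros(sr)
clicks[::4000] = 1.                        # percussive content
harmonic, percussive = librosa.effects.hpss(tone + clicks)
print(harmonic.shape, percussive.shape)    # (32000,) (32000,)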
def __getitem__(self, index):
    # Read audio
    (audio, fs) = read_audio(self.audio_names[index],
                             target_fs=self.sample_rate)
    audio /= max(1., np.max(np.abs(audio)))

    # Cut silence
    frame_length = 2048
    hop_length = 512
    threshold = 0.01

    energy = librosa.feature.rmse(audio, frame_length=frame_length,
                                  hop_length=hop_length, center=True)[0]
    frames = np.nonzero(energy > threshold)[0]
    indices = librosa.core.frames_to_samples(frames, hop_length=hop_length)

    # Abandon too short clips
    if len(indices) < 2:
        audio = np.zeros(10000)
    else:
        audio = audio[indices[0]:indices[-1]]
        if len(audio) < 10000:
            audio = np.zeros(10000)
        else:
            audio = audio[0:70000]  # So as not to overuse 12 GB of GPU RAM

    # Mu-law
    _mulaw = mu_law.MuLaw(mu=self.quantize_bins)
    _quantize = mu_law.Quantize(quantize=self.quantize_bins)
    audio = _mulaw.transform(audio)
    audio = _quantize.transform(audio)
    audio = torch.LongTensor(audio)

    # Get global condition from the audio file name
    global_condition = int(self.audio_names[index].split('/')[-1][1:4])
    global_condition = torch.tensor(global_condition)

    return audio, global_condition
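# `mu_law.MuLaw` and `mu_law.Quantize` are assumed to implement standard
# mu-law companding followed by uniform quantization, as commonly used for
# WaveNet-style categorical targets; a sketch under that assumption (the
# actual helpers may differ):
import numpy as np

def mu_law_compress(x, mu=255.):
    # Compand amplitudes in [-1, 1]; quiet samples get finer resolution
    return np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)

def quantize(x, quantize_bins=256):
    # Map companded values in [-1, 1] to integer bins {0, ..., bins - 1}
    x = (x + 1.) / 2. * (quantize_bins - 1)
    return np.clip(np.round(x), 0, quantize_bins - 1).astype(np.int64)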
def create_mixed_audios(args):
    """Create mixed audios using the meta from the mixture yaml file."""

    # Arguments & parameters
    dcase2018_task1_dataset_dir = args.dcase2018_task1_dataset_dir
    dcase2018_task2_dataset_dir = args.dcase2018_task2_dataset_dir
    workspace = args.workspace
    scene_type = args.scene_type
    snr = args.snr

    sample_rate = config.sample_rate
    clip_duration = config.clip_duration
    audio_samples = int(sample_rate * clip_duration)
    random_state = np.random.RandomState(1234)

    # Paths
    mixture_yaml_path = os.path.join(workspace, 'mixture.yaml')

    out_audios_dir = os.path.join(
        workspace, 'mixed_audios',
        'scene_type={},snr={}'.format(scene_type, snr))
    create_folder(out_audios_dir)

    create_mixed_audio_time = time.time()

    # Read mixture yaml file
    with open(mixture_yaml_path, 'r') as f:
        data_list = yaml.load(f)

    for n, data in enumerate(data_list):
        if n % 10 == 0:
            logging.info('{} / {} mixed audios created'
                ''.format(n, len(data_list)))

        if scene_type == 'white_noise':
            scene_audio = random_state.uniform(0., 1., audio_samples)
        elif scene_type == 'dcase2018_task1':
            scene_audio_name = data['scene_audio_name']
            scene_audio_path = os.path.join(dcase2018_task1_dataset_dir,
                                            'audio', scene_audio_name)
            (scene_audio, fs) = read_audio(scene_audio_path,
                                           target_fs=sample_rate)

        # Normalize scene audio
        scene_audio = normalize_to_energy(scene_audio, db=0)

        # Reserve space
        events_audio = np.zeros(audio_samples)

        # Read sound events audio
        for event in data['events']:
            audio_name = event['event_audio_name']
            onset = int(event['onset'] * sample_rate)
            offset = int(event['offset'] * sample_rate)
            audio_path = os.path.join(dcase2018_task2_dataset_dir,
                                      'audio_train', audio_name)
            (event_audio, fs) = read_audio(audio_path, target_fs=sample_rate)
            event_audio = normalize_to_energy(event_audio, db=snr)
            events_audio[onset:offset] = event_audio[0:offset - onset]

        stereo_audio = np.array((events_audio, scene_audio)).T
        '''shape: (samples, 2)'''

        # Normalize
        stereo_audio /= np.max(np.abs(stereo_audio))

        # Write out audio
        out_audio_path = os.path.join(out_audios_dir, data['mixture_name'])
        write_audio(out_audio_path, stereo_audio, sample_rate)

    logging.info('Write out audio finished! {} s'
        ''.format(time.time() - create_mixed_audio_time))
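# `normalize_to_energy` is assumed to scale a signal so that its average
# power equals a target level in dB; with the scene fixed at 0 dB, scaling
# each event to `db=snr` realizes the requested SNR. A sketch under that
# assumption:
import numpy as np

def normalize_to_energy(audio, db):
    # After scaling, 10 * log10(mean(audio ** 2)) == db
    target_energy = 10. ** (db / 10.)
    current_energy = max(np.mean(audio ** 2), 1e-10)
    return audio * np.sqrt(target_energy / current_energy)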
def plot_waveform(args):
    # Arguments & parameters
    workspace = args.workspace
    holdout_fold = args.holdout_fold
    scene_type = args.scene_type
    snr = args.snr
    cuda = args.cuda

    labels = config.labels
    classes_num = len(labels)
    sample_rate = config.sample_rate
    window_size = config.window_size
    overlap = config.overlap
    hop_size = window_size - overlap
    mel_bins = config.mel_bins
    seq_len = config.seq_len
    ix_to_lb = config.ix_to_lb

    thres = 0.1
    batch_size = 24

    # Paths
    hdf5_path = os.path.join(workspace, 'features', 'logmel',
        'scene_type={},snr={}'.format(scene_type, snr), 'development.h5')

    yaml_path = os.path.join(workspace, 'mixture.yaml')

    audios_dir = os.path.join(workspace, 'mixed_audios',
        'scene_type={},snr={}'.format(scene_type, snr))

    # Load yaml file
    load_yaml_time = time.time()

    with open(yaml_path, 'r') as f:
        meta = yaml.load(f)

    print('Load yaml file time: {:.3f} s'.format(time.time() - load_yaml_time))

    # Data generator
    generator = InferenceDataGenerator(
        hdf5_path=hdf5_path,
        batch_size=batch_size,
        holdout_fold=holdout_fold)

    generate_func = generator.generate_validate(
        data_type='validate', shuffle=False, max_iteration=None)

    # Evaluate on mini-batch
    for (iteration, data) in enumerate(generate_func):
        print(iteration)
        (batch_x, batch_y, batch_audio_names) = data

        batch_x = move_data_to_gpu(batch_x, cuda)

        batch_gt_masks = []
        batch_single_gt_masks = []
        batch_mixture_stfts = []

        for n in range(len(batch_audio_names)):
            curr_meta = search_meta_by_mixture_name(meta,
                                                    batch_audio_names[n])
            curr_events = curr_meta['events']

            gt_indexes = get_ground_truth_indexes(curr_events)

            gt_sed = get_sed_from_meta(curr_events)  # (seq_len, classes_num)

            (events_stft, scene_stft, mixture_stft) = \
                generator.get_events_scene_mixture_stft(batch_audio_names[n])

            gt_mask = ideal_ratio_mask(events_stft, scene_stft)
            # (seq_len, fft_size)

            gt_masks = gt_mask[:, :, None] * gt_sed[:, None, :]
            # (seq_len, fft_size, classes_num)
            gt_masks = gt_masks.astype(np.float32)

            batch_gt_masks.append(gt_masks)
            batch_single_gt_masks.append(gt_mask)
            batch_mixture_stfts.append(mixture_stft)

        # Plot waveform & spectrogram & ideal ratio mask
        if True:
            for n in range(len(batch_x)):
                print(batch_audio_names[n])
                print(batch_y[n])
                target_labels = target_to_labels(batch_y[n], labels)
                print(target_labels)

                mixed_audio_path = os.path.join(audios_dir,
                                                batch_audio_names[n])
                (mixed_audio, _) = read_audio(mixed_audio_path,
                    target_fs=config.sample_rate, mono=True)
                mixed_audio /= np.max(np.abs(mixed_audio))

                fig, axs = plt.subplots(3, 1, figsize=(6, 6))

                axs[0].plot(mixed_audio)
                axs[0].set_title('Waveform')
                axs[0].xaxis.set_ticks([0, len(mixed_audio)])
                axs[0].xaxis.set_ticklabels(['0.0', '10.0 s'])
                axs[0].set_xlim(0, len(mixed_audio))
                axs[0].set_ylim(-1, 1)
                axs[0].set_xlabel('time')
                axs[0].set_ylabel('Amplitude')

                axs[1].matshow(np.log(batch_mixture_stfts[n]).T,
                    origin='lower', aspect='auto', cmap='jet')
                axs[1].set_title('Spectrogram')
                axs[1].xaxis.set_ticks([0, 310])
                axs[1].xaxis.set_ticklabels(['0.0', '10.0 s'])
                axs[1].xaxis.tick_bottom()
                axs[1].yaxis.set_ticks([0, 1024])
                axs[1].yaxis.set_ticklabels(['0', '1025'])
                axs[1].set_xlabel('time')
                axs[1].set_ylabel('FFT bins')

                axs[2].matshow(batch_single_gt_masks[n].T, origin='lower',
                    aspect='auto', cmap='jet')
                axs[2].set_title('Ideal ratio mask')
                axs[2].xaxis.set_ticks([0, 310])
                axs[2].xaxis.set_ticklabels(['0.0', '10.0 s'])
                axs[2].xaxis.tick_bottom()
                axs[2].yaxis.set_ticks([0, 1024])
                axs[2].yaxis.set_ticklabels(['0', '1025'])
                axs[2].set_xlabel('time')
                axs[2].set_ylabel('FFT bins')

                plt.tight_layout()
                plt.show()
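# `ideal_ratio_mask` is assumed to follow the usual definition: the fraction
# of the mixture magnitude attributed to the events at each time-frequency
# bin. A sketch under that assumption:
import numpy as np

def ideal_ratio_mask(events_stft, scene_stft):
    # Inputs: magnitude spectrograms of shape (seq_len, fft_bins);
    # output values lie in [0, 1]
    return events_stft / np.maximum(events_stft + scene_stft, 1e-10)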
def plot_mel_masks(args):
    # Arguments & parameters
    workspace = args.workspace
    holdout_fold = args.holdout_fold
    scene_type = args.scene_type
    snr = args.snr
    iteration = args.iteration
    model_type = args.model_type
    cuda = args.cuda

    labels = config.labels
    classes_num = len(labels)
    sample_rate = config.sample_rate
    window_size = config.window_size
    overlap = config.overlap
    hop_size = window_size - overlap
    mel_bins = config.mel_bins
    seq_len = config.seq_len
    ix_to_lb = config.ix_to_lb

    thres = 0.1
    batch_size = 24

    # Paths
    hdf5_path = os.path.join(workspace, 'features', 'logmel',
        'scene_type={},snr={}'.format(scene_type, snr), 'development.h5')

    model_path = os.path.join(workspace, 'models', 'main_pytorch',
        'model_type={}'.format(model_type),
        'scene_type={},snr={}'.format(scene_type, snr),
        'holdout_fold{}'.format(holdout_fold),
        'md_{}_iters.tar'.format(iteration))

    yaml_path = os.path.join(workspace, 'mixture.yaml')

    audios_dir = os.path.join(workspace, 'mixed_audios',
        'scene_type={},snr={}'.format(scene_type, snr))

    sep_wavs_dir = os.path.join(workspace, 'separated_wavs', 'main_pytorch',
        'model_type={}'.format(model_type),
        'scene_type={},snr={}'.format(scene_type, snr),
        'holdout_fold{}'.format(holdout_fold))
    create_folder(sep_wavs_dir)

    # Load yaml file
    load_yaml_time = time.time()

    with open(yaml_path, 'r') as f:
        meta = yaml.load(f)

    print('Load yaml file time: {:.3f} s'.format(time.time() - load_yaml_time))

    feature_extractor = LogMelExtractor(
        sample_rate=sample_rate,
        window_size=window_size,
        overlap=overlap,
        mel_bins=mel_bins)

    inverse_melW = feature_extractor.get_inverse_melW()

    # Load model
    Model = get_model(model_type)
    model = Model(classes_num, seq_len, mel_bins, cuda)
    checkpoint = torch.load(model_path)
    model.load_state_dict(checkpoint['state_dict'])

    if cuda:
        model.cuda()

    # Data generator
    generator = InferenceDataGenerator(
        hdf5_path=hdf5_path,
        batch_size=batch_size,
        holdout_fold=holdout_fold)

    generate_func = generator.generate_validate(
        data_type='validate', shuffle=False, max_iteration=None)

    # Evaluate on mini-batch
    for (iteration, data) in enumerate(generate_func):

        (batch_x, batch_y, batch_audio_names) = data

        batch_x = move_data_to_gpu(batch_x, cuda)

        # Predict
        with torch.no_grad():
            model.eval()
            (batch_output, batch_bottleneck) = model(
                batch_x, return_bottleneck=True)

        batch_output = batch_output.data.cpu().numpy()
        '''(batch_size, classes_num)'''

        batch_bottleneck = batch_bottleneck.data.cpu().numpy()
        '''(batch_size, classes_num, seq_len, mel_bins)'''

        batch_pred_sed = np.mean(batch_bottleneck, axis=-1)
        batch_pred_sed = np.transpose(batch_pred_sed, (0, 2, 1))
        '''(batch_size, seq_len, classes_num)'''

        batch_gt_masks = []

        for n in range(len(batch_audio_names)):
            curr_meta = search_meta_by_mixture_name(meta,
                                                    batch_audio_names[n])
            curr_events = curr_meta['events']

            pred_indexes = np.where(batch_output[n] > thres)[0]
            gt_indexes = get_ground_truth_indexes(curr_events)

            gt_sed = get_sed_from_meta(curr_events)  # (seq_len, classes_num)

            pred_sed = np.zeros((seq_len, classes_num))
            pred_sed[:, pred_indexes] = batch_pred_sed[n][:, pred_indexes]
            # (seq_len, classes_num)

            (events_stft, scene_stft, _) = \
                generator.get_events_scene_mixture_stft(batch_audio_names[n])

            events_stft = np.dot(events_stft, feature_extractor.melW)
            scene_stft = np.dot(scene_stft, feature_extractor.melW)

            gt_mask = ideal_binary_mask(events_stft, scene_stft)
            # (seq_len, fft_size)

            gt_masks = gt_mask[:, :, None] * gt_sed[:, None, :]
            # (seq_len, fft_size, classes_num)
            gt_masks = gt_masks.astype(np.float32)
            batch_gt_masks.append(gt_masks)

            pred_masks = batch_bottleneck[n].transpose(1, 2, 0)
            # (seq_len, fft_size, classes_num)

            # Save out separated audio
            if True:
                curr_audio_name = curr_meta['mixture_name']
                audio_path = os.path.join(audios_dir, curr_audio_name)
                (mixed_audio, fs) = read_audio(audio_path,
                    target_fs=sample_rate, mono=True)

                out_wav_path = os.path.join(sep_wavs_dir, curr_audio_name)
                write_audio(out_wav_path, mixed_audio, sample_rate)

                window = np.hamming(window_size)

                mixed_stft_cmplx = stft(x=mixed_audio,
                    window_size=window_size, hop_size=hop_size,
                    window=window, mode='complex')
                mixed_stft_cmplx = mixed_stft_cmplx[0 : seq_len, :]
                mixed_stft = np.abs(mixed_stft_cmplx)

                for k in gt_indexes:
                    masked_stft = np.dot(pred_masks[:, :, k], inverse_melW) \
                        * mixed_stft

                    masked_stft_cmplx = real_to_complex(masked_stft,
                                                        mixed_stft_cmplx)

                    frames = istft(masked_stft_cmplx)

                    cola_constant = get_cola_constant(hop_size, window)
                    sep_audio = overlap_add(frames, hop_size, cola_constant)

                    sep_wav_path = os.path.join(sep_wavs_dir,
                        '{}_{}.wav'.format(
                            os.path.splitext(curr_audio_name)[0],
                            ix_to_lb[k]))
                    write_audio(sep_wav_path, sep_audio, sample_rate)
                    print('Audio wrote to {}'.format(sep_wav_path))

        # Visualize learned representations
        if True:
            for n in range(len(batch_output)):
                # Plot segmentation masks. (00013.wav is used for the plot
                # in the paper)
                print('audio_name: {}'.format(batch_audio_names[n]))
                print('target: {}'.format(batch_y[n]))
                target_labels = target_to_labels(batch_y[n], labels)
                print('target labels: {}'.format(target_labels))

                (events_stft, scene_stft, _) = \
                    generator.get_events_scene_mixture_stft(
                        batch_audio_names[n])

                fig, axs = plt.subplots(7, 7, figsize=(15, 10))

                for k in range(classes_num):
                    axs[k // 6, k % 6].matshow(batch_bottleneck[n, k].T,
                        origin='lower', aspect='auto', cmap='jet')
                    if labels[k] in target_labels:
                        color = 'r'
                    else:
                        color = 'k'
                    axs[k // 6, k % 6].set_title(labels[k], color=color)
                    axs[k // 6, k % 6].xaxis.set_ticks([])
                    axs[k // 6, k % 6].yaxis.set_ticks([])
                    axs[k // 6, k % 6].set_xlabel('time')
                    axs[k // 6, k % 6].set_ylabel('mel bins')

                axs[6, 5].matshow(np.log(events_stft + 1e-8).T,
                    origin='lower', aspect='auto', cmap='jet')
                axs[6, 5].set_title('Spectrogram (in log scale)')
                axs[6, 5].xaxis.set_ticks([0, 310])
                axs[6, 5].xaxis.set_ticklabels(['0.0', '10.0 s'])
                axs[6, 5].xaxis.tick_bottom()
                axs[6, 5].yaxis.set_ticks([0, 1024])
                axs[6, 5].yaxis.set_ticklabels(['0', '1025'])
                axs[6, 5].set_xlabel('time')
                axs[6, 5].set_ylabel('FFT bins')

                axs[6, 6].matshow(
                    np.log(np.dot(events_stft, feature_extractor.melW)
                           + 1e-8).T,
                    origin='lower', aspect='auto', cmap='jet')
                axs[6, 6].set_title('Log mel spectrogram')
                axs[6, 6].xaxis.set_ticks([0, 310])
                axs[6, 6].xaxis.set_ticklabels(['0.0', '10.0 s'])
                axs[6, 6].xaxis.tick_bottom()
                axs[6, 6].yaxis.set_ticks([0, 63])
                axs[6, 6].yaxis.set_ticklabels(['0', '64'])
                axs[6, 6].set_xlabel('time')
                axs[6, 6].set_ylabel('mel bins')

                plt.tight_layout(pad=0.5, w_pad=0.5, h_pad=0.5)
                plt.show()

                # Plot frame-wise SED
                fig, ax = plt.subplots(1, 1, figsize=(4, 4))
                score_mat = []
                for k in range(classes_num):
                    score = np.mean(batch_bottleneck[n, k], axis=-1)
                    score_mat.append(score)
                score_mat = np.array(score_mat)
                ax.matshow(score_mat, origin='lower', aspect='auto',
                           cmap='jet')
                ax.set_title('Frame-wise predictions')
                ax.xaxis.set_ticks([0, 310])
                ax.xaxis.set_ticklabels(['0.0', '10.0 s'])
                ax.xaxis.tick_bottom()
                ax.set_xlabel('time')
                ax.yaxis.set_ticks(np.arange(classes_num))
                ax.yaxis.set_ticklabels(config.labels, fontsize='xx-small')
                ax.yaxis.grid(color='k', linestyle='solid', linewidth=0.3)
                plt.tight_layout(pad=0.5, w_pad=0.5, h_pad=0.5)
                plt.show()

                # Plot event-wise SED
                est_event_list = get_est_event_list(batch_pred_sed[n:n+1],
                    batch_audio_names[n:n+1], labels)
                event_mat = event_list_to_matrix(est_event_list)
                fig, ax = plt.subplots(1, 1, figsize=(4, 4))
                ax.matshow(event_mat.T, origin='lower', aspect='auto',
                           cmap='jet')
                ax.set_title('Event-wise predictions')
                ax.xaxis.set_ticks([0, 310])
                ax.xaxis.set_ticklabels(['0.0', '10.0 s'])
                ax.xaxis.tick_bottom()
                ax.set_xlabel('time')
                ax.yaxis.set_ticks(np.arange(classes_num))
                ax.yaxis.set_ticklabels(config.labels, fontsize='xx-small')
                ax.yaxis.grid(color='k', linestyle='solid', linewidth=0.3)
                plt.tight_layout(pad=0.5, w_pad=0.5, h_pad=0.5)
                plt.show()

                # Plot event-wise ground truth
                ref_event_list = get_ref_event_list(meta,
                    batch_audio_names[n:n+1])
                event_mat = event_list_to_matrix(ref_event_list)
                fig, ax = plt.subplots(1, 1, figsize=(4, 4))
                ax.matshow(event_mat.T, origin='lower', aspect='auto',
                           cmap='jet')
                ax.set_title('Event-wise ground truth')
                ax.xaxis.set_ticks([0, 310])
                ax.xaxis.set_ticklabels(['0.0', '10.0 s'])
                ax.xaxis.tick_bottom()
                ax.set_xlabel('time')
                ax.yaxis.set_ticks(np.arange(classes_num))
                ax.yaxis.set_ticklabels(config.labels, fontsize='xx-small')
                ax.yaxis.grid(color='k', linestyle='solid', linewidth=0.3)
                plt.tight_layout(pad=0.5, w_pad=0.5, h_pad=0.5)
                plt.show()
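# `real_to_complex` above is assumed to attach the mixture phase to the
# masked magnitude before the inverse STFT, the usual way to resynthesize a
# source from a magnitude mask; a sketch under that assumption:
import numpy as np

def real_to_complex(magnitude, reference_cmplx):
    # Combine the masked magnitude with the phase of the complex mixture
    return magnitude * np.exp(1j * np.angle(reference_cmplx))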
def calculate_feature_for_all_audio_files(args):
    '''Calculate feature of audio files and write out features to a hdf5
    file.

    Args:
      dataset_dir: string
      workspace: string
      data_type: 'train_curated', 'train_noisy', 'test'
      mini_data: bool, set True for debugging on a small part of data
    '''

    # Arguments & parameters
    dataset_dir = DATASET_DIR
    workspace = WORKSPACE
    data_type = args.data_type
    mini_data = args.mini_data

    sample_rate = config.sample_rate
    window_size = config.window_size
    hop_size = config.hop_size
    mel_bins = config.mel_bins
    fmin = config.fmin
    fmax = config.fmax
    frames_per_second = config.frames_per_second
    lb_to_idx = config.lb_to_idx

    # Paths
    if mini_data:
        prefix = 'minidata_'
    else:
        prefix = ''

    if data_type in ['train_curated', 'train_noisy']:
        metadata_path = os.path.join(dataset_dir, '{}.csv'.format(data_type))
    else:
        pass

    audios_dir = os.path.join(dataset_dir, data_type)

    feature_path = os.path.join(
        workspace, 'features',
        '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second,
                                             mel_bins),
        '{}.h5'.format(data_type))
    create_folder(os.path.dirname(feature_path))

    # Read meta data
    if data_type in ['train_curated', 'train_noisy']:
        meta_dict = read_metadata(metadata_path, lb_to_idx)
    elif data_type == 'test':
        meta_dict = {'audio_name': np.array(sorted(os.listdir(audios_dir)))}

    # Feature extractor
    feature_extractor = LogMelExtractor(
        sample_rate=sample_rate,
        window_size=window_size,
        hop_size=hop_size,
        mel_bins=mel_bins,
        fmin=fmin,
        fmax=fmax)

    if mini_data:
        mini_num = 100
        total_num = len(meta_dict['audio_name'])
        random_state = np.random.RandomState(1234)
        indexes = random_state.choice(total_num, size=mini_num,
                                      replace=False)
        meta_dict['audio_name'] = meta_dict['audio_name'][indexes]
        if 'target' in meta_dict:
            meta_dict['target'] = meta_dict['target'][indexes]

    # Hdf5 file for storing features and targets
    print('Extracting features of all audio files ...')
    extract_time = time.time()

    audios_num = len(meta_dict['audio_name'])

    hf = h5py.File(feature_path, 'w')

    hf.create_dataset(
        name='audio_name',
        data=[audio_name.encode() for audio_name in meta_dict['audio_name']],
        dtype='S20')

    if 'target' in meta_dict:
        hf.create_dataset(name='target', data=meta_dict['target'],
                          dtype=np.bool)

    hf.create_dataset(name='feature', shape=(0, mel_bins),
                      maxshape=(None, mel_bins), dtype=np.float32)

    hf.create_dataset(name='begin_index', shape=(audios_num,),
                      dtype=np.int32)

    hf.create_dataset(name='end_index', shape=(audios_num,), dtype=np.int32)

    for (n, audio_name) in enumerate(meta_dict['audio_name']):
        audio_path = os.path.join(audios_dir, audio_name)
        print(n, audio_path)

        # Read audio
        (audio, _) = read_audio(audio_path=audio_path,
                                target_fs=sample_rate)

        # Extract feature
        feature = feature_extractor.transform(audio)
        print(feature.shape)

        begin_index = hf['feature'].shape[0]
        end_index = begin_index + feature.shape[0]

        hf['feature'].resize((end_index, mel_bins))
        hf['feature'][begin_index:end_index, :] = feature
        hf['begin_index'][n] = begin_index
        hf['end_index'][n] = end_index

    hf.close()

    print('Write hdf5 file to {} using {:.3f} s'.format(
        feature_path, time.time() - extract_time))
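# Clips have different lengths here, so features are stored as one
# concatenated (frames, mel_bins) matrix plus per-clip begin/end indexes.
# Reading one clip back (the path and clip index are illustrative):
import h5py

with h5py.File('features/logmel_100frames_64melbins/train_curated.h5',
               'r') as hf:
    n = 0  # clip index
    feature = hf['feature'][hf['begin_index'][n]:hf['end_index'][n]]
    # feature: (clip_frames, mel_bins)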
def calculate_feature_for_all_audio_files(args):
    '''Calculate feature of audio files and write out features to a hdf5
    file.

    Args:
      dataset_dir: string
      workspace: string
      subtask: 'a' | 'b' | 'c'
      data_type: 'development' | 'evaluation'
      mini_data: bool, set True for debugging on a small part of data
    '''

    # Arguments & parameters
    # dataset_dir = args.dataset_dir
    # workspace = args.workspace
    # subtask = args.subtask
    # data_type = args.data_type
    # mini_data = args.mini_data

    # test 1: hard-coded arguments for local debugging
    dataset_dir = 'D:/Project/DCASE_test/Data'
    workspace = 'D:/Project/DCASE_test'
    subtask = 'a'
    data_type = 'development'
    mini_data = True

    sample_rate = config.sample_rate
    window_size = config.window_size
    hop_size = config.hop_size
    mel_bins = config.mel_bins
    fmin = config.fmin
    fmax = config.fmax
    frames_per_second = config.frames_per_second
    frames_num = config.frames_num
    total_samples = config.total_samples
    lb_to_idx = config.lb_to_idx

    # Paths
    if mini_data:
        prefix = 'minidata_'
    else:
        prefix = ''

    sub_dir = get_subdir(subtask, data_type)
    audios_dir = os.path.join(dataset_dir, sub_dir, 'audio')

    if data_type == 'development':
        metadata_path = os.path.join(dataset_dir, sub_dir, 'meta.csv')
    elif data_type == 'leaderboard':
        metadata_path = os.path.join(dataset_dir, sub_dir,
                                     'evaluation_setup', 'test.csv')
    else:
        raise Exception('Incorrect data_type!')

    feature_path = os.path.join(
        workspace, 'features',
        '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second,
                                             mel_bins),
        '{}.h5'.format(sub_dir))
    create_folder(os.path.dirname(feature_path))

    # Feature extractor
    feature_extractor = LogMelExtractor(
        sample_rate=sample_rate,
        window_size=window_size,
        hop_size=hop_size,
        mel_bins=mel_bins,
        fmin=fmin,
        fmax=fmax)

    # Read metadata
    meta_dict = read_metadata(metadata_path)

    # Extract features and targets
    if mini_data:
        mini_num = 20
        total_num = len(meta_dict['audio_name'])
        random_state = np.random.RandomState(1234)
        indexes = random_state.choice(total_num, size=mini_num,
                                      replace=False)
        for key in meta_dict.keys():
            meta_dict[key] = meta_dict[key][indexes]

    print('Extracting features of all audio files ...')
    extract_time = time.time()

    # Hdf5 file for storing features and targets
    hf = h5py.File(feature_path, 'w')

    hf.create_dataset(
        name='audio_name',
        data=[audio_name.encode() for audio_name in meta_dict['audio_name']],
        dtype='S80')

    if 'scene_label' in meta_dict.keys():
        hf.create_dataset(
            name='scene_label',
            data=[scene_label.encode()
                  for scene_label in meta_dict['scene_label']],
            dtype='S24')

    if 'identifier' in meta_dict.keys():
        hf.create_dataset(
            name='identifier',
            data=[identifier.encode()
                  for identifier in meta_dict['identifier']],
            dtype='S24')

    if 'source_label' in meta_dict.keys():
        hf.create_dataset(
            name='source_label',
            data=[source_label.encode()
                  for source_label in meta_dict['source_label']],
            dtype='S8')

    hf.create_dataset(
        name='feature',
        shape=(0, frames_num, mel_bins),
        maxshape=(None, frames_num, mel_bins),
        dtype=np.float32)

    for (n, audio_name) in enumerate(meta_dict['audio_name']):
        audio_path = os.path.join(audios_dir, audio_name)
        print(n, audio_path)

        # Read audio
        (audio, _) = read_audio(audio_path=audio_path,
                                target_fs=sample_rate)

        # Pad or truncate audio recording to the same length
        audio = pad_truncate_sequence(audio, total_samples)

        # Extract feature
        feature = feature_extractor.transform(audio)

        # Remove the extra log mel spectrogram frames caused by padding zero
        feature = feature[0 : frames_num]

        hf['feature'].resize((n + 1, frames_num, mel_bins))
        hf['feature'][n] = feature

    hf.close()

    print('Write hdf5 file to {} using {:.3f} s'.format(
        feature_path, time.time() - extract_time))
def create_mixture_yaml(args):
    """Create mixture yaml file containing a list of mixture information.
    Each entry looks like:

    - events:
      - event_audio_name: 19f45b13.wav
        event_label: Tambourine
        offset: 1.22
        onset: 0.5
      - event_audio_name: 63874688.wav
        event_label: Scissors
        offset: 3.38
        onset: 3.0
      - event_audio_name: cd3e20ec.wav
        event_label: Computer_keyboard
        offset: 7.5
        onset: 5.5
      fold: 1
      mixture_name: 00000.wav
      scene_audio_name: metro_station-barcelona-62-1861-a.wav
    """

    # Arguments & parameters
    dcase2018_task1_dataset_dir = args.dcase2018_task1_dataset_dir
    dcase2018_task2_dataset_dir = args.dcase2018_task2_dataset_dir
    workspace = args.workspace

    random_state = np.random.RandomState(1234)
    folds = [1, 2, 3, 4]
    mixed_audios_per_fold = 2000
    events_per_clip = 3
    total_events_per_fold = mixed_audios_per_fold * events_per_clip

    # Paths
    dcase2018_task1_meta = os.path.join(dcase2018_task1_dataset_dir,
                                        'meta.csv')
    dcase2018_task2_meta = os.path.join(
        workspace, 'dcase2018_task2_validate_meta.csv')

    out_yaml_path = os.path.join(workspace, 'mixture.yaml')
    create_folder(os.path.dirname(out_yaml_path))

    # DCASE 2018 Task 1 acoustic scenes meta
    df_scenes = pd.read_csv(dcase2018_task1_meta, sep='\t')
    scene_names = np.array(df_scenes['filename'])
    random_state.shuffle(scene_names)

    # DCASE 2018 Task 2 sound events meta
    df_events = pd.read_csv(dcase2018_task2_meta, sep=',')
    events_audio_num = len(df_events)

    acoustic_scene_index = 0
    data_list = []

    # Calculate mixture meta
    for fold in folds:

        # Indexes of selected audios
        bool_selected = (df_events['fold'] == fold) & \
            (df_events['manually_verified'] == 1)
        selected_event_indexes = np.arange(events_audio_num)[bool_selected]

        repeated_event_indexes = repeat_array(
            array=selected_event_indexes,
            max_len=total_events_per_fold,
            random_state=random_state)

        for n in range(mixed_audios_per_fold):
            if acoustic_scene_index % 100 == 0:
                print('Fold {}, {} / {} mixture infos created'.format(
                    fold, acoustic_scene_index,
                    mixed_audios_per_fold * len(folds)))

            event_indexes_for_one_clip = repeated_event_indexes[
                n * events_per_clip:(n + 1) * events_per_clip]

            events = []

            for j, index in enumerate(event_indexes_for_one_clip):
                event_audio_name = df_events.fname[index]
                event_label = df_events.label[index]

                # Onsets of events are 0.5 s, 3.0 s and 5.5 s in a clip
                onset = j * 2.5 + 0.5

                event_audio_path = os.path.join(dcase2018_task2_dataset_dir,
                                                'audio_train',
                                                event_audio_name)
                (audio, fs) = read_audio(event_audio_path)
                audio_duration = len(audio) / float(fs)

                # Clip the maximum event duration to 2.0 s
                audio_duration = min(audio_duration, 2.0)

                offset = onset + audio_duration

                events.append({
                    'event_audio_name': event_audio_name,
                    'event_label': event_label,
                    'onset': onset,
                    'offset': offset})

            scene_audio_name = \
                scene_names[acoustic_scene_index].split('/')[1]

            data = {
                'mixture_name': '{:05d}.wav'.format(acoustic_scene_index),
                'fold': fold,
                'events': events,
                'scene_audio_name': scene_audio_name}

            data_list.append(data)
            acoustic_scene_index += 1

    # Write out yaml file
    with open(out_yaml_path, 'w') as f:
        yaml.dump(data_list, f, default_flow_style=False)

    print('Write out mixture yaml to {}'.format(out_yaml_path))
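# `repeat_array` is assumed to tile the selected indexes (reshuffling on
# each pass) until `max_len` entries are available, so every verified event
# clip is used before any is repeated; a sketch under that assumption:
import numpy as np

def repeat_array(array, max_len, random_state):
    repeated = []
    while len(repeated) < max_len:
        shuffled = np.array(array, copy=True)
        random_state.shuffle(shuffled)
        repeated.extend(shuffled)
    return np.array(repeated[0:max_len])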
def logmel(args):
    # Arguments & parameters
    dataset_dir = args.dataset_dir
    workspace = args.workspace

    sample_rate = config.sample_rate
    window_size = config.window_size
    overlap = config.overlap
    seq_len = config.seq_len
    mel_bins = config.mel_bins

    # Paths
    audio_dir = os.path.join(dataset_dir, 'wav')
    validation_csv_path = os.path.join(workspace, 'validation.csv')

    hdf5_path = os.path.join(workspace, 'features', 'logmel', 'dev.h5')
    create_folder(os.path.dirname(hdf5_path))

    # Load data
    df = pd.read_csv(validation_csv_path)
    df = pd.DataFrame(df)
    audio_num = len(df)

    feature_extractor = LogMelExtractor(
        sample_rate=sample_rate,
        window_size=window_size,
        overlap=overlap,
        mel_bins=mel_bins)

    begin_time = time.time()

    # Write out features to hdf5
    with h5py.File(hdf5_path, 'w') as hf:

        dt = h5py.special_dtype(vlen=str)

        # Reserve space
        hf.create_dataset(name='feature',
                          shape=(audio_num, seq_len, mel_bins),
                          dtype=np.float32)
        hf.create_dataset(name='itemid', shape=(audio_num,), dtype='S50')
        hf.create_dataset(name='datasetid', shape=(audio_num,), dtype='S20')
        hf.create_dataset(name='hasbird', shape=(audio_num,),
                          dtype=np.int32)
        hf.create_dataset(name='fold', shape=(audio_num,), dtype=np.int32)

        n = 0
        for row in df.iterrows():
            itemid = row[1]['itemid']
            datasetid = row[1]['datasetid']
            hasbird = row[1]['hasbird']
            fold = row[1]['fold']
            print(n, itemid)

            # Calculate feature
            audio_path = os.path.join(audio_dir, '{}.wav'.format(itemid))
            (audio, fs) = read_audio(audio_path, target_fs=sample_rate)
            feature = feature_extractor.transform(audio)
            feature = pad_or_trunc(feature, seq_len)

            hf['feature'][n] = feature
            hf['itemid'][n] = itemid.encode()
            hf['datasetid'][n] = datasetid.encode()
            hf['hasbird'][n] = hasbird
            hf['fold'][n] = fold

            if False:
                print(n, itemid, datasetid, hasbird)
                plt.matshow(feature.T, origin='lower', aspect='auto',
                            cmap='jet')
                plt.show()

            n += 1

    print('Write out to {}'.format(hdf5_path))
    print('Time: {} s'.format(time.time() - begin_time))
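# `pad_or_trunc` is assumed to fix the number of feature frames by zero
# padding or truncating along the time axis; a sketch under that assumption:
import numpy as np

def pad_or_trunc(feature, seq_len):
    # feature: (frames, mel_bins) -> (seq_len, mel_bins)
    if len(feature) >= seq_len:
        return feature[0:seq_len]
    pad = np.zeros((seq_len - len(feature), feature.shape[1]),
                   dtype=feature.dtype)
    return np.concatenate((feature, pad), axis=0)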
def calculate_logmel_features(config):
    # Arguments & parameters
    workspace = config.workspace
    sample_rate = config.sr
    window_size = config.window_size
    overlap = config.overlap
    seq_len = config.seq_len
    mel_bins = config.mel_bins
    stft_bins = window_size // 2 + 1
    classes_num = len(config.labels)
    lb_to_ix = config.lb_to_ix

    # Paths
    audio_dir = config.audio_dir
    yaml_path = config.out_yaml_path
    hdf5_path = config.h5_path
    create_folder(hdf5_path.parents[0])

    # Load yaml
    load_time = time.time()

    with open(yaml_path, 'r') as f:
        data_list = yaml.load(f)

    logging.info('Loading yaml time: {} s'.format(time.time() - load_time))

    # Feature extractor
    feature_extractor = LogMelExtractor(
        sample_rate=sample_rate,
        window_size=window_size,
        overlap=overlap,
        mel_bins=mel_bins)

    # Create hdf5 file
    write_hdf5_time = time.time()
    hf = h5py.File(hdf5_path, 'w')

    hf.create_dataset(name='logmel',
                      shape=(0, seq_len, mel_bins),
                      maxshape=(None, seq_len, mel_bins),
                      dtype=np.float32)

    hf.create_dataset(name='target',
                      shape=(0, classes_num),
                      maxshape=(None, classes_num),
                      dtype=np.int32)

    audio_names = []
    folds = []
    item_counts = 0

    for n, data in enumerate(data_list):
        if n % 10 == 0:
            logging.info('{} / {} audio features calculated'
                ''.format(n, len(data_list)))

        audio_path = audio_dir / f'{data["fname"]}'

        # Read audio
        (audio, fs) = read_audio(audio_path, target_fs=config.sr, mono=True)

        # Split the clip into non-overlapping segments of `period` seconds
        for i in range(0, len(audio), config.sr * config.period):
            start = i
            end = i + config.sr * config.period
            audio_segment = audio[start:end]

            audio_names.append(data['fname'] + '-' + str(i / config.sr))
            folds.append(data['fold'])

            # Extract feature
            features_dict = calculate_logmel(
                audio_segment, feature_extractor=feature_extractor)

            # Write out features
            hf['logmel'].resize((item_counts + 1, seq_len, mel_bins))
            hf['logmel'][item_counts] = features_dict['logmel']

            # Write out target
            target = get_target_from_events(
                data['events'], lb_to_ix, start / config.sr,
                end / config.sr)
            hf['target'].resize((item_counts + 1, classes_num))
            hf['target'][item_counts] = target

            item_counts += 1

    hf.create_dataset(name='audio_name',
                      data=[s.encode() for s in audio_names],
                      dtype='S40')

    hf.create_dataset(name='fold', data=folds, dtype=np.int32)

    hf.close()

    logging.info('Write out hdf5 file to {}'.format(hdf5_path))
    logging.info('Time spent: {} s'.format(time.time() - write_hdf5_time))
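# `get_target_from_events` is assumed to build a multi-hot vector over the
# classes whose annotated events overlap the [start, end) segment; a sketch
# under that assumption (the event key names are assumptions as well):
import numpy as np

def get_target_from_events(events, lb_to_ix, start, end):
    target = np.zeros(len(lb_to_ix), dtype=np.int32)
    for event in events:
        if event['onset'] < end and event['offset'] > start:
            target[lb_to_ix[event['event_label']]] = 1
    return target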
def calculate_feature_for_all_audio_files(args):
    '''Calculate feature of audio files and write out features to a hdf5
    file.

    Args:
      dataset_dir: string
      workspace: string
      subtask: 'a' | 'b' | 'c'
      data_type: 'development' | 'evaluation'
      mini_data: bool, set True for debugging on a small part of data
    '''

    # Arguments & parameters
    dataset_dir = args.dataset_dir
    workspace = args.workspace
    subtask = args.subtask
    data_type = args.data_type
    mini_data = args.mini_data

    sample_rate = config.sample_rate
    window_size = config.window_size
    hop_size = config.hop_size
    mel_bins = config.mel_bins
    fmin = config.fmin
    fmax = config.fmax
    frames_per_second = config.frames_per_second
    frames_num = config.frames_num
    total_samples = config.total_samples
    lb_to_idx = config.lb_to_idx
    mfcc_frames = config.mfcc_frames
    n_mfcc = config.n_mfcc
    mfcc_hop_size = config.mfcc_hop_size
    gamm_frames = config.gamm_frames
    n_gamm = config.n_gamm

    # Paths
    if mini_data:
        prefix = 'minidata_'
    else:
        prefix = ''

    sub_dir = get_subdir(subtask, data_type)
    audios_dir = os.path.join(dataset_dir, sub_dir, 'audio')

    if data_type == 'development':
        metadata_path = os.path.join(dataset_dir, sub_dir, 'meta.csv')
    elif data_type == 'leaderboard':
        metadata_path = os.path.join(dataset_dir, sub_dir,
                                     'evaluation_setup', 'test.csv')
    elif data_type == 'evaluation':
        metadata_path = os.path.join(dataset_dir, sub_dir,
                                     'evaluation_setup', 'fold1_test.csv')
    else:
        raise Exception('Incorrect data_type!')

    feature_path = os.path.join(
        workspace, 'features',
        '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second,
                                             mel_bins),
        '{}.h5'.format(sub_dir))
    create_folder(os.path.dirname(feature_path))

    # Feature extractor
    feature_extractor = LogMelExtractor(
        sample_rate=sample_rate,
        window_size=window_size,
        hop_size=hop_size,
        mel_bins=mel_bins,
        fmin=fmin,
        fmax=fmax)

    # Read metadata
    meta_dict = read_metadata(metadata_path)

    # Extract features and targets
    if mini_data:
        mini_num = 10
        total_num = len(meta_dict['audio_name'])
        random_state = np.random.RandomState(1234)
        indexes = random_state.choice(total_num, size=mini_num,
                                      replace=False)
        for key in meta_dict.keys():
            meta_dict[key] = meta_dict[key][indexes]

    print('Extracting features of all audio files ...')
    extract_time = time.time()

    # Hdf5 file for storing features and targets
    hf = h5py.File(feature_path, 'w')

    hf.create_dataset(
        name='audio_name',
        data=[audio_name.encode() for audio_name in meta_dict['audio_name']],
        dtype='S80')

    if 'scene_label' in meta_dict.keys():
        hf.create_dataset(
            name='scene_label',
            data=[scene_label.encode()
                  for scene_label in meta_dict['scene_label']],
            dtype='S24')

    if 'identifier' in meta_dict.keys():
        hf.create_dataset(
            name='identifier',
            data=[identifier.encode()
                  for identifier in meta_dict['identifier']],
            dtype='S24')

    if 'source_label' in meta_dict.keys():
        hf.create_dataset(
            name='source_label',
            data=[source_label.encode()
                  for source_label in meta_dict['source_label']],
            dtype='S8')

    hf.create_dataset(
        name='feature',
        shape=(0, total_samples),
        maxshape=(None, total_samples),
        dtype=np.float32)

    hf.create_dataset(
        name='feature_gamm',
        shape=(0, gamm_frames, n_gamm),
        maxshape=(None, gamm_frames, n_gamm),
        dtype=np.float32)

    hf.create_dataset(
        name='feature_mfcc',
        shape=(0, mfcc_frames, n_mfcc),
        maxshape=(None, mfcc_frames, n_mfcc),
        dtype=np.float32)

    hf.create_dataset(
        name='feature_panns',
        shape=(0, 320000),
        maxshape=(None, 320000),
        dtype=np.float32)

    for (n, audio_name) in enumerate(meta_dict['audio_name']):
        audio_path = os.path.join(audios_dir, audio_name)
        print(n, audio_path)

        # Raw waveform, truncated to 10 s
        (audio, _) = read_audio(audio_path=audio_path,
                                target_fs=sample_rate)
        audio = audio[:sample_rate * 10]

        # Gammatone feature
        (audio_gamm, _) = read_audio_gamm(audio_path=audio_path,
                                          target_fs=sample_rate)
        fea_gamm, _ = gtg_in_dB(audio_gamm, sample_rate)
        fea_gamm = fea_gamm.transpose(1, 0)

        # MFCC feature
        sound, fs = librosa.load(audio_path)
        fea_mfcc = librosa.feature.mfcc(y=sound, sr=fs,
                                        hop_length=mfcc_hop_size,
                                        n_mfcc=n_mfcc)
        fea_mfcc = fea_mfcc.transpose(1, 0)

        # Waveform at 32 kHz for PANNs
        (waveform, _) = librosa.core.load(audio_path, sr=32000, mono=True)
        waveform = waveform[:320000]

        hf['feature'].resize((n + 1, total_samples))
        hf['feature'][n] = audio
        hf['feature_gamm'].resize((n + 1, gamm_frames, n_gamm))
        hf['feature_gamm'][n] = fea_gamm
        hf['feature_mfcc'].resize((n + 1, mfcc_frames, n_mfcc))
        hf['feature_mfcc'][n] = fea_mfcc
        hf['feature_panns'].resize((n + 1, 320000))
        hf['feature_panns'][n] = waveform

    hf.close()

    print('Write hdf5 file to {} using {:.3f} s'.format(
        feature_path, time.time() - extract_time))
def calculate_feature_for_all_audio_files(args):
    '''Calculate feature of audio files and write out features to a single
    hdf5 file.

    Args:
      dataset_dir: string
      workspace: string
      data_type: 'development' | 'evaluation'
      mini_data: bool, set True for debugging on a small part of data
    '''

    # Arguments & parameters
    dataset_dir = args.dataset_dir
    workspace = args.workspace
    data_type = args.data_type
    mini_data = args.mini_data

    sample_rate = config.sample_rate
    window_size = config.window_size
    hop_size = config.hop_size
    mel_bins = config.mel_bins
    fmin = config.fmin
    fmax = config.fmax
    frames_per_second = config.frames_per_second
    frames_num = config.frames_num
    total_samples = config.total_samples
    classes_num = config.classes_num
    lb_to_idx = config.lb_to_idx

    # Paths
    if mini_data:
        prefix = 'minidata_'
    else:
        prefix = ''

    relative_name = get_relative_path_no_extension(data_type)
    audios_dir = os.path.join(dataset_dir, 'audio', relative_name)

    if data_type == 'validation':
        metadata_path = os.path.join(dataset_dir, 'metadata', 'validation',
                                     '{}.csv'.format(relative_name))
    else:
        metadata_path = os.path.join(dataset_dir, 'metadata',
                                     '{}.csv'.format(relative_name))

    feature_path = os.path.join(
        workspace, 'features',
        '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second,
                                             mel_bins),
        '{}.h5'.format(relative_name))
    create_folder(os.path.dirname(feature_path))

    # Feature extractor
    feature_extractor = LogMelExtractor(
        sample_rate=sample_rate,
        window_size=window_size,
        hop_size=hop_size,
        mel_bins=mel_bins,
        fmin=fmin,
        fmax=fmax)

    # Read metadata
    (data_dict, has_weak_labels, has_strong_labels) = \
        read_metadata(metadata_path)

    # Extract features and targets
    audio_names = sorted([*data_dict.keys()])

    if mini_data:
        random_state = np.random.RandomState(1234)
        random_state.shuffle(audio_names)
        audio_names = audio_names[0:10]

    print('Extracting features of all audio files ...')
    extract_time = time.time()

    # Hdf5 file for storing features and targets
    hf = h5py.File(feature_path, 'w')

    hf.create_dataset(
        name='audio_name',
        data=[audio_name.encode() for audio_name in audio_names],
        dtype='S64')

    hf.create_dataset(
        name='feature',
        shape=(0, frames_num, mel_bins),
        maxshape=(None, frames_num, mel_bins),
        dtype=np.float32)

    if has_weak_labels:
        hf.create_dataset(
            name='weak_target',
            shape=(0, classes_num),
            maxshape=(None, classes_num),
            dtype=np.bool)

    if has_strong_labels:
        hf.create_dataset(
            name='strong_target',
            shape=(0, frames_num, classes_num),
            maxshape=(None, frames_num, classes_num),
            dtype=np.bool)

    for (n, audio_name) in enumerate(audio_names):
        audio_path = os.path.join(audios_dir, audio_name)
        print(n, audio_path)

        # Read audio
        (audio, _) = read_audio(audio_path=audio_path,
                                target_fs=sample_rate)

        # Pad or truncate audio recording
        audio = pad_truncate_sequence(audio, total_samples)

        # Extract feature
        feature = feature_extractor.transform(audio)

        # Remove the extra frames caused by padding zero
        feature = feature[0:frames_num]

        hf['feature'].resize((n + 1, frames_num, mel_bins))
        hf['feature'][n] = feature

        if has_weak_labels:
            weak_labels = data_dict[audio_name]['weak_labels']
            hf['weak_target'].resize((n + 1, classes_num))
            hf['weak_target'][n] = labels_to_target(weak_labels,
                                                    classes_num, lb_to_idx)

        if has_strong_labels:
            events = data_dict[audio_name]['strong_labels']
            hf['strong_target'].resize((n + 1, frames_num, classes_num))
            hf['strong_target'][n] = events_to_target(
                events=events,
                frames_num=frames_num,
                classes_num=classes_num,
                frames_per_second=frames_per_second,
                lb_to_idx=lb_to_idx)

    hf.close()

    print('Write hdf5 file to {} using {:.3f} s'.format(
        feature_path, time.time() - extract_time))
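# `events_to_target` is assumed to rasterize strong labels into a
# (frames_num, classes_num) boolean matrix at the configured frame rate; a
# sketch under that assumption:
import numpy as np

def events_to_target(events, frames_num, classes_num, frames_per_second,
                     lb_to_idx):
    target = np.zeros((frames_num, classes_num), dtype=bool)
    for event in events:
        begin_frame = int(round(event['onset'] * frames_per_second))
        end_frame = int(round(event['offset'] * frames_per_second)) + 1
        k = lb_to_idx[event['event_label']]
        target[begin_frame:min(end_frame, frames_num), k] = True
    return target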
def calculate_feature_for_all_audio_files(args):
    '''Calculate feature of audio files and write out features to a single
    hdf5 file.

    Args:
      dataset_dir: string
      workspace: string
      data_type: 'train' | 'validate' | 'evaluate'
      mini_data: bool, set True for debugging on a small part of data
    '''

    # Arguments & parameters
    dataset_dir = args.dataset_dir
    data_type = args.data_type
    workspace = args.workspace
    mini_data = args.mini_data

    sample_rate = config.sample_rate
    window_size = config.window_size
    hop_size = config.hop_size
    mel_bins = config.mel_bins
    fmin = config.fmin
    fmax = config.fmax
    frames_per_second = config.frames_per_second
    frames_num = config.frames_num
    total_samples = config.total_samples

    # Paths
    if mini_data:
        prefix = 'minidata_'
    else:
        prefix = ''

    metadata_path = os.path.join(dataset_dir, 'annotations.csv')

    if data_type in ['train', 'validate']:
        audios_dir = os.path.join(dataset_dir, data_type)
    elif data_type == 'evaluate':
        audios_dir = os.path.join(dataset_dir, 'audio-eval')

    feature_path = os.path.join(
        workspace, 'features',
        '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second,
                                             mel_bins),
        '{}.h5'.format(data_type))
    create_folder(os.path.dirname(feature_path))

    # Feature extractor
    feature_extractor = LogMelExtractor(
        sample_rate=sample_rate,
        window_size=window_size,
        hop_size=hop_size,
        mel_bins=mel_bins,
        fmin=fmin,
        fmax=fmax)

    # Read metadata
    print('Extracting features of all audio files ...')
    extract_time = time.time()

    if data_type in ['train', 'validate']:
        meta_dict = read_metadata(metadata_path, data_type, mini_data)
    elif data_type == 'evaluate':
        meta_dict = read_evaluate_metadata(audios_dir, mini_data)

    # Hdf5 containing features and targets
    hf = h5py.File(feature_path, 'w')

    hf.create_dataset(
        name='audio_name',
        data=[audio_name.encode() for audio_name in meta_dict['audio_name']],
        dtype='S32')

    if 'fine_target' in meta_dict.keys():
        hf.create_dataset(
            name='fine_target',
            data=meta_dict['fine_target'],
            dtype=np.float32)

    if 'coarse_target' in meta_dict.keys():
        hf.create_dataset(
            name='coarse_target',
            data=meta_dict['coarse_target'],
            dtype=np.float32)

    hf.create_dataset(
        name='feature',
        shape=(0, frames_num, mel_bins),
        maxshape=(None, frames_num, mel_bins),
        dtype=np.float32)

    for (n, audio_name) in enumerate(meta_dict['audio_name']):
        audio_path = os.path.join(audios_dir, audio_name)
        print(n, audio_path)

        # Read audio
        (audio, _) = read_audio(audio_path=audio_path,
                                target_fs=sample_rate)

        # Pad or truncate audio recording
        audio = pad_truncate_sequence(audio, total_samples)

        # Extract feature
        feature = feature_extractor.transform(audio)

        # Remove the extra frames caused by padding zero
        feature = feature[0 : frames_num]

        hf['feature'].resize((n + 1, frames_num, mel_bins))
        hf['feature'][n] = feature

    hf.close()

    print('Write hdf5 file to {} using {:.3f} s'.format(
        feature_path, time.time() - extract_time))
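# `pad_truncate_sequence` is assumed to zero pad or truncate a 1-D waveform
# to a fixed number of samples, as the "Pad or truncate audio recording"
# steps above require; a sketch under that assumption:
import numpy as np

def pad_truncate_sequence(audio, total_samples):
    if len(audio) >= total_samples:
        return audio[0:total_samples]
    return np.concatenate(
        (audio, np.zeros(total_samples - len(audio), dtype=audio.dtype)))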
def create_mixed_audio(args):
    # Arguments & parameters
    dcase2018_task1_dataset_dir = args.dcase2018_task1_dataset_dir
    dcase2018_task2_dataset_dir = args.dcase2018_task2_dataset_dir
    workspace = args.workspace
    scene_type = args.scene_type
    snr = args.snr

    sample_rate = config.sample_rate
    clip_duration = 10.
    audio_len = int(sample_rate * clip_duration)
    random_state = np.random.RandomState(1234)

    # Paths
    yaml_path = os.path.join(workspace, 'yaml_files', 'mixture.yaml')

    out_audio_dir = os.path.join(
        workspace, 'mixed_audios',
        'scene_type={},snr={}'.format(scene_type, snr))
    create_folder(out_audio_dir)

    with open(yaml_path, 'r') as f:
        data = yaml.load(f)

    create_audio_time = time.time()

    for n in range(len(data)):
        if n % 10 == 0:
            logging.info(n)

        if scene_type == 'white_noise':
            scene_audio = random_state.uniform(0., 1., audio_len)
        elif scene_type == 'dcase2018_task1':
            scene_audio_name = data[n]['scene_audio_name']
            scene_audio_path = os.path.join(dcase2018_task1_dataset_dir,
                                            'audio', scene_audio_name)
            (scene_audio, fs) = read_audio(scene_audio_path,
                                           target_fs=sample_rate)

        # Normalize scene audio to -snr dB; with events at 0 dB this
        # realizes the requested SNR
        scene_audio = normalize_to_energy(scene_audio, db=-snr)

        # Reserve space
        events_audio = np.zeros(audio_len)

        for (j, event) in enumerate(data[n]['events']):
            audio_name = event['event_audio_name']
            onset = int(event['onset'] * sample_rate)
            offset = int(event['offset'] * sample_rate)
            audio_path = os.path.join(dcase2018_task2_dataset_dir,
                                      'audio_train', audio_name)
            (audio, fs) = read_audio(audio_path, target_fs=sample_rate)
            audio = normalize_to_energy(audio, db=0.)
            events_audio[onset:offset] = audio[0:offset - onset]

        stereo_audio = np.array((events_audio, scene_audio)).T
        stereo_audio /= np.max(np.abs(stereo_audio))

        out_audio_path = os.path.join(out_audio_dir,
                                      data[n]['mixture_name'])
        write_audio(out_audio_path, stereo_audio, sample_rate)

    logging.info('Write out audio finished! {} s'.format(
        time.time() - create_audio_time))
def create_mixture_yaml(args):
    # Arguments & parameters
    dcase2018_task1_dataset_dir = args.dcase2018_task1_dataset_dir
    dcase2018_task2_dataset_dir = args.dcase2018_task2_dataset_dir
    workspace = args.workspace

    random_state = np.random.RandomState(1234)
    folds = [1, 2, 3, 4]
    mixed_audios_num = 2000
    events_per_clip = 3

    # Paths
    dcase2018_task1_meta = os.path.join(dcase2018_task1_dataset_dir,
                                        'meta.csv')
    dcase2018_task2_meta = os.path.join(
        workspace, 'dcase2018_task2_validate_meta.csv')

    yaml_path = os.path.join(workspace, 'mixture.yaml')
    create_folder(os.path.dirname(yaml_path))

    # Scenes meta
    df_scenes = pd.read_csv(dcase2018_task1_meta, sep='\t')
    scene_names = np.array(df_scenes['filename'])
    random_state.shuffle(scene_names)

    # Events meta
    df_events = pd.read_csv(dcase2018_task2_meta, sep=',')

    count = 0
    data_list = []

    for fold in folds:
        bool_selected = (df_events['fold'] == fold) & \
            (df_events['manually_verified'] == 1)
        event_audio_names = np.array(df_events.fname[bool_selected])
        event_labels = np.array(df_events.label[bool_selected])

        indexes = np.arange(len(event_audio_names))
        repeated_indexes = repeat_indexes(
            indexes, mixed_audios_num * events_per_clip, random_state)

        for n in range(mixed_audios_num):
            if count % 100 == 0:
                print(count)

            current_idxes = repeated_indexes[
                n * events_per_clip:(n + 1) * events_per_clip]

            events = []

            for (j, idx) in enumerate(current_idxes):
                event_audio_name = event_audio_names[idx]
                event_label = event_labels[idx]
                onset = j * 2.5 + 0.5
                audio_path = os.path.join(dcase2018_task2_dataset_dir,
                                          'audio_train', event_audio_name)
                (audio, fs) = read_audio(audio_path)
                audio_duration = len(audio) / float(fs)
                audio_duration = min(audio_duration, 2.0)
                offset = onset + audio_duration

                events.append({
                    'event_audio_name': event_audio_name,
                    'event_label': event_label,
                    'onset': onset,
                    'offset': offset})

            scene_audio_name = scene_names[count].split('/')[1]

            data = {
                'mixture_name': '{:05d}.wav'.format(count),
                'fold': fold,
                'events': events,
                'scene_audio_name': scene_audio_name}

            data_list.append(data)
            count += 1

    with open(yaml_path, 'w') as f:
        f.write(yaml.dump(data_list, default_flow_style=False))