def save_mel(self, wav_path, basename, key_name, mel_path):
    """Compute a log-mel spectrogram of ``wav_path`` and save it as a PNG.

    Skips all work when the target image already exists.

    Args:
        wav_path: str, path of the wav file to read.
        basename: str, identifier used in the plot title and output filename.
        key_name: str, suffix distinguishing several plots of the same file.
        mel_path: str, directory where the PNG image is written.
    """
    # Build the output path once; the original duplicated this format string
    # in the existence check and in savefig.
    out_png = "{}/{}_{}.png".format(mel_path, os.path.basename(basename), key_name)
    if os.path.isfile(out_png):
        return
    (audio, _) = read_audio(wav_path, cfg.sample_rate)
    ham_win = np.hamming(cfg.n_window)
    spec = librosa.stft(
        audio,
        n_fft=cfg.n_window,
        hop_length=cfg.hop_length,
        window=ham_win,
        center=True,
        pad_mode='reflect'
    )
    mel_spec = librosa.feature.melspectrogram(
        S=np.abs(spec),  # amplitude, for energy: spec**2 but don't forget to change amplitude_to_db.
        sr=cfg.sample_rate,
        n_mels=cfg.n_mels,
        fmin=cfg.f_min,
        fmax=cfg.f_max,
        htk=False,
        norm=None)
    mel_spec = librosa.amplitude_to_db(mel_spec)  # 10 * log10(S**2 / ref), ref default is 1
    # mel_spec = mel_spec.T
    mel_spec = mel_spec.astype(np.float32)
    # The data is mel-scaled, so the frequency axis must be labeled 'mel';
    # the original passed y_axis='hz', which mislabels the axis ticks.
    librosa.display.specshow(mel_spec, x_axis='frames', y_axis='mel')
    plt.title("{}_{}".format(basename, key_name))
    cb = plt.colorbar()
    cb.set_label("db")
    plt.savefig(out_png)
    plt.close()
    print("save_mel:{}".format(basename))
def extract_features_from_meta(self, csv_audio, feature_dir, subpart_data=None):
    """Extract log mel spectrogram features.

    Args:
        csv_audio : str, file containing names, durations and labels:
            (name, start, end, label, label_index); the associated
            wav_filename is Yname_start_end.wav.
        feature_dir: str, the path to the directory where the features are.
        subpart_data: int, number of files to extract features from the csv.

    Returns:
        pandas.DataFrame: the metadata, with rows whose wav file is missing
        dropped, and the index reset.
    """
    t1 = time.time()
    df_meta = self.get_df_from_meta(csv_audio, subpart_data)
    LOG.info("{} Total file number: {}".format(
        csv_audio, len(df_meta.filename.unique())))
    # The audio directory depends only on the csv, so resolve it once
    # instead of once per file (loop-invariant in the original).
    wav_dir = self.get_audio_dir_path_from_meta(csv_audio)
    for ind, wav_name in enumerate(df_meta.filename.unique()):
        if ind % 500 == 0:
            LOG.debug(ind)
        wav_path = os.path.join(wav_dir, wav_name)
        out_filename = os.path.join(feature_dir, name_only(wav_name) + ".npy")
        # Features are cached on disk: skip files already extracted.
        if not os.path.exists(out_filename):
            if not os.path.isfile(wav_path):
                LOG.error(
                    "File %s is in the csv file but the feature is not extracted!"
                    % wav_path)
                # Drop metadata rows whose audio is missing so callers only
                # see entries with usable features.
                df_meta = df_meta.drop(
                    df_meta[df_meta.filename == wav_name].index)
            else:
                (audio, _) = read_audio(wav_path, cfg.sample_rate)
                if audio.shape[0] == 0:
                    print("File %s is corrupted!" % wav_path)
                else:
                    mel_spec = self.calculate_mel_spec(
                        audio, log_feature=self.save_log_feature)
                    np.save(out_filename, mel_spec)
    LOG.debug("compute features time: %s" % (time.time() - t1))
    return df_meta.reset_index(drop=True)
def get_features(self, wav_path, feature_dir, frames):
    """Compute mel features for one wav file and save them in fixed-size segments.

    The spectrogram is truncated so its length is a multiple of ``frames``
    (or padded up to one segment when shorter), then each segment is saved
    as its own .npy file in ``feature_dir``.

    Args:
        wav_path: str, path of the wav file to read.
        feature_dir: str, directory in which the segment files are saved.
        frames: int, number of frames per saved segment.

    Returns:
        tuple: (audio, cnt_max) — the raw audio samples and the number of
        segments written to disk.
    """
    (audio, _) = read_audio(wav_path, cfg.sample_rate)
    mel_spec = self.calculate_mel_spec(audio, log_feature=self.save_log_feature)
    # Trunc the data so it is a multiple of frames. Just change the nb of frames
    # if you want padding instead
    if frames > mel_spec.shape[0]:
        # Shorter than one segment: pad up to exactly one segment.
        pad_trunc_length = frames
    else:
        pad_trunc_length = mel_spec.shape[0] - mel_spec.shape[0] % frames
    mel_spec = pad_trunc_seq(mel_spec, pad_trunc_length)
    # Reshape in multiple segments and save them.
    # (The original also accumulated the output paths in an `out_filenames`
    # list that was never used or returned — dead local, removed.)
    mel_spec_frames = mel_spec.reshape(-1, frames, mel_spec.shape[-1])
    wav_name = os.path.basename(wav_path)
    for cnt, sample in enumerate(mel_spec_frames):
        out_filename = os.path.join(feature_dir, name_only(wav_name)) + "fr" + str(frames) + "_" + \
            str(cnt * frames) + "-" + str((cnt + 1) * frames) + ".npy"
        np.save(out_filename, sample)
    cnt_max = len(mel_spec_frames)
    return audio, cnt_max
def extract_features_from_meta(self, csv_audio, subpart_data=None, training=False):
    """Extract log mel spectrogram features.

    Args:
        csv_audio : str, file containing names, durations and labels:
            (name, start, end, label, label_index); the associated
            wav_filename is Yname_start_end.wav.
        subpart_data: int, number of files to extract features from the csv.
        training: bool, when True additional augmented versions of each
            signal are produced (every augmentation is currently disabled).

    Returns:
        pandas.DataFrame: metadata for every (file, augmentation) pair with
        a 'feature_filename' column pointing at the saved .npy features.
    """
    t1 = time.time()
    df_meta = self.get_df_from_meta(csv_audio, subpart_data)
    df_all = list()
    feature_fns = list()
    LOG.info('Extracting/loading features')
    LOG.info("{} Total file number: {}".format(
        csv_audio, len(df_meta.filename.unique())))
    augmentation_funcs = [
        ('orig', None),  # original signal
    ]
    if training:
        augmentation_funcs += [
            # ('lpf4k', partial(lpf, wc=4000, fs=cfg.sample_rate)),
            # ('lpf8k', partial(lpf, wc=8000, fs=cfg.sample_rate)),
            # ('lpf16k', partial(lpf, wc=16000, fs=cfg.sample_rate)),
            # ('ps-6', partial(pitch_shift, sr=cfg.sample_rate, n_steps=-6)),
            # ('ps-3', partial(pitch_shift, sr=cfg.sample_rate, n_steps=-3)),
            # ('ps+3', partial(pitch_shift, sr=cfg.sample_rate, n_steps=3)),
            # ('ps+6', partial(pitch_shift, sr=cfg.sample_rate, n_steps=6)),
            # ('ts1.25', partial(time_stretch, rate=1.25)),
            # ('ts1.5', partial(time_stretch, rate=1.5)),
            # ('amp0.5', partial(amplitude_scale, coeff=0.5)),
            # ('amp0.75', partial(amplitude_scale, coeff=0.75)),
            # ('hp0.25', partial(hp_reweight, lam=0.25)),
            # ('hp0.75', partial(hp_reweight, lam=0.75))
        ]
    wav_fns = df_meta.filename.unique()
    for ind, wav_name in tqdm(enumerate(wav_fns), total=len(wav_fns)):
        if ind % 500 == 0:
            LOG.debug(ind)
        # verify the audio file is present
        wav_dir = self.get_audio_dir_path_from_meta(csv_audio)
        wav_path = os.path.join(wav_dir, wav_name)
        if os.path.isfile(wav_path):
            # defer loading audio until the need for feature extraction is verified
            audio = None
            # perform all augmentations (including no augmentation)
            for name, func in augmentation_funcs:
                if name == 'orig':
                    out_filename = os.path.splitext(wav_name)[0] + ".npy"
                else:
                    out_filename = os.path.splitext(
                        wav_name)[0] + '_' + name + ".npy"
                out_path = os.path.join(self.feature_dir, out_filename)
                # add the metadata
                meta = df_meta.loc[df_meta.filename == wav_name]
                df_all.append(meta)
                # For synthetic data with time annotation of events, the meta
                # df has several entries per wav file, so the feature filename
                # is appended len(meta) times (the original special-cased
                # len(meta) > 1 behind a dead `flag` debug print; this single
                # statement is equivalent).
                feature_fns += [out_filename] * len(meta)
                if not os.path.exists(out_path):
                    if audio is None:
                        (audio, _) = read_audio(wav_path, cfg.sample_rate)
                    if audio.shape[0] == 0:
                        print("File %s is corrupted!" % wav_path)
                        # Roll back everything appended for this file and skip
                        # it entirely. The original deleted only ONE
                        # feature_fns entry even when len(meta) > 1, then kept
                        # extracting features from the empty signal.
                        del feature_fns[-len(meta):]
                        del df_all[-1]
                        break
                    # perform any augmentation, extract features, save features
                    if func is not None:
                        mel_spec = self.calculate_mel_spec(func(audio))
                    else:
                        mel_spec = self.calculate_mel_spec(audio)
                    np.save(out_path, mel_spec)
                    LOG.debug("compute features time: %s" % (time.time() - t1))
        else:
            LOG.error(
                "File %s is in the csv file but the feature is not extracted!"
                % wav_path)
            # df_meta = df_meta.drop(df_meta[df_meta.filename == wav_name].index)
    # form the final DataFrame of meta data for features from original and augmented audio
    df_all = pd.concat(df_all).reset_index(drop=True)
    df_all['feature_filename'] = feature_fns
    return df_all
def extract_features_from_meta_segment(self, csv_audio, feature_dir, subpart_data=None, fixed_segment=None):
    """Extract log mel spectrogram features from a strongly labeled csv.

    One feature file is saved per labeled event segment (not per wav file),
    and a companion csv listing the segment features is written next to the
    metadata; when that csv already exists the whole computation is skipped.

    Args:
        csv_audio : str, file containing names, durations and labels:
            (name, start, end, label, label_index); the associated
            wav_filename is Yname_start_end.wav.
        feature_dir: str, the path of the features directory.
        subpart_data: int, number of files to extract features from the csv.
        fixed_segment: float, in seconds, the size of the kept segment.
            If >audio length, the audio length is kept. If segment is True,
            and >label, it takes the surrounding (allow creating weak labels).

    Returns:
        pandas.DataFrame: one row per saved segment, with columns
        raw_filename, filename and event_labels.
    """
    t1 = time.time()
    df_meta = self.get_df_from_meta(csv_audio, subpart_data)
    self.get_classes(df_meta)
    LOG.info("{} Total file number: {}".format(
        csv_audio, len(df_meta.filename.unique())))
    # The features-csv name encodes subpart_data and fixed_segment so that
    # runs with different settings do not collide on disk.
    ext_name = "_segment_"
    if subpart_data:
        ext_name += str(subpart_data)
    if fixed_segment is not None:
        LOG.debug(
            f" durations before: "
            f"{df_meta.groupby('event_label').apply(lambda x: (x.offset - x.onset).mean())}"
        )
        ext_name += f"fix{fixed_segment}"
        df_meta = self.trunc_pad_segment(df_meta, fixed_segment)
        LOG.debug(
            f" durations after: "
            f"{df_meta.groupby('event_label').apply(lambda x: (x.offset - x.onset).mean())}"
        )
    meta_base, meta_ext = os.path.splitext(csv_audio.split("/")[-1])
    csv_features = os.path.join(self.metadata_dir,
                                meta_base + ext_name + meta_ext)
    wav_dir = self.get_audio_dir_path_from_meta(csv_audio)
    df_features = pd.DataFrame()
    path_exists = os.path.exists(csv_features)
    if not path_exists:
        # Loop in all the filenames
        for ind, wav_name in enumerate(df_meta.filename.unique()):
            if ind % 500 == 0:
                LOG.debug(ind)
            wav_path = os.path.join(wav_dir, wav_name)
            if not os.path.isfile(wav_path):
                LOG.error(
                    "File %s is in the csv file but the feature is not extracted, deleting...!"
                    % wav_path)
                df_meta = df_meta.drop(
                    df_meta[df_meta.filename == wav_name].index)
            else:
                # Probe duration first to detect unreadable/corrupted files
                # without decoding the whole signal.
                try:
                    audio_len_sec = soundfile.info(wav_path).duration
                except Exception as e:
                    print("File %s is corrupted, not added to df!" % wav_path)
                    print(e)
                    continue
                if audio_len_sec == 0:
                    print("File %s is corrupted, not added to df!" % wav_path)
                else:
                    files_exist = True
                    # How many features we can compute from this file ?
                    sub_df = df_meta[df_meta.filename == wav_name]
                    cnt_max = len(sub_df)
                    if cnt_max == 0:
                        # NOTE(review): this `break` aborts the WHOLE loop as
                        # soon as one file has no rows; `continue` (skip this
                        # file only) looks like the intent — confirm.
                        break
                    base_wav_name = name_only(wav_name)
                    ext_featname = "_seg"
                    if fixed_segment:
                        ext_featname += f"fix{fixed_segment}"
                        files_exist = False  # We should always recompute because of the randomness of onset offset
                    # Check if files already exist
                    out_filenames = [
                        base_wav_name + ext_featname + str(cnt) + ".npy"
                        for cnt in range(cnt_max)
                    ]
                    for fname in out_filenames:
                        fpath = os.path.join(feature_dir, fname)
                        if not os.path.exists(fpath):
                            files_exist = False
                            break
                    add_item = {
                        "raw_filename": [],
                        "filename": [],
                        "event_labels": []
                    }
                    # One feature file per labeled row; the row index ii is
                    # appended to the name to keep segments distinct.
                    for ii, (i, row) in enumerate(sub_df.iterrows()):
                        if not pd.isna(row.event_label):
                            if ii > 0:
                                extnb = str(ii)
                            else:
                                extnb = ""
                            out_filename = os.path.join(
                                feature_dir, name_only(wav_name))
                            out_filename += ext_featname + extnb + ".npy"
                            if not files_exist:
                                # onset/offset are in seconds; convert to
                                # sample indices at the file's native rate.
                                sr = soundfile.info(wav_path).samplerate
                                (audio, _) = read_audio(wav_path,
                                                        cfg.sample_rate,
                                                        start=int(row.onset * sr),
                                                        stop=int(row.offset * sr))
                                mel_spec = self.calculate_mel_spec(
                                    audio, log_feature=self.save_log_feature)
                                if fixed_segment:
                                    # Pad/trunc to the frame count implied by
                                    # fixed_segment seconds at the hop length.
                                    pad_trunc_length = int(
                                        fixed_segment * cfg.sample_rate // cfg.hop_length)
                                    mel_spec = pad_trunc_seq(
                                        mel_spec, pad_trunc_length)
                                np.save(out_filename, mel_spec)
                            add_item["raw_filename"].append(wav_name)
                            add_item["filename"].append(out_filename)
                            add_item["event_labels"].append(
                                row["event_label"])
                    # NOTE(review): DataFrame.append is removed in pandas 2.x;
                    # pd.concat would be the modern equivalent.
                    df_features = df_features.append(
                        pd.DataFrame(add_item), ignore_index=True)
        df_features.to_csv(csv_features, sep="\t", header=True, index=False)
        df_features = pd.read_csv(
            csv_features, sep="\t")  # Otherwise event_labels is "" and not NaN
    else:
        df_features = self.get_df_from_meta(
            csv_features)  # No subpart data because should be in the name
    LOG.debug("compute features time: %s" % (time.time() - t1))
    return df_features