def process_syllable(syl, hparams, mel_basis, debug):
    # Skip silences
    syl_len = len(syl)
    if syl_len == 0:
        return None, None, None
    if np.max(syl) == 0:
        return None, None, None
    # If too long skip, else pad
    if syl_len > hparams.chunk_len_samples:
        return None, None, None
    else:
        syl_pad = np.zeros(hparams.chunk_len_samples)
        syl_pad[:syl_len] = syl
    # Normalise
    sn = syl_pad / np.max(syl_pad)
    # convert to float if needed
    if np.issubdtype(type(sn[0]), np.integer):
        sn = int16_to_float32(sn)
    # create spec
    mS, debug_info = spectrogram_sp(
        y=sn,
        sr=hparams.sr,
        n_fft=hparams.n_fft,
        win_length=hparams.win_length_samples,
        hop_length=hparams.hop_length_samples,
        ref_level_db=hparams.ref_level_db,
        _mel_basis=mel_basis,
        pre_emphasis=hparams.preemphasis,
        power=hparams.power,
        debug=debug,
    )
    return sn, mS, debug_info
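
# Usage sketch for process_syllable. The mel-basis construction below is an
# assumption (spectrogram_sp may expect a different orientation), and
# hparams.num_mels is a hypothetical field name:
def _example_process_syllable(syl, hparams):
    import librosa

    mel_basis = librosa.filters.mel(
        sr=hparams.sr, n_fft=hparams.n_fft, n_mels=hparams.num_mels
    )
    sn, mS, debug_info = process_syllable(syl, hparams, mel_basis, debug=False)
    if sn is None:
        print("syllable skipped (empty, silent, or longer than chunk_len_samples)")
    return mS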
def prepare_wav(wav_loc, hparams=None):
    """ load wav and convert to correct format """
    # get rate and data
    rate, data = load_wav(wav_loc)
    # convert data if needed
    if np.issubdtype(type(data[0]), np.integer):
        data = int16_to_float32(data)
    if hparams is not None:
        # bandpass filter
        data = butter_bandpass_filter(
            data, hparams.butter_lowcut, hparams.butter_highcut, rate, order=5
        )
        # reduce noise
        if hparams.reduce_noise:
            data = nr.reduce_noise(
                audio_clip=data, noise_clip=data, **hparams.noise_reduce_kwargs
            )
    return rate, data
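
# A minimal sketch of the hparams object prepare_wav expects; the field names
# mirror the attribute accesses above, but every value here is illustrative:
from types import SimpleNamespace

example_hparams = SimpleNamespace(
    butter_lowcut=500,       # bandpass low cut in Hz (illustrative)
    butter_highcut=15000,    # bandpass high cut in Hz (illustrative)
    reduce_noise=False,      # set True to run noisereduce
    noise_reduce_kwargs={},  # forwarded to nr.reduce_noise when enabled
)
# rate, data = prepare_wav("bird_song.wav", hparams=example_hparams)  # hypothetical path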
def make_spec(
    syll_wav,
    fs,
    hparams,
    mel_matrix=None,
    use_tensorflow=False,
    use_mel=True,
    return_tensor=False,
    norm_uint8=False,
):
    """ create a spectrogram from a waveform """
    if use_tensorflow:
        import tensorflow as tf
        from avgn.signalprocessing.spectrogramming_tf import spectrogram_tensorflow
    # convert to float if needed
    if np.issubdtype(type(syll_wav[0]), np.integer):
        syll_wav = int16_to_float32(syll_wav)
    # create spec
    if use_tensorflow:
        spec = spectrogram_tensorflow(syll_wav, fs, hparams)
        if use_mel:
            spec = tf.transpose(tf.tensordot(spec, mel_matrix, 1))
        if not return_tensor:
            spec = spec.numpy()
    else:
        spec = spectrogram(syll_wav, fs, hparams)
        if use_mel:
            spec = np.dot(spec.T, mel_matrix).T
    if norm_uint8:
        spec = (norm(spec) * 255).astype("uint8")
    return spec
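
# Usage sketch for the numpy path of make_spec. The mel matrix orientation
# (freq bins x mels) is inferred from the np.dot(spec.T, mel_matrix) call
# above; n_mels is illustrative:
def _example_make_spec(syll_wav, fs, hparams, n_mels=64):
    import librosa

    # librosa returns (n_mels, 1 + n_fft // 2); transpose so the dot product lines up
    mel_matrix = librosa.filters.mel(sr=fs, n_fft=hparams.n_fft, n_mels=n_mels).T
    return make_spec(syll_wav, fs, hparams, mel_matrix=mel_matrix, use_mel=True)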
def get_element(datafile, indv=None, element_number=1, element="syllable", hparams=None):
    # if an individual isn't specified, grab the first one
    if indv is None:
        indv = datafile.indvs[0]
    # get the element dictionary
    element_dict = datafile.data["indvs"][indv][element]
    # get the part of the wav we want to load
    st = element_dict["start_times"][element_number]
    et = element_dict["end_times"][element_number]
    # load the data
    rate, element_data = load_wav(
        datafile.data["wav_loc"], offset=st, duration=et - st, sr=None
    )
    # convert data if needed
    if np.issubdtype(type(element_data[0]), np.integer):
        element_data = int16_to_float32(element_data)
    # bandpass filter
    if hparams is not None:
        element_data = butter_bandpass_filter(
            element_data, hparams.butter_lowcut, hparams.butter_highcut, rate, order=5
        )
    return rate, element_data
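
# Usage sketch for get_element (datafile is a DataFile-like object exposing
# .indvs and .data as accessed above; the element name follows the default):
def _example_element_duration(datafile):
    rate, elem = get_element(datafile, element_number=1, element="syllable")
    print("element duration: {:.3f} s".format(len(elem) / float(rate)))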
def prepare_wav(wav_loc, hparams, debug):
    """ load wav and convert to correct format """
    if debug:
        debug_data = {}
    else:
        debug_data = None
    # load data at the target sample rate (librosa also returns the rate)
    data, _ = librosa.load(wav_loc, sr=hparams.sr)
    # convert data if needed
    if np.issubdtype(type(data[0]), np.integer):
        data = int16_to_float32(data)
    # split into chunks to avoid memory issues
    len_chunk_minutes = 10
    len_chunk_sample = hparams.sr * 60 * len_chunk_minutes
    data_chunks = []
    for t in range(0, len(data), len_chunk_sample):
        start = t
        end = min(len(data), t + len_chunk_sample)
        data_chunks.append(data[start:end])
        # only keep one chunk for debug
        if debug:
            break
    # bandpass filter
    data_cleaned = []
    if hparams is not None:
        for chunk in data_chunks:
            if debug:
                debug_data['x'] = chunk
            chunk = butter_bandpass_filter(
                chunk, hparams.butter_lowcut, hparams.butter_highcut, hparams.sr, order=5
            )
            if debug:
                debug_data['x_filtered'] = chunk
            # reduce noise
            if hparams.reduce_noise:
                chunk = nr.reduce_noise(
                    audio_clip=chunk, noise_clip=chunk, **hparams.noise_reduce_kwargs
                )
                if debug:
                    debug_data['x_rn'] = chunk
            data_cleaned.append(chunk)
    else:
        data_cleaned = data_chunks
    # concatenate chunks
    data = np.concatenate(data_cleaned)
    return data, debug_data
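
# Usage sketch for the debug path: with debug=True only the first 10-minute
# chunk is kept, and the raw/filtered/noise-reduced signals come back for
# inspection ("recording.wav" is a hypothetical path):
def _example_debug_prepare_wav(hparams):
    data, debug_data = prepare_wav("recording.wav", hparams, debug=True)
    if debug_data is not None:
        for key in ('x', 'x_filtered', 'x_rn'):
            if key in debug_data:
                print(key, debug_data[key].shape)
    return data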
def subset_syllables(json_dict, indv, unit="syllables", hparams=None, include_labels=True):
    """ Grab syllables from wav data """
    if isinstance(indv, list):
        indv = indv[0]
    if not isinstance(json_dict, OrderedDict):
        json_dict = read_json(json_dict)
    # get unit info
    start_times = json_dict["indvs"][indv][unit]["start_times"]
    # "stop_times" vs "end_times" is a quick fix that should be fixed on the parsing side
    if "end_times" in json_dict["indvs"][indv][unit].keys():
        end_times = json_dict["indvs"][indv][unit]["end_times"]
    else:
        end_times = json_dict["indvs"][indv][unit]["stop_times"]
    if include_labels:
        labels = json_dict["indvs"][indv][unit]["labels"]
    else:
        labels = None
    # get rate and data
    rate, data = load_wav(json_dict["wav_loc"])
    # convert data if needed
    if np.issubdtype(type(data[0]), np.integer):
        data = int16_to_float32(data)
    if hparams is not None:
        # bandpass filter
        data = butter_bandpass_filter(
            data, hparams.butter_lowcut, hparams.butter_highcut, rate, order=5
        )
        # reduce noise
        if hparams.reduce_noise:
            data = nr.reduce_noise(
                audio_clip=data, noise_clip=data, **hparams.noise_reduce_kwargs
            )
    # slice out each unit
    syllables = [
        data[int(st * rate):int(et * rate)] for st, et in zip(start_times, end_times)
    ]
    return syllables, rate, labels
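
# Usage sketch for subset_syllables ("bird0.json" is a hypothetical
# annotation file; the individual name must exist under the JSON's "indvs"):
def _example_subset_syllables(hparams):
    syllables, rate, labels = subset_syllables(
        "bird0.json", indv="Bird0", unit="syllables", hparams=hparams
    )
    print("{} syllables at {} Hz".format(len(syllables), rate))
    return syllables, labels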
def process_bird_wav(
    bird,
    wav_info,
    wav_time,
    params,
    save_to_folder,
    visualize=False,
    skip_created=False,
    seconds_timeout=300,
    save_spectrograms=True,
    verbose=False,
):
    """ splits a wav file into periods of silence and periods of sound based on params """
    # Load up the WAV
    rate, data = load_wav(wav_info)
    params["sample_rate"] = rate
    if rate is None or data is None:
        return
    # bandpass filter
    data = butter_bandpass_filter(
        data.astype("float32"), params["lowcut"], params["highcut"], rate, order=2
    )
    data = float32_to_int16(data)
    # we only want one channel
    if len(np.shape(data)) == 2:
        data = data[:, 0]
    # threshold the (root mean squared of the) audio
    rms_data, sound_threshed = RMS(
        data,
        rate,
        params["rms_stride"],
        params["rms_window"],
        params["rms_padding"],
        params["noise_thresh"],
    )
    # Find the onsets/offsets of sound
    onset_sounds, offset_sounds = detect_onsets_offsets(
        np.repeat(sound_threshed, int(params["rms_stride"] * rate)),
        threshold=0,
        min_distance=0,
    )
    # make sure all onset sounds are at least zero (due to downsampling in RMS)
    onset_sounds[onset_sounds < 0] = 0
    # threshold clips of sound
    for onset_sound, offset_sound in zip(onset_sounds, offset_sounds):
        # segment the clip
        clip = data[onset_sound:offset_sound]
        # if the clip is thresholded as noise, do not save it into the dataset
        # bin width in Hz of spectrogram
        freq_step_size_Hz = (rate / 2) / params["num_freq"]
        bout_spec = threshold_clip(
            clip, rate, freq_step_size_Hz, params, visualize=visualize, verbose=verbose
        )
        if bout_spec is None:
            # visualize spectrogram if desired
            if visualize:
                # compute spectrogram of clip
                wav_spectrogram = spectrogram(int16_to_float32(clip), params)
                visualize_spec(wav_spectrogram, show=True)
            continue
        # determine the datetime of this clip
        start_time = wav_time + timedelta(seconds=onset_sound / float(rate))
        time_string = start_time.strftime("%Y-%m-%d_%H-%M-%S-%f")
        # create a subfolder for the individual bird if it doesn't already exist
        bird_folder = Path(save_to_folder).resolve() / bird
        ensure_dir(bird_folder)
        # save data
        save_bout_wav(data, rate, bird_folder, bird, wav_info, time_string, skip_created)
        # save the spectrogram of the data
        if save_spectrograms:
            save_bout_spec(bird_folder, bout_spec, time_string, skip_created)
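
# A sketch of the params dict consumed by process_bird_wav and threshold_clip
# (keys collected from the lookups above; the spectrogram call may require
# additional fields, and every value here is illustrative, not a recommended
# setting):
example_params = {
    "lowcut": 500,                   # bandpass low cut (Hz)
    "highcut": 15000,                # bandpass high cut (Hz)
    "rms_stride": 0.01,              # RMS stride (s)
    "rms_window": 0.05,              # RMS window (s)
    "rms_padding": 0.1,              # padding around detected sound (s)
    "noise_thresh": 0.01,            # silence threshold on normalized power
    "num_freq": 512,                 # spectrogram frequency bins
    "min_segment_length_s": 0.5,     # shortest clip kept (s)
    "max_segment_length_s": 10.0,    # longest clip kept (s)
    "vocal_range_Hz": (1000, 8000),  # expected vocalization band (Hz)
    "min_silence_pct": 0.05,         # minimum fraction of silent frames
}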
def threshold_clip(clip, rate, freq_step_size_Hz, params, visualize=False, verbose=False):
    """ determines if a clip is a bout or noise, based on threshold parameters """
    # get the length of the segment
    segment_length = len(clip) / float(rate)
    # return if the clip is the wrong length
    if segment_length <= params["min_segment_length_s"]:
        if verbose:
            print("Segment length {} less than minimum of {}".format(
                segment_length, params["min_segment_length_s"]))
        return
    if segment_length >= params["max_segment_length_s"]:
        if verbose:
            print("Segment length {} greater than maximum of {}".format(
                segment_length, params["max_segment_length_s"]))
        return
    # compute spectrogram of clip
    wav_spectrogram = spectrogram(int16_to_float32(clip), params)
    # determine the power of the spectral envelope
    norm_power = np.mean(wav_spectrogram, axis=0)
    norm_power = (norm_power - np.min(norm_power)) / (
        np.max(norm_power) - np.min(norm_power)
    )
    # get the maximum power region of the frequency envelope
    peak_power_Hz = np.argmax(norm_power) * freq_step_size_Hz
    # threshold for the location of peak power
    if peak_power_Hz < params["vocal_range_Hz"][0]:
        if verbose:
            print("Peak power {} Hz less than minimum of {}".format(
                peak_power_Hz, params["vocal_range_Hz"][0]))
        return
    # threshold based on silence
    vocal_power = zero_one_norm(
        np.sum(
            wav_spectrogram[
                :,
                int(params["vocal_range_Hz"][0] / freq_step_size_Hz):int(
                    params["vocal_range_Hz"][1] / freq_step_size_Hz
                ),
            ],
            axis=1,
        )
    )
    # the percent of the spectrogram below the noise threshold
    pct_silent = np.sum(vocal_power <= params["noise_thresh"]) / float(len(vocal_power))
    if pct_silent < params["min_silence_pct"]:
        if verbose:
            print("Percent silent {} less than minimum of {}".format(
                pct_silent, params["min_silence_pct"]))
        return
    if visualize:
        visualize_spec(wav_spectrogram, show=True)
    return wav_spectrogram
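
# End-to-end usage sketch tying process_bird_wav and threshold_clip together
# (paths and the bird name are hypothetical; wav_time must be a datetime
# marking the start of the recording):
def _example_segment_recording(params):
    from datetime import datetime

    process_bird_wav(
        bird="Bird0",
        wav_info="recordings/bird0.wav",   # hypothetical input path
        wav_time=datetime(2020, 1, 1, 6, 0, 0),
        params=params,
        save_to_folder="segmented_bouts",  # hypothetical output folder
        save_spectrograms=True,
        verbose=True,
    )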