def shorter_waveform_to_examples(data):
    """Split one short audio clip into log mel spectrogram examples.

    The samples are handed to mel_features.log_mel_spectrogram unchanged: no
    mono conversion and no resampling happens in this function, so the caller
    is presumably expected to supply audio already at params.SAMPLE_RATE
    (TODO confirm against callers).

    Args:
      data: Audio samples of a single short clip.

    Returns:
      The result of mel_features.frame applied to the clip's log mel
      spectrogram, using a window of params.EXAMPLE_WINDOW_SECONDS and a hop
      of params.EXAMPLE_HOP_SECONDS (both converted to spectrogram frames).
    """
    # Log mel spectrogram of the whole clip; the number of mel bins is taken
    # from params.NUM_MEL_BINS.
    spectrogram = mel_features.log_mel_spectrogram(
        data,
        audio_sample_rate=params.SAMPLE_RATE,
        log_offset=params.LOG_OFFSET,
        window_length_secs=params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=params.NUM_MEL_BINS,
        lower_edge_hertz=params.MEL_MIN_HZ,
        upper_edge_hertz=params.MEL_MAX_HZ)

    # Spectrogram frames produced per second of audio.
    frames_per_second = 1.0 / params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(
        round(params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(round(params.EXAMPLE_HOP_SECONDS * frames_per_second))

    return mel_features.frame(
        spectrogram,
        window_length=window_frames,
        hop_length=hop_frames)
def wavfile_to_examples(wav_file):
    """Read a 16-bit WAV file and convert it to log mel spectrogram examples.

    Args:
      wav_file: Anything accepted by wavfile.read (path or file-like object).

    Returns:
      The result of mel_features.frame on the file's log mel spectrogram;
      presumably an array of shape [num_examples, num_frames, num_bands]
      (confirm against mel_features).

    Raises:
      AssertionError: If the decoded samples are not int16.
    """
    sample_rate, wav_data = wavfile.read(wav_file)
    assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype

    # Scale int16 PCM into [-1.0, +1.0].
    samples = wav_data / 32768.0

    # Average axis 1 away when the file has more than one dimension.
    if len(samples.shape) > 1:
        samples = np.mean(samples, axis=1)

    # Bring the audio to the module-level SAMPLE_RATE.
    if sample_rate != SAMPLE_RATE:
        samples = resampy.resample(samples, sample_rate, SAMPLE_RATE)

    spectrogram = mel_features.log_mel_spectrogram(
        samples,
        audio_sample_rate=SAMPLE_RATE,
        log_offset=LOG_OFFSET,
        window_length_secs=STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=NUM_MEL_BINS,
        lower_edge_hertz=MEL_MIN_HZ,
        upper_edge_hertz=MEL_MAX_HZ)

    # Express the example window and hop in spectrogram frames.
    frames_per_second = 1.0 / STFT_HOP_LENGTH_SECONDS
    window_frames = int(round(EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(round(EXAMPLE_HOP_SECONDS * frames_per_second))

    return mel_features.frame(
        spectrogram,
        window_length=window_frames,
        hop_length=hop_frames)
def waveform_to_examples(data, sample_rate):
    """Convert an audio waveform into log mel spectrogram examples.

    NOTE(review): `sample_rate` is accepted but never read in this function,
    and no resampling is performed. The spectrogram is always computed as if
    the audio were at vggish_params.SAMPLE_RATE, so callers presumably
    resample beforehand -- confirm.

    Args:
      data: Array of samples. When it has more than one dimension, axis 1 is
        averaged away.
      sample_rate: Unused.

    Returns:
      The result of mel_features.frame on the log mel spectrogram; presumably
      an array of shape [num_examples, num_frames, num_bands] (confirm
      against mel_features).
    """
    # Collapse multi-channel input by averaging axis 1.
    mono = np.mean(data, axis=1) if len(data.shape) > 1 else data

    spectrogram = mel_features.log_mel_spectrogram(
        mono,
        audio_sample_rate=vggish_params.SAMPLE_RATE,
        log_offset=vggish_params.LOG_OFFSET,
        window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=vggish_params.NUM_MEL_BINS,
        lower_edge_hertz=vggish_params.MEL_MIN_HZ,
        upper_edge_hertz=vggish_params.MEL_MAX_HZ)

    # Express the example window and hop in spectrogram frames.
    frames_per_second = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(
        round(vggish_params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(
        round(vggish_params.EXAMPLE_HOP_SECONDS * frames_per_second))

    return mel_features.frame(
        spectrogram,
        window_length=window_frames,
        hop_length=hop_frames)
def preprocess_sound(data, sample_rate):
    """Convert an audio waveform into log mel spectrogram examples.

    Args:
      data: Array of samples. When it has more than one dimension, axis 1 is
        averaged away.
      sample_rate: Sample rate of `data`; the audio is resampled with resampy
        when it differs from the module-level SAMPLE_RATE.

    Returns:
      The result of mel_features.frame on the log mel spectrogram; presumably
      an array of shape [num_examples, num_frames, num_bands] (confirm
      against mel_features).
    """
    samples = data
    if len(samples.shape) > 1:
        samples = np.mean(samples, axis=1)

    # Bring the audio to the module-level SAMPLE_RATE.
    if sample_rate != SAMPLE_RATE:
        samples = resampy.resample(samples, sample_rate, SAMPLE_RATE)

    spectrogram = mel_features.log_mel_spectrogram(
        samples,
        audio_sample_rate=SAMPLE_RATE,
        log_offset=LOG_OFFSET,
        window_length_secs=STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=NUM_MEL_BINS,
        lower_edge_hertz=MEL_MIN_HZ,
        upper_edge_hertz=MEL_MAX_HZ)

    # Express the example window and hop in spectrogram frames.
    frames_per_second = 1.0 / STFT_HOP_LENGTH_SECONDS
    window_frames = int(round(EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(round(EXAMPLE_HOP_SECONDS * frames_per_second))

    return mel_features.frame(
        spectrogram,
        window_length=window_frames,
        hop_length=hop_frames)
def _waveform_to_mel_spectrogram_segments(data, sample_rate):
    """Convert an audio waveform into log mel spectrogram segments.

    Args:
      data: np.array of samples, one-dimensional (mono) or two-dimensional
        (multi-channel). For two-dimensional input axis 1 is averaged away.
        Samples are generally expected to lie in [-1.0, +1.0], although this
        is not enforced.
      sample_rate: Sample rate of `data`; the audio is resampled with resampy
        when it differs from mel_params.SAMPLE_RATE.

    Returns:
      3-D np.array of shape [num_examples, num_frames, num_bands]: a sequence
      of log mel spectrogram patches, each covering num_frames frames and
      num_bands mel bands, where the frame hop is
      mel_params.STFT_HOP_LENGTH_SECONDS.

      IMPORTANT: when the audio is too short to fill a single example the
      returned array has shape[0] == 0. A warning is printed, but the empty
      array is still returned as-is, so downstream code has to check for it.
    """
    samples = data

    # Average axis 1 away for multi-channel input.
    if len(samples.shape) > 1:
        samples = np.mean(samples, axis=1)

    # Bring the audio to the rate the mel parameters assume.
    if sample_rate != mel_params.SAMPLE_RATE:
        samples = resampy.resample(
            samples, sample_rate, mel_params.SAMPLE_RATE)

    spectrogram = log_mel_spectrogram(
        samples,
        audio_sample_rate=mel_params.SAMPLE_RATE,
        log_offset=mel_params.LOG_OFFSET,
        window_length_secs=mel_params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=mel_params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=mel_params.NUM_MEL_BINS,
        lower_edge_hertz=mel_params.MEL_MIN_HZ,
        upper_edge_hertz=mel_params.MEL_MAX_HZ)

    # Express the example window and hop in spectrogram frames.
    frames_per_second = 1.0 / mel_params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(
        round(mel_params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(
        round(mel_params.EXAMPLE_HOP_SECONDS * frames_per_second))

    segments = frame(
        spectrogram,
        window_length=window_frames,
        hop_length=hop_frames)

    # Only warn here; the empty result is handed back unchanged.
    if segments.shape[0] == 0:
        print('\nWARNING: audio sample too short! Using all zeros for that example.\n')

    return segments
def waveform_to_examples(data, sample_rate):
    """Convert an audio waveform into VGGish examples, logging shapes.

    The shape of the intermediate data is reported through vprint after each
    stage (input, mono/resampled, log mel, framed examples).

    Args:
      data: np.array of samples, one-dimensional (mono) or two-dimensional
        (multi-channel). For two-dimensional input axis 1 is averaged away.
        Samples are generally expected to lie in [-1.0, +1.0], although this
        is not enforced.
      sample_rate: Sample rate of `data`; the audio is resampled with resampy
        when it differs from vggish_params.SAMPLE_RATE.

    Returns:
      3-D np.array of shape [num_examples, num_frames, num_bands]: a sequence
      of log mel spectrogram patches, each covering num_frames frames and
      num_bands mel bands, where the frame hop is
      vggish_params.STFT_HOP_LENGTH_SECONDS.
    """
    vprint('waveform_to_examples input data shape')
    vprint(data.shape)

    samples = data

    # Average axis 1 away for multi-channel input.
    if len(samples.shape) > 1:
        samples = np.mean(samples, axis=1)

    # Bring the audio to the rate VGGish assumes.
    if sample_rate != vggish_params.SAMPLE_RATE:
        samples = resampy.resample(
            samples, sample_rate, vggish_params.SAMPLE_RATE)

    vprint('waveform_to_examples resampled mono shape')
    vprint(samples.shape)

    spectrogram = mel_features.log_mel_spectrogram(
        samples,
        audio_sample_rate=vggish_params.SAMPLE_RATE,
        log_offset=vggish_params.LOG_OFFSET,
        window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=vggish_params.NUM_MEL_BINS,
        lower_edge_hertz=vggish_params.MEL_MIN_HZ,
        upper_edge_hertz=vggish_params.MEL_MAX_HZ)

    vprint('waveform_to_examples log_mel shape')
    vprint(spectrogram.shape)

    # Express the example window and hop in spectrogram frames.
    frames_per_second = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(
        round(vggish_params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(
        round(vggish_params.EXAMPLE_HOP_SECONDS * frames_per_second))

    examples = mel_features.frame(
        spectrogram,
        window_length=window_frames,
        hop_length=hop_frames)

    vprint('waveform_to_examples log_mel reshaped')
    vprint(examples.shape)

    return examples
def getmelspectrogram(src):
    """Compute a log mel spectrogram of `src` scaled and cast to uint8.

    The spectrogram is computed with fixed settings (16 kHz audio, 25 ms
    window, 10 ms hop, 32 mel bins between 60 Hz and 3800 Hz, log offset
    0.001). It is then shifted by -log(1e-3), multiplied by 30, rounded up
    and cast to np.uint8.

    NOTE(review): nothing clips the scaled values before the uint8 cast, so
    values above 255 would not be representable -- confirm the expected
    input range.

    Args:
      src: Audio samples passed directly to
        mel_features.log_mel_spectrogram.

    Returns:
      np.ndarray of dtype uint8 holding the scaled spectrogram.
    """
    log_mel = mel_features.log_mel_spectrogram(
        src,
        audio_sample_rate=16000,
        log_offset=0.001,
        window_length_secs=0.025,
        hop_length_secs=0.010,
        num_mel_bins=32,
        lower_edge_hertz=60,
        upper_edge_hertz=3800)

    # Subtracting log(1e-3) offsets the spectrogram by the log of the offset
    # that was added before taking the logarithm.
    scaled = 30 * (log_mel - np.log(1e-3))

    return np.array(np.ceil(scaled), dtype=np.uint8)
def _compute_spectrogram(self, audio_samples, audio_sample_rate_hz):
    """Compute a scaled log-mel spectrogram of the given samples.

    The samples are flattened and divided by 2**15 before the spectrogram is
    computed (presumably they are int16 PCM -- confirm against callers). The
    log mel values are shifted by -log(1e-3) and multiplied by 30.

    NOTE(review): no integer cast is applied here; the scaled array is
    returned as computed.

    Args:
      audio_samples: Array of samples; must provide `.flatten()`.
      audio_sample_rate_hz: Sample rate passed to
        mel_features.log_mel_spectrogram.

    Returns:
      The scaled log mel spectrogram.
    """
    normalized = audio_samples.flatten() / float(2**15)

    log_mel = mel_features.log_mel_spectrogram(
        normalized,
        audio_sample_rate_hz,
        log_offset=0.001,
        window_length_secs=self.spectrogram_window_length_seconds,
        hop_length_secs=self.spectrogram_hop_length_seconds,
        num_mel_bins=self.num_mel_bins,
        lower_edge_hertz=60,
        upper_edge_hertz=3800)

    return 30 * (log_mel - np.log(1e-3))
def generate_mel_spectogram(self, config, filtered_signal):
    """Compute the log mel spectrogram of an already filtered signal.

    Only the sample rate is taken from the configuration; every other
    spectrogram setting is left at log_mel_spectrogram's defaults, except
    the log offset, which is fixed at 0.01.

    Args:
      config: Mapping that provides config["pre_process"]["sample_rate"].
      filtered_signal: Audio samples passed directly to log_mel_spectrogram.

    Returns:
      Whatever log_mel_spectrogram returns for the signal.
    """
    sample_rate = config["pre_process"]["sample_rate"]
    return log_mel_spectrogram(
        filtered_signal,
        audio_sample_rate=sample_rate,
        log_offset=0.01)
def waveform_to_examples(data, sample_rate):
    """Convert an audio waveform into an array of examples for VGGish.

    Args:
      data: np.array of samples, one-dimensional (mono) or two-dimensional
        (multi-channel). For two-dimensional input axis 1 is averaged away.
        Samples are generally expected to lie in [-1.0, +1.0], although this
        is not enforced.
      sample_rate: Sample rate of `data`; the audio is resampled with resampy
        when it differs from vggish_params.SAMPLE_RATE.

    Returns:
      3-D np.array of shape [num_examples, num_frames, num_bands]: a sequence
      of log mel spectrogram patches, each covering num_frames frames and
      num_bands mel bands, where the frame hop is
      vggish_params.STFT_HOP_LENGTH_SECONDS.
    """
    samples = data

    # Average axis 1 away for multi-channel input.
    if len(samples.shape) > 1:
        samples = np.mean(samples, axis=1)

    # Bring the audio to the rate VGGish assumes.
    if sample_rate != vggish_params.SAMPLE_RATE:
        samples = resampy.resample(
            samples, sample_rate, vggish_params.SAMPLE_RATE)

    spectrogram = mel_features.log_mel_spectrogram(
        samples,
        audio_sample_rate=vggish_params.SAMPLE_RATE,
        log_offset=vggish_params.LOG_OFFSET,
        window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=vggish_params.NUM_MEL_BINS,
        lower_edge_hertz=vggish_params.MEL_MIN_HZ,
        upper_edge_hertz=vggish_params.MEL_MAX_HZ)

    # Express the example window and hop in spectrogram frames.
    frames_per_second = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(
        round(vggish_params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(
        round(vggish_params.EXAMPLE_HOP_SECONDS * frames_per_second))

    return mel_features.frame(
        spectrogram,
        window_length=window_frames,
        hop_length=hop_frames)
def wav_to_mel(filename, parser, model):
    """Load an audio file and return its log mel spectrogram.

    All spectrogram settings are read from the 'mel' section of `parser`.
    The number of mel bins comes from NUM_BANDS_TEACHER when `model` is
    'teacher' and from NUM_BANDS_STUDENT for any other value.

    Non-empty clips shorter than one second are padded (via
    librosa.util.fix_length) to 1.01 seconds at their native sample rate
    before any resampling.

    Args:
      filename: Path of the audio file, loaded with librosa as mono at its
        native sample rate.
      parser: Config parser providing getint/getfloat for the 'mel' section
        keys SAMPLE_RATE, LOG_OFFSET, STFT_WINDOW_LENGTH_SECONDS,
        STFT_HOP_LENGTH_SECONDS, MEL_MIN_HZ, MEL_MAX_HZ, NUM_BANDS_TEACHER
        and NUM_BANDS_STUDENT.
      model: 'teacher' selects the teacher band count; anything else selects
        the student band count.

    Returns:
      The log mel spectrogram returned by mel_features.log_mel_spectrogram.
    """
    target_rate = parser.getint('mel', 'SAMPLE_RATE')
    log_offset = parser.getfloat('mel', 'LOG_OFFSET')
    window_secs = parser.getfloat('mel', 'STFT_WINDOW_LENGTH_SECONDS')
    hop_secs = parser.getfloat('mel', 'STFT_HOP_LENGTH_SECONDS')
    min_hz = parser.getint('mel', 'MEL_MIN_HZ')
    max_hz = parser.getint('mel', 'MEL_MAX_HZ')

    bands_key = (
        'NUM_BANDS_TEACHER' if model == 'teacher' else 'NUM_BANDS_STUDENT')
    num_mel_bins = parser.getint('mel', bands_key)

    # mono=True always yields a 1-D signal, so no channel averaging or
    # transposition is needed afterwards.
    samples, native_rate = librosa.load(filename, mono=True, sr=None)

    # Pad non-empty clips shorter than one second to slightly over a second.
    # `size` must be passed by keyword: it is keyword-only in librosa >= 0.10
    # and the same name is accepted by older releases.
    if 0 < samples.shape[0] < native_rate:
        samples = librosa.util.fix_length(
            samples, size=int(native_rate * 1.01))

    # Bring the audio to the configured sample rate.
    if native_rate != target_rate:
        samples = resampy.resample(samples, native_rate, target_rate)

    return mel_features.log_mel_spectrogram(
        samples,
        audio_sample_rate=target_rate,
        log_offset=log_offset,
        window_length_secs=window_secs,
        hop_length_secs=hop_secs,
        num_mel_bins=num_mel_bins,
        lower_edge_hertz=min_hz,
        upper_edge_hertz=max_hz)
def extract_alt_logmel(path_file, frame_size=0.025, frame_stride=0.010,
                       normalize=True):
    """Extract log mel features from a WAV file.

    The features come from the log_mel_spectrogram routine (per the original
    author, the one shipped with the Google AudioSet / VGGish code, which
    uses a Hann window rather than a Hamming window -- not verifiable from
    this function alone).

    The samples returned by wavfile.read are passed on as-is, with no
    rescaling, and the log offset is 0.0.

    Args:
      path_file: WAV file to read with wavfile.read.
      frame_size: Analysis window length in seconds.
      frame_stride: Hop between successive windows in seconds.
      normalize: When true, subtract the mean over axis 0 (plus 1e-8) from
        the features, in place.

    Returns:
      The (optionally mean-normalized) log mel feature array.
    """
    sample_rate, signal = wavfile.read(path_file)

    filter_banks = log_mel_spectrogram(
        signal,
        audio_sample_rate=sample_rate,
        log_offset=0.0,
        window_length_secs=frame_size,
        hop_length_secs=frame_stride)

    if not normalize:
        return filter_banks

    # Remove the mean taken along axis 0 from every row.
    filter_banks -= (np.mean(filter_banks, axis=0) + 1e-8)
    return filter_banks
def wavedata_to_log_melspectrogram(wav_data, sample_rate):
    """Convert int16 PCM samples into a log mel spectrogram.

    Unlike the example-producing helpers, the spectrogram is returned whole;
    it is not split into fixed-size examples.

    Args:
      wav_data: np.array of dtype int16. When it has more than one dimension,
        axis 1 is averaged away.
      sample_rate: Sample rate of `wav_data`; the audio is resampled with
        resampy when it differs from vggish_params.SAMPLE_RATE.

    Returns:
      The log mel spectrogram returned by mel_features.log_mel_spectrogram.

    Raises:
      AssertionError: If `wav_data` is not int16.
    """
    assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype

    # Scale int16 PCM into [-1.0, +1.0].
    samples = wav_data / 32768.0

    if len(samples.shape) > 1:
        samples = np.mean(samples, axis=1)

    # Bring the audio to the rate VGGish assumes.
    if sample_rate != vggish_params.SAMPLE_RATE:
        samples = resampy.resample(
            samples, sample_rate, vggish_params.SAMPLE_RATE)

    return mel_features.log_mel_spectrogram(
        samples,
        audio_sample_rate=vggish_params.SAMPLE_RATE,
        log_offset=vggish_params.LOG_OFFSET,
        window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=vggish_params.NUM_MEL_BINS,
        lower_edge_hertz=vggish_params.MEL_MIN_HZ,
        upper_edge_hertz=vggish_params.MEL_MAX_HZ)
def waveform_to_examples(data, sample_rate):
    """Convert an audio waveform into a single resized VGGish example.

    Instead of cutting the log mel spectrogram into fixed-length windows,
    this variant resizes the whole spectrogram with
    torchvision.transforms.Resize((96, 64)) and returns it as the only
    example.

    Args:
      data: np.array of samples, one-dimensional (mono) or two-dimensional
        (multi-channel). For two-dimensional input axis 1 is averaged away.
        Samples are generally expected to lie in [-1.0, +1.0], although this
        is not enforced.
      sample_rate: Sample rate of `data`; the audio is resampled with resampy
        when it differs from vggish_params.SAMPLE_RATE.

    Returns:
      np.array with a leading axis of length 1 wrapping the resized
      spectrogram.
    """
    samples = data

    # Average axis 1 away for multi-channel input.
    if len(samples.shape) > 1:
        samples = np.mean(samples, axis=1)

    # Bring the audio to the rate VGGish assumes.
    if sample_rate != vggish_params.SAMPLE_RATE:
        samples = resampy.resample(
            samples, sample_rate, vggish_params.SAMPLE_RATE)

    spectrogram = mel_features.log_mel_spectrogram(
        samples,
        audio_sample_rate=vggish_params.SAMPLE_RATE,
        log_offset=vggish_params.LOG_OFFSET,
        window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=vggish_params.NUM_MEL_BINS,
        lower_edge_hertz=vggish_params.MEL_MIN_HZ,
        upper_edge_hertz=vggish_params.MEL_MAX_HZ)

    # Treat the spectrogram as an image so it can be resized to (96, 64).
    resize = torchvision.transforms.Resize((96, 64))
    resized = np.asarray(resize(Image.fromarray(spectrogram)))

    return np.array([resized])
def wavfile_to_examples(wav_file):
    """Read a 16-bit WAV file and convert it to zero-padded log mel examples.

    The log mel spectrogram is zero-padded along axis 0 up to the next
    multiple of params.NUM_FRAMES before it is framed.

    Args:
      wav_file: Anything accepted by wavfile.read (path or file-like object).

    Returns:
      The result of mel_features.frame on the padded spectrogram, or the
      integer 0 when the file contains no samples.

    Raises:
      AssertionError: If the decoded samples are not int16.
    """
    sample_rate, wav_data = wavfile.read(wav_file)
    assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype

    # Scale int16 PCM into [-1.0, +1.0].
    samples = wav_data / 32768.0

    # Average axis 1 away when the file has more than one dimension.
    if len(samples.shape) > 1:
        samples = np.mean(samples, axis=1)

    # Empty audio is signalled to the caller with a plain 0.
    if len(samples) == 0:
        return 0

    if sample_rate != params.SAMPLE_RATE:
        samples = resampy.resample(samples, sample_rate, params.SAMPLE_RATE)

    # Log mel spectrogram of the whole file; the number of mel bins is taken
    # from params.NUM_MEL_BINS.
    spectrogram = mel_features.log_mel_spectrogram(
        samples,
        audio_sample_rate=params.SAMPLE_RATE,
        log_offset=params.LOG_OFFSET,
        window_length_secs=params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=params.NUM_MEL_BINS,
        lower_edge_hertz=params.MEL_MIN_HZ,
        upper_edge_hertz=params.MEL_MAX_HZ)

    # Express the example window and hop in spectrogram frames.
    frames_per_second = 1.0 / params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(
        round(params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(round(params.EXAMPLE_HOP_SECONDS * frames_per_second))

    # Zero-pad along axis 0 to a whole number of NUM_FRAMES-sized groups.
    frame_count, band_count = spectrogram.shape[0], spectrogram.shape[1]
    if frame_count % params.NUM_FRAMES:
        padded_count = int(
            np.ceil(1.0 * frame_count / params.NUM_FRAMES) * params.NUM_FRAMES)
        padded = np.zeros((padded_count, band_count))
        padded[:frame_count, :band_count] = spectrogram
        spectrogram = padded

    return mel_features.frame(
        spectrogram,
        window_length=window_frames,
        hop_length=hop_frames)
def waveform_to_examples(data, sample_rate):
    """Convert an audio waveform into overlapping examples for VGGish.

    After mono conversion and resampling, slices of vggish_params.SAMPLE_RATE
    samples are taken every half that many samples and laid end to end, so
    samples covered by two slices appear twice in the stitched signal. The
    stitched signal is then zero-padded at the end before the log mel
    spectrogram is computed and framed.

    Args:
      data: np.array of samples, one-dimensional (mono) or two-dimensional
        (multi-channel). For two-dimensional input axis 1 is averaged away.
        Samples are generally expected to lie in [-1.0, +1.0], although this
        is not enforced.
      sample_rate: Sample rate of `data`; the audio is resampled with resampy
        when it differs from vggish_params.SAMPLE_RATE.

    Returns:
      A 3-tuple (original_length, padded_length, examples):
      - original_length: number of samples after mono conversion and
        resampling, before stitching.
      - padded_length: number of samples after stitching and zero-padding.
      - examples: 3-D np.array of shape [num_examples, num_frames,
        num_bands], a sequence of log mel spectrogram patches where the
        frame hop is vggish_params.STFT_HOP_LENGTH_SECONDS.
    """
    # Convert to mono.
    if len(data.shape) > 1:
        data = np.mean(data, axis=1)

    # Resample to the rate assumed by VGGish.
    if sample_rate != vggish_params.SAMPLE_RATE:
        data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)

    olength = len(data)
    chunk_length = vggish_params.SAMPLE_RATE
    chunk_step = int(0.5 * chunk_length)

    # Slicing clamps at the end of the array, so the trailing slices are
    # simply shorter. Concatenating the slices avoids appending samples to a
    # Python list one element at a time.
    chunks = [
        data[start:start + chunk_length]
        for start in range(0, len(data), chunk_step)
    ]
    # np.concatenate rejects an empty sequence; fall back to an empty array
    # so empty input still produces a block of zero padding.
    stitched = np.concatenate(chunks) if chunks else np.asarray([])

    pad_length = chunk_length - (len(stitched) % chunk_step)
    data = np.pad(stitched, (0, pad_length), 'constant')

    # Compute log mel spectrogram features.
    log_mel = mel_features.log_mel_spectrogram(
        data,
        audio_sample_rate=vggish_params.SAMPLE_RATE,
        log_offset=vggish_params.LOG_OFFSET,
        window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=vggish_params.NUM_MEL_BINS,
        lower_edge_hertz=vggish_params.MEL_MIN_HZ,
        upper_edge_hertz=vggish_params.MEL_MAX_HZ)

    # Frame features into examples.
    features_sample_rate = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
    example_window_length = int(
        round(vggish_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate))
    example_hop_length = int(
        round(vggish_params.EXAMPLE_HOP_SECONDS * features_sample_rate))
    log_mel_examples = mel_features.frame(
        log_mel,
        window_length=example_window_length,
        hop_length=example_hop_length)

    return olength, len(data), log_mel_examples
def waveform_to_examples(data, sample_rate, file_path):
    """Convert an audio waveform into VGGish examples plus a parameter record.

    Args:
      data: np.array of samples, one-dimensional (mono) or two-dimensional
        (multi-channel). For two-dimensional input axis 1 is averaged away.
        Samples are generally expected to lie in [-1.0, +1.0], although this
        is not enforced.
      sample_rate: Sample rate of `data`; the audio is resampled with resampy
        when it differs from vggish_params.SAMPLE_RATE.
      file_path: Path of the source file; only its base name is recorded.

    Returns:
      A 2-tuple (record, examples):
      - record: dict holding the file's base name, the spectrogram settings
        that were used (the upper mel edge is not included) and the full log
        mel spectrogram under the key "log_mel".
      - examples: 3-D np.array of shape [num_examples, num_frames,
        num_bands], a sequence of log mel spectrogram patches where the
        frame hop is vggish_params.STFT_HOP_LENGTH_SECONDS.
    """
    samples = data

    # Average axis 1 away for multi-channel input.
    if len(samples.shape) > 1:
        samples = np.mean(samples, axis=1)

    # Bring the audio to the rate VGGish assumes.
    if sample_rate != vggish_params.SAMPLE_RATE:
        samples = resampy.resample(
            samples, sample_rate, vggish_params.SAMPLE_RATE)

    # Snapshot the settings once so the spectrogram call and the record
    # below are guaranteed to agree.
    audio_sample_rate = vggish_params.SAMPLE_RATE
    log_offset = vggish_params.LOG_OFFSET
    window_length_secs = vggish_params.STFT_WINDOW_LENGTH_SECONDS
    hop_length_secs = vggish_params.STFT_HOP_LENGTH_SECONDS
    num_mel_bins = vggish_params.NUM_MEL_BINS
    lower_edge_hertz = vggish_params.MEL_MIN_HZ
    upper_edge_hertz = vggish_params.MEL_MAX_HZ

    log_mel = mel_features.log_mel_spectrogram(
        samples,
        audio_sample_rate=audio_sample_rate,
        log_offset=log_offset,
        window_length_secs=window_length_secs,
        hop_length_secs=hop_length_secs,
        num_mel_bins=num_mel_bins,
        lower_edge_hertz=lower_edge_hertz,
        upper_edge_hertz=upper_edge_hertz)

    # Express the example window and hop in spectrogram frames.
    frames_per_second = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(
        round(vggish_params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(
        round(vggish_params.EXAMPLE_HOP_SECONDS * frames_per_second))

    examples = mel_features.frame(
        log_mel,
        window_length=window_frames,
        hop_length=hop_frames)

    record = {
        "file_name": os.path.basename(file_path),
        "audio_sample_rate": audio_sample_rate,
        "log_offset": log_offset,
        "window_length_secs": window_length_secs,
        "hop_length_secs": hop_length_secs,
        "num_mel_bins": num_mel_bins,
        "lower_edge_hertz": lower_edge_hertz,
        "log_mel": log_mel,
    }

    return record, examples