Python log_mel_spectrogram 예제들, mel_features.log_mel_spectrogram Python 예제들

예제 #1

0

파일 보기

def shorter_waveform_to_examples(data):
    """Convert one short audio clip into framed log-mel spectrogram examples.

    Args:
        data: Audio samples of the clip. They are handed directly to
            ``mel_features.log_mel_spectrogram`` with ``params.SAMPLE_RATE``
            as the sample rate; no mono conversion or resampling is done
            here.

    Returns:
        The output of ``mel_features.frame`` applied to the log-mel
        spectrogram, using a window of ``params.EXAMPLE_WINDOW_SECONDS`` and
        a hop of ``params.EXAMPLE_HOP_SECONDS`` (both converted to a number
        of spectrogram frames).
    """
    # Collect the spectrogram settings in one place. The mel bin count is
    # taken explicitly from params instead of the helper's default.
    spectrogram_settings = dict(
        audio_sample_rate=params.SAMPLE_RATE,
        log_offset=params.LOG_OFFSET,
        window_length_secs=params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=params.NUM_MEL_BINS,
        lower_edge_hertz=params.MEL_MIN_HZ,
        upper_edge_hertz=params.MEL_MAX_HZ,
    )
    spectrogram = mel_features.log_mel_spectrogram(data, **spectrogram_settings)

    # Number of spectrogram frames per second of audio.
    frames_per_second = 1.0 / params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(round(params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(round(params.EXAMPLE_HOP_SECONDS * frames_per_second))

    # Slice the spectrogram into fixed-size examples.
    return mel_features.frame(
        spectrogram,
        window_length=window_frames,
        hop_length=hop_frames)

예제 #2

0

파일 보기

파일: TrainCode.py 프로젝트: pranavlal30/dhishoom

def wavfile_to_examples(wav_file):
    """Read a WAV file and turn it into framed log-mel spectrogram examples.

    Args:
        wav_file: Path or file object accepted by ``wavfile.read``
            (presumably ``scipy.io.wavfile`` -- confirm against the file's
            imports).

    Returns:
        The output of ``mel_features.frame`` for the file's log-mel
        spectrogram.

    Raises:
        AssertionError: If the decoded samples are not ``np.int16``.
    """
    sample_rate, wav_data = wavfile.read(wav_file)
    assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype

    # Scale 16-bit PCM so the samples fall in [-1.0, +1.0].
    samples = wav_data / 32768.0

    # Average the channels of multi-channel audio into one.
    if samples.ndim > 1:
        samples = np.mean(samples, axis=1)

    # Bring the audio to the rate assumed by VGGish.
    if sample_rate != SAMPLE_RATE:
        samples = resampy.resample(samples, sample_rate, SAMPLE_RATE)

    spectrogram = mel_features.log_mel_spectrogram(
        samples,
        audio_sample_rate=SAMPLE_RATE,
        log_offset=LOG_OFFSET,
        window_length_secs=STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=NUM_MEL_BINS,
        lower_edge_hertz=MEL_MIN_HZ,
        upper_edge_hertz=MEL_MAX_HZ)

    # Express the example window and hop as a number of spectrogram frames.
    frames_per_second = 1.0 / STFT_HOP_LENGTH_SECONDS
    window_frames = int(round(EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(round(EXAMPLE_HOP_SECONDS * frames_per_second))

    return mel_features.frame(
        spectrogram,
        window_length=window_frames,
        hop_length=hop_frames)

예제 #3

0

파일 보기

def waveform_to_examples(data, sample_rate):
  """Converts an audio waveform into framed log-mel spectrogram examples.

  Args:
    data: Array of audio samples. Two-dimensional input is averaged over
      axis 1 to obtain a single channel.
    sample_rate: Not read by this function; the spectrogram is computed as
      if the audio were sampled at vggish_params.SAMPLE_RATE.

  Returns:
    The output of mel_features.frame applied to the log-mel spectrogram.
  """
  # NOTE(review): no resampling happens here -- confirm that callers supply
  # audio already sampled at vggish_params.SAMPLE_RATE.
  mono = np.mean(data, axis=1) if len(data.shape) > 1 else data

  spectrogram = mel_features.log_mel_spectrogram(
      mono,
      audio_sample_rate=vggish_params.SAMPLE_RATE,
      log_offset=vggish_params.LOG_OFFSET,
      window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
      hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
      num_mel_bins=vggish_params.NUM_MEL_BINS,
      lower_edge_hertz=vggish_params.MEL_MIN_HZ,
      upper_edge_hertz=vggish_params.MEL_MAX_HZ)

  # Express the example window and hop as a number of spectrogram frames.
  frames_per_second = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
  window_seconds = vggish_params.EXAMPLE_WINDOW_SECONDS
  window_frames = int(round(window_seconds * frames_per_second))
  hop_seconds = vggish_params.EXAMPLE_HOP_SECONDS
  hop_frames = int(round(hop_seconds * frames_per_second))

  return mel_features.frame(
      spectrogram, window_length=window_frames, hop_length=hop_frames)

예제 #4

0

파일 보기

def preprocess_sound(data, sample_rate):
    """Turn raw audio samples into framed log-mel spectrogram examples.

    Args:
        data: Array of audio samples. Two-dimensional input is averaged over
            axis 1 to obtain a single channel.
        sample_rate: Sample rate of ``data``. Audio at any other rate than
            ``SAMPLE_RATE`` is resampled with ``resampy``.

    Returns:
        The output of ``mel_features.frame`` applied to the log-mel
        spectrogram.
    """
    signal = data
    if len(signal.shape) > 1:
        signal = np.mean(signal, axis=1)
    if sample_rate != SAMPLE_RATE:
        # Bring the audio to the rate assumed by VGGish.
        signal = resampy.resample(signal, sample_rate, SAMPLE_RATE)

    mel_settings = {
        'audio_sample_rate': SAMPLE_RATE,
        'log_offset': LOG_OFFSET,
        'window_length_secs': STFT_WINDOW_LENGTH_SECONDS,
        'hop_length_secs': STFT_HOP_LENGTH_SECONDS,
        'num_mel_bins': NUM_MEL_BINS,
        'lower_edge_hertz': MEL_MIN_HZ,
        'upper_edge_hertz': MEL_MAX_HZ,
    }
    spectrogram = mel_features.log_mel_spectrogram(signal, **mel_settings)

    # Express the example window and hop as a number of spectrogram frames.
    frames_per_second = 1.0 / STFT_HOP_LENGTH_SECONDS
    window_frames = int(round(EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(round(EXAMPLE_HOP_SECONDS * frames_per_second))
    return mel_features.frame(
        spectrogram, window_length=window_frames, hop_length=hop_frames)

예제 #5

0

파일 보기

def _waveform_to_mel_spectrogram_segments(data, sample_rate):
    """Convert audio samples into framed log-mel spectrogram examples.

    Args:
        data: Array of audio samples. Two-dimensional input is averaged over
            axis 1 to obtain a single channel.
        sample_rate: Sample rate of ``data``. Audio at any other rate than
            ``mel_params.SAMPLE_RATE`` is resampled with ``resampy``.

    Returns:
        The output of ``frame`` applied to the log-mel spectrogram. Its first
        dimension is the number of examples.

    IMPORTANT: the result can contain zero examples (``shape[0] == 0``); per
        the original author's note this happens when the spectrogram has
        fewer frames than one example window. A warning is printed in that
        case, but the empty array is still returned, so callers must check
        for it.
    """
    signal = data
    if len(signal.shape) > 1:
        signal = np.mean(signal, axis=1)

    # Bring the audio to the rate assumed by VGGish.
    if sample_rate != mel_params.SAMPLE_RATE:
        signal = resampy.resample(signal, sample_rate, mel_params.SAMPLE_RATE)

    spectrogram = log_mel_spectrogram(
        signal,
        audio_sample_rate=mel_params.SAMPLE_RATE,
        log_offset=mel_params.LOG_OFFSET,
        window_length_secs=mel_params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=mel_params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=mel_params.NUM_MEL_BINS,
        lower_edge_hertz=mel_params.MEL_MIN_HZ,
        upper_edge_hertz=mel_params.MEL_MAX_HZ)

    # Express the example window and hop as a number of spectrogram frames.
    frames_per_second = 1.0 / mel_params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(
        round(mel_params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(
        round(mel_params.EXAMPLE_HOP_SECONDS * frames_per_second))

    examples = frame(
        spectrogram, window_length=window_frames, hop_length=hop_frames)

    # NOTE(review): the message mentions "all zeros", but this function
    # returns the empty array unchanged -- any zero substitution has to
    # happen in the caller.
    if not examples.shape[0]:
        print('\nWARNING: audio sample too short! Using all zeros for that example.\n')
    return examples

예제 #6

0

파일 보기

def waveform_to_examples(data, sample_rate):
  """Converts an audio waveform into framed log-mel examples, with tracing.

  The shape of the intermediate result is reported through vprint after
  every processing stage.

  Args:
    data: Array of audio samples. Two-dimensional input is averaged over
      axis 1 to obtain a single channel.
    sample_rate: Sample rate of data. Audio at any other rate than
      vggish_params.SAMPLE_RATE is resampled with resampy.

  Returns:
    The output of mel_features.frame applied to the log-mel spectrogram.
  """
  def _trace(label, array):
    # Emit the stage label, then the shape, as two separate vprint calls.
    vprint(label)
    vprint(array.shape)

  _trace('waveform_to_examples input data shape', data)

  signal = data
  if len(signal.shape) > 1:
    signal = np.mean(signal, axis=1)
  # Bring the audio to the rate assumed by VGGish.
  if sample_rate != vggish_params.SAMPLE_RATE:
    signal = resampy.resample(signal, sample_rate, vggish_params.SAMPLE_RATE)

  _trace('waveform_to_examples resampled mono shape', signal)

  spectrogram = mel_features.log_mel_spectrogram(
      signal,
      audio_sample_rate=vggish_params.SAMPLE_RATE,
      log_offset=vggish_params.LOG_OFFSET,
      window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
      hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
      num_mel_bins=vggish_params.NUM_MEL_BINS,
      lower_edge_hertz=vggish_params.MEL_MIN_HZ,
      upper_edge_hertz=vggish_params.MEL_MAX_HZ)

  _trace('waveform_to_examples log_mel shape', spectrogram)

  # Express the example window and hop as a number of spectrogram frames.
  frames_per_second = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
  window_frames = int(round(
      vggish_params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
  hop_frames = int(round(
      vggish_params.EXAMPLE_HOP_SECONDS * frames_per_second))
  examples = mel_features.frame(
      spectrogram, window_length=window_frames, hop_length=hop_frames)

  _trace('waveform_to_examples log_mel reshaped', examples)
  return examples

예제 #7

0

파일 보기

파일: app.py 프로젝트: darshpanchal/FriDay-demo

def getmelspectrogram(src):
    """Compute a log-mel spectrogram of ``src`` quantized to ``uint8``.

    The spectrogram is computed with fixed settings (16 kHz sample rate,
    25 ms window, 10 ms hop, 32 mel bins between 60 Hz and 3800 Hz). It is
    then shifted by ``-log(1e-3)``, multiplied by 30, rounded up and cast to
    ``np.uint8``.

    Args:
        src: Audio samples passed to ``mel_features.log_mel_spectrogram``.

    Returns:
        ``np.uint8`` array holding the scaled spectrogram.
    """
    log_mel = mel_features.log_mel_spectrogram(
        src,
        audio_sample_rate=16000,
        log_offset=0.001,
        window_length_secs=0.025,
        hop_length_secs=0.010,
        num_mel_bins=32,
        lower_edge_hertz=60,
        upper_edge_hertz=3800)

    scaled = 30 * (log_mel - np.log(1e-3))

    # NOTE(review): the cast is not clipped -- confirm that the scaled values
    # always fit the 0..255 range of uint8.
    return np.array(np.ceil(scaled), dtype=np.uint8)

예제 #8

0

파일 보기

파일: model.py 프로젝트: JSwaim22/ValtClassy

 def _compute_spectrogram(self, audio_samples, audio_sample_rate_hz):
     """Compute the scaled log-mel spectrogram of a block of samples.

     The samples are flattened and divided by 2**15 before the spectrogram
     is computed. The log-mel values are then shifted by ``-log(1e-3)`` and
     multiplied by 30. No integer cast is applied here; the scaled array is
     returned as computed.

     Args:
         audio_samples: Array of samples (presumably 16-bit PCM, given the
             2**15 divisor -- confirm against the caller).
         audio_sample_rate_hz: Sample rate forwarded to
             ``mel_features.log_mel_spectrogram``.

     Returns:
         The scaled log-mel spectrogram.
     """
     normalized = audio_samples.flatten() / 32768.0
     log_mel = mel_features.log_mel_spectrogram(
         normalized,
         audio_sample_rate_hz,
         log_offset=0.001,
         window_length_secs=self.spectrogram_window_length_seconds,
         hop_length_secs=self.spectrogram_hop_length_seconds,
         num_mel_bins=self.num_mel_bins,
         lower_edge_hertz=60,
         upper_edge_hertz=3800)
     return 30 * (log_mel - np.log(1e-3))

예제 #9

0

파일 보기

파일: extract_features.py 프로젝트: gustavodsf/audio-process

 def generate_mel_spectogram(self, config, filtered_signal):
     """Compute the log-mel spectrogram of an already filtered signal.

     Only the sample rate is taken from the configuration; every other
     spectrogram setting is left at the default of ``log_mel_spectrogram``.
     An earlier ``librosa.feature.melspectrogram`` variant that also used
     ``n_mels``, ``n_fft`` and ``hop_length`` from the configuration is no
     longer in use.

     Args:
         config: Mapping read as ``config["pre_process"]["sample_rate"]``.
         filtered_signal: Audio samples to analyze.

     Returns:
         The value returned by ``log_mel_spectrogram``.
     """
     sample_rate = config["pre_process"]["sample_rate"]
     return log_mel_spectrogram(
         filtered_signal,
         audio_sample_rate=sample_rate,
         log_offset=0.01)

예제 #10

0

파일 보기

파일: vggish_input.py 프로젝트: 812864539/models

def waveform_to_examples(data, sample_rate):
  """Converts an audio waveform into an array of examples for VGGish.

  Args:
    data: Array of audio samples. Two-dimensional input is averaged over
      axis 1 to obtain a single channel.
    sample_rate: Sample rate of data. Audio at any other rate than
      vggish_params.SAMPLE_RATE is resampled with resampy.

  Returns:
    The output of mel_features.frame applied to the log-mel spectrogram:
    a sequence of examples, each covering
    vggish_params.EXAMPLE_WINDOW_SECONDS of spectrogram frames and taken
    every vggish_params.EXAMPLE_HOP_SECONDS.
  """
  mono = data
  if len(mono.shape) > 1:
    mono = np.mean(mono, axis=1)
  if sample_rate != vggish_params.SAMPLE_RATE:
    # Bring the audio to the rate assumed by VGGish.
    mono = resampy.resample(mono, sample_rate, vggish_params.SAMPLE_RATE)

  mel_settings = dict(
      audio_sample_rate=vggish_params.SAMPLE_RATE,
      log_offset=vggish_params.LOG_OFFSET,
      window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
      hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
      num_mel_bins=vggish_params.NUM_MEL_BINS,
      lower_edge_hertz=vggish_params.MEL_MIN_HZ,
      upper_edge_hertz=vggish_params.MEL_MAX_HZ,
  )
  spectrogram = mel_features.log_mel_spectrogram(mono, **mel_settings)

  # Express the example window and hop as a number of spectrogram frames.
  frames_per_second = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
  window_frames = int(
      round(vggish_params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
  hop_frames = int(
      round(vggish_params.EXAMPLE_HOP_SECONDS * frames_per_second))
  return mel_features.frame(
      spectrogram, window_length=window_frames, hop_length=hop_frames)

예제 #11

0

파일 보기

파일: make_mel.py 프로젝트: hardhat5/knowledge-distillation-ust

def wav_to_mel(filename, parser, model):
    """Load an audio file and return its log-mel spectrogram.

    Args:
        filename: Path of the audio file, loaded with ``librosa.load`` at
            its native sample rate and downmixed to mono.
        parser: Configuration object providing ``getint`` / ``getfloat``;
            all settings are read from its ``'mel'`` section.
        model: ``'teacher'`` selects ``NUM_BANDS_TEACHER`` as the number of
            mel bins; any other value selects ``NUM_BANDS_STUDENT``.

    Returns:
        The value returned by ``mel_features.log_mel_spectrogram`` for the
        (possibly padded and resampled) signal.
    """
    target_rate = parser.getint('mel', 'SAMPLE_RATE')
    log_offset = parser.getfloat('mel', 'LOG_OFFSET')
    window_secs = parser.getfloat('mel', 'STFT_WINDOW_LENGTH_SECONDS')
    hop_secs = parser.getfloat('mel', 'STFT_HOP_LENGTH_SECONDS')
    mel_min_hz = parser.getint('mel', 'MEL_MIN_HZ')
    mel_max_hz = parser.getint('mel', 'MEL_MAX_HZ')

    # The teacher and student models differ only in the number of mel bands.
    bands_option = ('NUM_BANDS_TEACHER' if model == 'teacher'
                    else 'NUM_BANDS_STUDENT')
    num_mel_bins = parser.getint('mel', bands_option)

    data, sample_rate = librosa.load(filename, mono=True, sr=None)

    # Pad non-empty clips shorter than one second to 1.01 seconds. ``size``
    # is passed by keyword because newer librosa releases make it
    # keyword-only; the keyword form also works on older releases.
    if 0 < data.shape[0] < sample_rate:
        data = librosa.util.fix_length(data, size=int(sample_rate * 1.01))

    # Kept from the original: with mono=True the signal is expected to be
    # one-dimensional already, which makes these two steps no-ops.
    data = data.T
    if len(data.shape) > 1:
        data = np.mean(data, axis=1)

    # Bring the audio to the configured rate.
    if sample_rate != target_rate:
        data = resampy.resample(data, sample_rate, target_rate)

    return mel_features.log_mel_spectrogram(
        data,
        audio_sample_rate=target_rate,
        log_offset=log_offset,
        window_length_secs=window_secs,
        hop_length_secs=hop_secs,
        num_mel_bins=num_mel_bins,
        lower_edge_hertz=mel_min_hz,
        upper_edge_hertz=mel_max_hz)

예제 #12

0

파일 보기

파일: audio_features.py 프로젝트: mia2mia/emotionrecognition

def extract_alt_logmel(path_file,
                       frame_size=0.025,
                       frame_stride=0.010,
                       normalize=True):
    """Extract log-mel features from a WAV file.

    Uses the ``log_mel_spectrogram`` helper (from the Google AudioSet /
    VGGish code, per the original author's note, which also states that it
    applies a Hann window rather than a Hamming window).

    Args:
        path_file: Path of the WAV file, read with ``wavfile.read``.
        frame_size: Analysis window length in seconds.
        frame_stride: Hop between analysis windows in seconds.
        normalize: When true, subtract the per-column mean (taken over
            axis 0) plus ``1e-8`` from the features.

    Returns:
        The (optionally mean-normalized) log-mel features.
    """
    sample_rate, signal = wavfile.read(path_file)

    # NOTE(review): log_offset is 0.0 and the raw samples are passed as read
    # -- confirm that all-zero frames cannot produce log(0) here.
    features = log_mel_spectrogram(
        signal,
        audio_sample_rate=sample_rate,
        log_offset=0.0,
        window_length_secs=frame_size,
        hop_length_secs=frame_stride)

    if not normalize:
        return features

    # In-place subtraction, as in the original.
    features -= (np.mean(features, axis=0) + 1e-8)
    return features

예제 #13

0

파일 보기

파일: vggish_input.py 프로젝트: GZYZG/Audio-Detectioin

def wavedata_to_log_melspectrogram(wav_data, sample_rate):
    """Convert 16-bit PCM samples into a log-mel spectrogram.

    Args:
        wav_data: ``np.int16`` array of samples. Two-dimensional input is
            averaged over axis 1 to obtain a single channel.
        sample_rate: Sample rate of ``wav_data``. Audio at any other rate
            than ``vggish_params.SAMPLE_RATE`` is resampled with
            ``resampy``.

    Returns:
        The value returned by ``mel_features.log_mel_spectrogram`` (the
        spectrogram is not framed into examples here).

    Raises:
        AssertionError: If ``wav_data`` is not ``np.int16``.
    """
    assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype

    # Scale 16-bit PCM so the samples fall in [-1.0, +1.0].
    samples = wav_data / 32768.0
    if samples.ndim > 1:
        samples = np.mean(samples, axis=1)

    # Bring the audio to the rate assumed by VGGish.
    target_rate = vggish_params.SAMPLE_RATE
    if sample_rate != target_rate:
        samples = resampy.resample(samples, sample_rate, vggish_params.SAMPLE_RATE)

    return mel_features.log_mel_spectrogram(
        samples,
        audio_sample_rate=vggish_params.SAMPLE_RATE,
        log_offset=vggish_params.LOG_OFFSET,
        window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=vggish_params.NUM_MEL_BINS,
        lower_edge_hertz=vggish_params.MEL_MIN_HZ,
        upper_edge_hertz=vggish_params.MEL_MAX_HZ)

예제 #14

0

파일 보기

파일: vggish_input_mod.py 프로젝트: YehSweeKhim/Open-Set-Audio-Classification

def waveform_to_examples(data, sample_rate):
    """Convert an audio waveform into a single resized log-mel example.

    Instead of slicing the spectrogram into windows, the whole log-mel
    spectrogram is resized with ``torchvision.transforms.Resize((96, 64))``
    and returned as one example.

    Args:
      data: Array of audio samples. Two-dimensional input is averaged over
        axis 1 to obtain a single channel.
      sample_rate: Sample rate of data. Audio at any other rate than
        vggish_params.SAMPLE_RATE is resampled with resampy.

    Returns:
      np.array containing exactly one element: the resized spectrogram.
    """
    signal = data
    if len(signal.shape) > 1:
        signal = np.mean(signal, axis=1)
    # Bring the audio to the rate assumed by VGGish.
    if sample_rate != vggish_params.SAMPLE_RATE:
        signal = resampy.resample(signal, sample_rate,
                                  vggish_params.SAMPLE_RATE)

    spectrogram = mel_features.log_mel_spectrogram(
        signal,
        audio_sample_rate=vggish_params.SAMPLE_RATE,
        log_offset=vggish_params.LOG_OFFSET,
        window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=vggish_params.NUM_MEL_BINS,
        lower_edge_hertz=vggish_params.MEL_MIN_HZ,
        upper_edge_hertz=vggish_params.MEL_MAX_HZ)

    # Treat the spectrogram as an image so it can be resized to a fixed
    # size, then convert it back to an array.
    resize_to_patch = torchvision.transforms.Resize((96, 64))
    patch = np.asarray(resize_to_patch(Image.fromarray(spectrogram)))

    # Wrap the single patch in an outer dimension of length one.
    return np.array([patch])

예제 #15

0

파일 보기

def wavfile_to_examples(wav_file):
    """Read a WAV file and turn it into zero-padded log-mel examples.

    The log-mel spectrogram is zero-padded at the end so that its number of
    frames is a multiple of ``params.NUM_FRAMES`` before it is framed.

    Args:
        wav_file: Path or file object accepted by ``wavfile.read``.

    Returns:
        The output of ``mel_features.frame`` for the padded spectrogram, or
        the integer ``0`` when the file contains no samples.

    Raises:
        AssertionError: If the decoded samples are not ``np.int16``.
    """
    sample_rate, wav_data = wavfile.read(wav_file)
    assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype

    # Scale 16-bit PCM so the samples fall in [-1.0, +1.0].
    samples = wav_data / 32768.0
    if len(samples.shape) > 1:
        samples = np.mean(samples, axis=1)

    # Empty audio is signalled with 0 rather than an empty array.
    if len(samples) == 0:
        return 0

    if sample_rate != params.SAMPLE_RATE:
        samples = resampy.resample(samples, sample_rate, params.SAMPLE_RATE)

    # The mel bin count is taken explicitly from params.
    spectrogram = mel_features.log_mel_spectrogram(
        samples,
        audio_sample_rate=params.SAMPLE_RATE,
        log_offset=params.LOG_OFFSET,
        window_length_secs=params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=params.NUM_MEL_BINS,
        lower_edge_hertz=params.MEL_MIN_HZ,
        upper_edge_hertz=params.MEL_MAX_HZ)

    # Express the example window and hop as a number of spectrogram frames.
    frames_per_second = 1.0 / params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(round(params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(round(params.EXAMPLE_HOP_SECONDS * frames_per_second))

    # Pad with zero rows up to the next multiple of NUM_FRAMES so the tail of
    # the spectrogram also ends up in an example.
    frame_count = spectrogram.shape[0]
    if frame_count % params.NUM_FRAMES:
        bin_count = spectrogram.shape[1]
        padded_count = int(
            np.ceil(1.0 * frame_count / params.NUM_FRAMES) * params.NUM_FRAMES)
        padded = np.zeros((padded_count, bin_count))
        padded[:frame_count, :bin_count] = spectrogram
        spectrogram = padded

    return mel_features.frame(
        spectrogram, window_length=window_frames, hop_length=hop_frames)

예제 #16

0

파일 보기

파일: vggish_input_overlap.py 프로젝트: sourav22899/segment-ser

def waveform_to_examples(data, sample_rate):
    """Converts an audio waveform into overlapping examples for VGGish.

    After mono conversion and resampling, the signal is rebuilt from chunks
    of vggish_params.SAMPLE_RATE samples taken every half that many samples,
    so consecutive chunks overlap by half. The rebuilt signal is zero-padded
    at the end and then converted to framed log-mel examples.

    Args:
      data: Array of audio samples. Two-dimensional input is averaged over
        axis 1 to obtain a single channel.
      sample_rate: Sample rate of data. Audio at any other rate than
        vggish_params.SAMPLE_RATE is resampled with resampy.

    Returns:
      A 3-tuple (olength, padded_length, log_mel_examples):
      - olength: number of samples after mono conversion and resampling,
        before the overlapping chunks are built.
      - padded_length: number of samples of the rebuilt, zero-padded signal.
      - log_mel_examples: output of mel_features.frame applied to the
        log-mel spectrogram of the rebuilt signal.
    """
    # Convert to mono.
    if len(data.shape) > 1:
        data = np.mean(data, axis=1)
    # Resample to the rate assumed by VGGish.
    if sample_rate != vggish_params.SAMPLE_RATE:
        data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)

    olength = len(data)
    chunk_length = vggish_params.SAMPLE_RATE
    chunk_stride = int(0.5 * vggish_params.SAMPLE_RATE)

    # Slicing clamps at the end of the array, so trailing chunks are simply
    # shorter. Concatenating the slices avoids building a Python list one
    # sample at a time.
    chunks = [data[start:start + chunk_length]
              for start in range(0, olength, chunk_stride)]
    # np.concatenate rejects an empty list; fall back to an empty array so
    # empty input behaves as before.
    overlapped = np.concatenate(chunks) if chunks else np.asarray([])

    pad_length = chunk_length - (len(overlapped) % chunk_stride)
    data = np.pad(overlapped, (0, pad_length), 'constant')

    # Compute log mel spectrogram features.
    log_mel = mel_features.log_mel_spectrogram(
        data,
        audio_sample_rate=vggish_params.SAMPLE_RATE,
        log_offset=vggish_params.LOG_OFFSET,
        window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=vggish_params.NUM_MEL_BINS,
        lower_edge_hertz=vggish_params.MEL_MIN_HZ,
        upper_edge_hertz=vggish_params.MEL_MAX_HZ)

    # Express the example window and hop as a number of spectrogram frames.
    features_sample_rate = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
    example_window_length = int(
        round(vggish_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate))
    example_hop_length = int(
        round(vggish_params.EXAMPLE_HOP_SECONDS * features_sample_rate))
    log_mel_examples = mel_features.frame(log_mel,
                                          window_length=example_window_length,
                                          hop_length=example_hop_length)
    return olength, len(data), log_mel_examples

예제 #17

0

파일 보기

def waveform_to_examples(data, sample_rate, file_path):
    """Converts an audio waveform into VGGish examples plus a settings record.

    Args:
      data: Array of audio samples. Two-dimensional input is averaged over
        axis 1 to obtain a single channel.
      sample_rate: Sample rate of data. Audio at any other rate than
        vggish_params.SAMPLE_RATE is resampled with resampy.
      file_path: Path of the source file; only its base name is recorded.

    Returns:
      A 2-tuple (output_csv_dict, log_mel_examples):
      - output_csv_dict: dict holding the file name, the spectrogram
        settings used and the unframed log-mel spectrogram. The dictionary
        is returned rather than written out here.
      - log_mel_examples: output of mel_features.frame applied to the
        log-mel spectrogram.
    """
    signal = data
    if len(signal.shape) > 1:
        signal = np.mean(signal, axis=1)
    # Bring the audio to the rate assumed by VGGish.
    if sample_rate != vggish_params.SAMPLE_RATE:
        signal = resampy.resample(signal, sample_rate,
                                  vggish_params.SAMPLE_RATE)

    # Read each setting once so the same values feed both the spectrogram
    # call and the returned record.
    audio_sample_rate = vggish_params.SAMPLE_RATE
    log_offset = vggish_params.LOG_OFFSET
    window_length_secs = vggish_params.STFT_WINDOW_LENGTH_SECONDS
    hop_length_secs = vggish_params.STFT_HOP_LENGTH_SECONDS
    num_mel_bins = vggish_params.NUM_MEL_BINS
    lower_edge_hertz = vggish_params.MEL_MIN_HZ
    upper_edge_hertz = vggish_params.MEL_MAX_HZ

    log_mel = mel_features.log_mel_spectrogram(
        signal,
        audio_sample_rate=audio_sample_rate,
        log_offset=log_offset,
        window_length_secs=window_length_secs,
        hop_length_secs=hop_length_secs,
        num_mel_bins=num_mel_bins,
        lower_edge_hertz=lower_edge_hertz,
        upper_edge_hertz=upper_edge_hertz)

    # Express the example window and hop as a number of spectrogram frames.
    frames_per_second = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(
        round(vggish_params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(
        round(vggish_params.EXAMPLE_HOP_SECONDS * frames_per_second))
    log_mel_examples = mel_features.frame(
        log_mel, window_length=window_frames, hop_length=hop_frames)

    # NOTE(review): upper_edge_hertz is not part of the record, exactly as
    # in the original -- confirm whether that omission is intended.
    output_csv_dict = dict(
        file_name=os.path.basename(file_path),
        audio_sample_rate=audio_sample_rate,
        log_offset=log_offset,
        window_length_secs=window_length_secs,
        hop_length_secs=hop_length_secs,
        num_mel_bins=num_mel_bins,
        lower_edge_hertz=lower_edge_hertz,
        log_mel=log_mel,
    )

    return output_csv_dict, log_mel_examples