Example #1
    def collect_data_for_one_sample(self, sample_name):
        # Get root dirs
        unprocessed_dir = os.path.join(self.data_root_dir, 'mix_clean')
        groundtruth_spk0_dir = os.path.join(self.data_root_dir, 's1')
        groundtruth_spk1_dir = os.path.join(self.data_root_dir, 's2')

        # Get full file paths
        unprocessed_file = os.path.join(unprocessed_dir, sample_name + '.wav')
        groundtruth_spk0_file = os.path.join(groundtruth_spk0_dir, sample_name + '.wav')
        groundtruth_spk1_file = os.path.join(groundtruth_spk1_dir, sample_name + '.wav')

        # Collect signals
        unprocessed, _ = load(unprocessed_file, sr=self.samplerate_hz)
        groundtruth_spk0, _ = load(groundtruth_spk0_file, sr=self.samplerate_hz)
        groundtruth_spk1, _ = load(groundtruth_spk1_file, sr=self.samplerate_hz)

        # Determine start point of segment
        if unprocessed.size > self.samples_per_utterance:
            max_shift = unprocessed.size - self.samples_per_utterance
            start_point = np.random.randint(max_shift)
        else:
            start_point = 0

        # Cut out the segment
        unprocessed = fix_length(unprocessed[start_point:], self.samples_per_utterance)
        groundtruth_spk0 = fix_length(groundtruth_spk0[start_point:], self.samples_per_utterance)
        groundtruth_spk1 = fix_length(groundtruth_spk1[start_point:], self.samples_per_utterance)

        groundtruth = np.stack([groundtruth_spk0, groundtruth_spk1], axis=0)

        return unprocessed, groundtruth
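For reference, fix_length either zero-pads (by default) or truncates along the chosen axis so the output matches the requested size exactly. A minimal standalone sketch, passing the target via the size keyword (accepted by both older and newer librosa):

import numpy as np
from librosa.util import fix_length

x = np.arange(5, dtype=np.float32)
padded = fix_length(x, size=8)    # zero-padded on the right to length 8
trimmed = fix_length(x, size=3)   # truncated to length 3
assert np.array_equal(padded, np.concatenate([x, np.zeros(3, dtype=np.float32)]))
assert np.array_equal(trimmed, x[:3])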
Example #2
 def separate_single_mixture(self, mixture):
     original_length = mixture.size
     mixture_padded = fix_length(mixture, self.max_length)
     speaker_signals_padded = self.tasnet.model.predict(
         np.expand_dims(mixture_padded, axis=0))
     speaker_signals = fix_length(speaker_signals_padded[0, :, :],
                                  original_length,
                                  axis=1)
     return speaker_signals
Example #3
    def transform_audio(self, y):
        '''Compute the Mel spectrogram

        Parameters
        ----------
        y : np.ndarray
            The audio buffer

        Returns
        -------
        data : dict
            data['mag'] : np.ndarray, shape=(n_frames, n_mels)
                The Mel spectrogram
        '''
        n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

        mel = np.sqrt(melspectrogram(y=y, sr=self.sr,
                                     n_fft=self.n_fft,
                                     hop_length=self.hop_length,
                                     n_mels=self.n_mels,
                                     fmax=self.fmax)).astype(np.float32)

        mel = fix_length(mel, n_frames)

        if self.log:
            mel = amplitude_to_db(mel, ref=np.max)

        return {'mag': mel.T[self.idx]}
Example #4
def resample(y,
             orig_sr,
             target_sr,
             res_type='kaiser_best',
             fix=True,
             scale=False,
             **kwargs):
    # First, validate the audio buffer
    util.valid_audio(y, mono=False)

    if orig_sr == target_sr:
        return y

    ratio = float(target_sr) / orig_sr

    n_samples = int(np.ceil(y.shape[-1] * ratio))

    if res_type == 'scipy':
        y_hat = scipy.signal.resample(y, n_samples, axis=-1)
    else:
        y_hat = resampy.resample(y,
                                 orig_sr,
                                 target_sr,
                                 filter=res_type,
                                 axis=-1)

    if fix:
        y_hat = util.fix_length(y_hat, n_samples, **kwargs)

    if scale:
        y_hat /= np.sqrt(ratio)

    return np.ascontiguousarray(y_hat, dtype=y.dtype)
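A hedged usage sketch of the resample function above, with a toy noise input; it assumes the function's own imports (numpy, scipy, resampy, librosa's util) are in scope, and the default res_type='kaiser_best' path needs resampy installed:

import numpy as np

y = np.random.randn(22050).astype(np.float32)  # one second of noise at 22.05 kHz
y_16k = resample(y, 22050, 16000)
# With fix=True (the default), the output is padded/trimmed to exactly ceil(len(y) * ratio)
assert y_16k.shape[-1] == int(np.ceil(22050 * 16000 / 22050))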
Example #5
    def transform_audio(self, y):
        '''Compute the STFT magnitude and phase.

        Parameters
        ----------
        y : np.ndarray
            The audio buffer

        Returns
        -------
        data : dict
            data['mag'] : np.ndarray, shape=(n_frames, 1 + n_fft//2)
                STFT magnitude

            data['phase'] : np.ndarray, shape=(n_frames, 1 + n_fft//2)
                STFT phase
        '''
        n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

        D = stft(y, hop_length=self.hop_length, n_fft=self.n_fft)

        D = fix_length(D, n_frames)

        mag, phase = magphase(D)
        if self.log:
            mag = amplitude_to_db(mag, ref=np.max)

        return {
            'mag': mag.T[self.idx].astype(np.float32),
            'phase': np.angle(phase.T)[self.idx].astype(np.float32)
        }
Example #6
def to_stft(seq, nfft):
    """
	:param seq:  Raw audio
	:param nfft: parameter of STFT
	:return: STFT of the input seq, broken down into magnitude in one channel and phase in the other.
	"""
    nfft_padlen = int(len(seq) + nfft / 2)
    stft = lc.stft(fix_length(seq, nfft_padlen), n_fft=nfft)
    return np.array([np.abs(stft), np.angle(stft)]).transpose(1, 2, 0)
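A quick shape check for to_stft above, using a toy one-second signal; it assumes lc is librosa and, as written, an older librosa where fix_length takes the target length positionally:

import numpy as np

seq = np.random.randn(16000).astype(np.float32)
img = to_stft(seq, nfft=512)
# Shape is (1 + nfft//2 bins, n_frames, 2): channel 0 is magnitude, channel 1 is phase
assert img.shape[0] == 1 + 512 // 2 and img.shape[2] == 2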
Example #7
 def __getitem__(self, index):
     file_path, class_label = self.file_names[index]
     (seq, _) = load(file_path, sr=self.sample_rate, mono=True)
     seq = fix_length(seq, size=self.audio_length, mode='edge')
     return torch.cat((
         torch.LongTensor([class_label]),
         # torch.LongTensor(self.overlap_len)
         #      .fill_(utils.q_zero(self.q_levels)),
         utils.linear_quantize(torch.from_numpy(seq), self.q_levels)))
Example #8
def to_image(seq, nfft):
    '''Compute the spectrogram of a sequence seq.

    Returns
    -------
    np.ndarray
        Magnitude (abs) and phase (angle) of the spectrogram,
        stacked along the last axis.
    '''
    nfft_padlen = int(len(seq) + nfft / 2)
    stft = lc.stft(fix_length(seq, nfft_padlen), n_fft=nfft)
    return np.array([np.abs(stft), np.angle(stft)]).transpose(1, 2, 0)
Example #9
 def get_annotations(self, file_name, features, time_resolution):
     label_file = self.wav_to_labels[file_name]
     labels = read_csv(label_file, delimiter='\t', header=None)
     labels.columns = ['event_onset', 'event_offset', 'event_label']
     event_roll = event_list_to_event_roll(labels.to_dict('records'),
                                           self.label_list, time_resolution)
     if event_roll.shape[0] > features.shape[0]:
         event_roll = event_roll[:len(features)]
     else:
         event_roll = fix_length(event_roll, features.shape[0], axis=0)
     assert event_roll.shape[0] == features.shape[0]
     return event_roll
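The same pad-or-trim idiom extends to multi-dimensional arrays through the axis argument, which is how the event roll above is aligned to the feature frames. A minimal sketch:

import numpy as np
from librosa.util import fix_length

event_roll = np.ones((90, 10))                         # 90 frames, 10 event classes
event_roll = fix_length(event_roll, size=100, axis=0)  # zero-pad along time to 100 frames
assert event_roll.shape == (100, 10)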
Example #10
def predict(file):

    label_encoder = LabelEncoder()
    label_encoder.classes_ = np.load(os.path.join(outdir, encoder_filename))

    with open(os.path.join(outdir, model_filename), "r") as model_json_handle:
        model_json = model_json_handle.read()

    model = model_from_json(model_json)

    model.load_weights(os.path.join(outdir, model_weights_filename))

    model.compile(loss='categorical_crossentropy',
                  metrics=['accuracy'],
                  optimizer='adam')

    file_path = os.path.join(os.getcwd(), file)

    y, sr = librosa.load(file_path, res_type='kaiser_fast')

    prediction_feature = np.array([get_mfcc(y, sr)])

    predicted_proba_vector = model.predict_proba(prediction_feature)
    predicted_proba = predicted_proba_vector[0]

    fixed_size = 44100

    centroid = spectral_centroid(y=y, sr=sr)
    frequency = np.average(centroid)

    centroid = fix_length(centroid, size=fixed_size)

    length = librosa.get_duration(y=y, sr=sr)

    result = {
        'file_path': file_path,
        'classes': {},
        'position': {
            'frequency': frequency,
            'length': length
        }
    }

    for i in range(len(predicted_proba)):
        category = label_encoder.inverse_transform(np.array([i]))
        result['classes'][category[0]] = format(predicted_proba[i], '.32f')

    return result
Example #11
    def getClassSplit(self, class_num=0, seq_len=64):
        file_path = ''
        for key in self.class_mapping:
            if self.class_mapping[key] == class_num:
                file_path = join(self.path, key)
        files = listdir(file_path)
        result = None

        pick_one = files[np.random.randint(0, len(files))]
        pick_one = join(file_path, pick_one)
        seq, _ = load(pick_one, sr=self.sample_rate, mono=True)
        seq = fix_length(seq, size=self.audio_length, mode='edge')

        while result is None or len(result) != seq_len:
            # Random start index such that a full seq_len window fits within seq
            start_idx = np.random.randint(0, len(seq) - seq_len + 1)
            result = seq[start_idx:start_idx + seq_len]

        return result
Example #12
    def transform_audio(self, y):
        '''Compute the HCQT

        Parameters
        ----------
        y : np.ndarray
            The audio buffer

        Returns
        -------
        data : dict
            data['mag'] : np.ndarray, shape = (n_frames, n_bins, n_harmonics)
                The CQT magnitude

            data['dphase'] : np.ndarray, shape = mag.shape
                The CQT phase differential
        '''
        cqtm, phase = [], []

        n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

        for h in self.harmonics:
            C = cqt(y=y,
                    sr=self.sr,
                    hop_length=self.hop_length,
                    fmin=self.fmin * h,
                    n_bins=(self.n_octaves * self.over_sample * 12),
                    bins_per_octave=(self.over_sample * 12))

            C = fix_length(C, n_frames)

            C, P = magphase(C)
            if self.log:
                C = amplitude_to_db(C, ref=np.max)
            cqtm.append(C)
            phase.append(P)

        cqtm = to_dtype(np.asarray(cqtm), self.dtype)
        phase = np.angle(np.asarray(phase))

        dphase = to_dtype(phase_diff(self._index(phase), self.conv),
                          self.dtype)

        return {'mag': self._index(cqtm), 'dphase': dphase}
Example #13
    def transform_audio(self, y, func):
        '''Compute the transform

        Parameters
        ----------
        y : np.ndarray
            The audio buffer

        func : callable
            The transform to apply to y (e.g., cqt, as in the disabled call below)

        Returns
        -------
        data : dict
            data['mag'] : np.ndarray, shape = (n_frames, n_bins)
                The CQT magnitude

            data['phase']: np.ndarray, shape = mag.shape
                The CQT phase
        '''
        n_frames = self.n_frames(get_duration(y=y, sr=self.sr))
        '''C = cqt(y=y, sr=self.sr, hop_length=self.hop_length,
                fmin=self.fmin,
                n_bins=(self.n_octaves * self.over_sample * 12),
                bins_per_octave=(self.over_sample * 12))'''

        C = func(y=y,
                 sr=self.sr,
                 hop_length=self.hop_length,
                 fmin=self.fmin,
                 n_bins=(self.n_octaves * self.over_sample * 12),
                 bins_per_octave=(self.over_sample * 12))

        C = fix_length(C, n_frames)

        cqtm, phase = magphase(C)
        if self.log:
            cqtm = amplitude_to_db(cqtm, ref=np.max)

        return {
            'mag': cqtm.T.astype(np.float32)[self.idx],
            'phase': np.angle(phase).T.astype(np.float32)[self.idx]
        }
Example #14
def loadTransform(audioChunk):
    if audioChunk.endswith(".wav"):
        windowNum = 2
        sound = load(audioChunk, sr=16000)
        window = windowNum * 16000  # hop of 2 seconds at 16 kHz
        X = []
        # Slice 5-second chunks (80000 samples) with a 2-second hop
        a, b = 0, 80000
        while len(sound[0]) - b >= 0:
            X.append(sound[0][a:b])
            a += window
            b += window
        # Keep the trailing chunk (possibly partial), zero-padded to the full 5 seconds
        X.append(sound[0][a:b])
        X[-1] = fix_length(X[-1], 80000)
        os.remove(audioChunk)
        X = np.array(X)
        return X
    else:
        newFile = convertFile(audioChunk)
        try:
            X = loadTransform(newFile)
            return X
        except FileNotFoundError:
            return "Error with processing your request"
Example #15
    def transform_audio(self, y):
        '''Compute the tempogram

        Parameters
        ----------
        y : np.ndarray
            Audio buffer

        Returns
        -------
        data : dict
            data['tempogram'] : np.ndarray, shape=(n_frames, win_length)
                The tempogram
        '''
        n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

        tgram = tempogram(y=y,
                          sr=self.sr,
                          hop_length=self.hop_length,
                          win_length=self.win_length).astype(np.float32)

        tgram = fix_length(tgram, n_frames)
        return {'tempogram': tgram.T[self.idx]}
Example #16
def hcqt(y,
         sr=22050,
         hop_size=256,
         fmin=32.7,
         bins_per_octave=60,
         n_octaves=6,
         harmonics=(0.5, 1, 2, 3, 4, 5)):
    """
    Harmonic CQT. Compute CQT at harmonics of `fmin`. See librosa for cqt params.
    """

    cqt_mag, cqt_phase = [], []

    n_frames = time_to_frames(get_duration(y=y, sr=sr),
                              sr=sr,
                              hop_length=hop_size)

    for h in harmonics:
        y_cqt = cqt(y=y,
                    sr=sr,
                    hop_length=hop_size,
                    fmin=fmin * h,
                    n_bins=n_octaves * bins_per_octave,
                    bins_per_octave=bins_per_octave,
                    res_type='kaiser_best')

        y_cqt = fix_length(y_cqt, n_frames)

        y_cqt_mag, y_cqt_phase = magphase(y_cqt)

        cqt_mag.append(y_cqt_mag)
        cqt_phase.append(y_cqt_phase)

    cqt_mag = np.asarray(cqt_mag).astype(np.float32)
    cqt_phase = np.angle(np.asarray(cqt_phase)).astype(np.float32)

    return cqt_mag, cqt_phase
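An illustrative call of the hcqt helper above, on toy noise. As written, it relies on an older librosa where fix_length takes the target positionally; the 0.5 sub-harmonic is skipped here so this short input stays above the CQT's minimum-length requirement for the lowest filters:

import numpy as np

y = np.random.randn(5 * 22050).astype(np.float32)  # five seconds of noise
mag, phase = hcqt(y, harmonics=(1, 2, 3))
# One (n_bins, n_frames) slice per harmonic, all fixed to a common frame count
assert mag.shape[0] == 3 and mag.shape[1] == 6 * 60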
Example #17
fertility_griffin = '/home/jan/synthesized_audio/fertility_griffin'

# TODO:
# Load each file, find the max length, pad the remaining files to that length, then save.
for f in files:
    dv = load(os.path.join(deepvoice, f))[0]
    et = load(os.path.join(efficient_tts, f))[0]
    t2 = load(os.path.join(tacotron2, f))[0]
    fert = load(os.path.join(fertility, f))[0]
    fert_grif = load(os.path.join(fertility_griffin, f))[0]
    ref = load(os.path.join(reference, f))[0]

    max_len = max(len(dv), len(et), len(t2), len(ref), len(fert),
                  len(fert_grif))

    dv = fix_length(dv, max_len)[:, None].repeat(2, axis=1)
    et = fix_length(et, max_len)  # [:, None].repeat(2, axis=1)
    t2 = fix_length(t2, max_len)[:, None].repeat(2, axis=1)
    ref = fix_length(ref, max_len)[:, None].repeat(2, axis=1)
    fert = fix_length(fert, max_len)[:, None].repeat(2, axis=1)
    fert_grif = fix_length(fert_grif, max_len)[:, None].repeat(2, axis=1)

    sf.write(os.path.join(deepvoice, f), dv, 22050, subtype='PCM_16')
    sf.write(os.path.join(efficient_tts, f), et, 22050, subtype='PCM_16')
    sf.write(os.path.join(tacotron2, f), t2, 22050, subtype='PCM_16')
    sf.write(os.path.join(reference, f), ref, 22050, subtype='PCM_16')
    sf.write(os.path.join(fertility, f), fert, 22050, subtype='PCM_16')
    sf.write(os.path.join(fertility_griffin, f),
             fert_grif,
             22050,
             subtype='PCM_16')