Example #1
def predict(self, path):
    self.eval()  # switch to eval mode for inference
    # str.find() returns -1 (truthy) when the substring is absent and 0
    # (falsy) when it starts the string, so the original test was buggy;
    # endswith() expresses the intent.
    if path.endswith('.wav'):
        wav = feature.load_audio(wav_path=path, wav=None)
    else:
        wav = feature.load_audio(wav=path)
    spec = feature.spectrogram(wav)
    spec.unsqueeze_(0)  # add a batch dimension: (1, freq, time)
    x_lens = spec.size(-1)  # input length (unused below)
    out = self.cnn(spec)
    out_len = torch.tensor([out.size(-1)])  # CNN output sequence length
    text = self.decode(out, out_len)
    self.train()  # restore training mode
    return text[0]
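A minimal usage sketch for a method like this; the model class and checkpoint names below are placeholders, not taken from the original project:

import torch

model = SpeechModel()  # hypothetical model class exposing predict()
model.load_state_dict(torch.load('model.pt', map_location='cpu'))
print(model.predict('sample.wav'))  # decoded transcript string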
Example #2
def __getitem__(self, index):
    wav, transcript = self.idx[index]
    # dataPath is assumed to be defined elsewhere in the source module.
    wav = feature.load_audio(os.path.join(dataPath, wav))
    spect = feature.spectrogram(wav)
    # Encode each character as a label ID; filter(None, ...) drops
    # characters missing from the map (labels.get returns None).
    transcript = list(
        filter(None, [self.labels.get(x) for x in transcript]))
    return spect, transcript
Example #3
    def __getitem__(self, index):
        wav, transcript = self.idx[index]
        wav = feature.load_audio(wav)
        spect = feature.spectrogram(wav)
        # Encode the transcript as label IDs, dropping characters missing
        # from the vocabulary (vocabulary.get returns None for them).
        transcript = list(
            filter(None, [self.vocabulary.get(x) for x in transcript]))

        return spect, transcript
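Since each item is a variable-length (spectrogram, transcript) pair, datasets like these are usually wrapped in a DataLoader with a custom collate function. A minimal sketch, assuming spectrograms shaped (freq, time) and integer-encoded transcripts:

import torch
from torch.utils.data import DataLoader

def collate(batch):
    # Zero-pad spectrograms along time to the longest one in the batch.
    specs = [s for s, _ in batch]
    trans = [torch.tensor(t, dtype=torch.long) for _, t in batch]
    x_lens = torch.tensor([s.size(-1) for s in specs])
    y_lens = torch.tensor([len(t) for t in trans])
    padded = torch.zeros(len(specs), specs[0].size(0), int(x_lens.max()))
    for i, s in enumerate(specs):
        padded[i, :, :s.size(-1)] = s
    targets = torch.cat(trans)  # flat target layout, as CTC losses accept
    return padded, targets, x_lens, y_lens

# loader = DataLoader(dataset, batch_size=8, collate_fn=collate)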
Example #4
def predict(self, path):
    self.eval()  # switch to eval mode for inference
    wav = feature.load_audio(path)
    spec = feature.spectrogram(wav)
    spec.unsqueeze_(0)  # add a batch dimension: (1, freq, time)
    x_lens = spec.size(-1)  # input length (unused below)
    out = self.cnn(spec)
    out_len = torch.tensor([out.size(-1)])  # CNN output sequence length
    text = self.decode(out, out_len)
    self.train()  # restore training mode
    return text[0]
Example #5
import torch
import torch.nn.functional as F

def predict(f):
    # model, decoder, and translate are module-level globals in the source script.
    wav = feature.load_audio(f)
    spec = feature.spectrogram(wav)
    spec.unsqueeze_(0)  # add a batch dimension
    with torch.no_grad():
        y = model.cnn(spec)
        y = F.softmax(y, 1)  # per-frame label probabilities
    y_len = torch.tensor([y.size(-1)])
    y = y.permute(0, 2, 1)  # B * T * V, the layout the beam decoder expects
    print("decoding")
    out, score, offset, out_len = decoder.decode(y, y_len)
    # Best-scoring beam of the first (and only) batch item.
    return translate(model.vocabulary, out[0][0], out_len[0][0])
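The four return values of decode match the interface of beam-search decoders such as ctcdecode's CTCBeamDecoder (beam results, scores, timestep offsets, and per-beam lengths). The translate helper is not shown in this snippet; a plausible sketch, assuming vocabulary maps integer IDs to characters:

def translate(vocab, ids, length):
    # ids: 1-D tensor of label IDs for one beam; length: its valid prefix.
    return ''.join(vocab[int(i)] for i in ids[:int(length)])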
Example #6
def main():
    file_path = 'music_data/shortName.flac'
    y = f.load_audio(file_path)

    # COMPUTE SPECTROGRAM
    # (the arguments are presumably FFT size, hop length, and mel-band count)
    log_mel_spectrogram = f.compute_spectrogram(y, 2048, 1024, 40)

    # COMPUTE ONSET DETECTION FUNCTION
    # skip normalization
    # norm_spectrogram = o.normalize_frequencies(log_mel_spectrogram)
    odf = o.compute_odf(log_mel_spectrogram)  # o.compute_odf(norm_spectrogram)

    # DETECT ONSETS
    peaks = o.apply_threshold(odf, 1500)
    print(odf)
    # print(peaks)
    for i, p in enumerate(peaks):
        if p > 0:
            # Convert frame index to seconds, assuming 100 ODF frames per second.
            print(i / 100, '   ', p)
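The f and o helpers are project modules whose internals are not shown here. As a rough, self-contained illustration of the technique the comments describe (a spectral-flux onset detection function followed by fixed-threshold peak picking), assuming a NumPy array of shape (n_bands, n_frames):

import numpy as np

def compute_odf(log_mel):
    # Spectral flux: sum the positive frame-to-frame energy increases.
    diff = np.diff(log_mel, axis=1)
    return np.maximum(diff, 0.0).sum(axis=0)

def apply_threshold(odf, thr):
    # Keep local maxima above the threshold; zero everything else.
    peaks = np.zeros_like(odf)
    for i in range(1, len(odf) - 1):
        if odf[i] > thr and odf[i] >= odf[i - 1] and odf[i] >= odf[i + 1]:
            peaks[i] = odf[i]
    return peaks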
Example #7
    train_file = os.path.join(evaluation_setup_folder,
                              'street_fold{}_train.txt'.format(1))
    evaluate_file = os.path.join(evaluation_setup_folder,
                                 'street_fold{}_evaluate.txt'.format(1))
    desc_dict = feature.load_desc_file(train_file, __class_labels)  # build the label dict
    desc_dict.update(feature.load_desc_file(
        evaluate_file,
        __class_labels))  # now contains labels for all the audio in the dataset

    # Up to this point, labels are stored for the first fold only.

    # Extract features for all audio files and save them along with the labels.
    for audio_filename in os.listdir(audio_folder):
        audio_file = os.path.join(audio_folder, audio_filename)
        print('Extracting features and label for: {}'.format(audio_file))
        y, sr = feature.load_audio(
            audio_file, mono=is_mono,
            fs=sr)  # y = audio data [shape=(signal_length, channel)]
        mbe = None

        # Extract mel band energies for mono or binaural audio.
        if is_mono:
            mbe = feature.extract_mbe(y, sr, nfft, nb_mel_bands).T
        else:
            for ch in range(y.shape[0]):  # extract MBE for each channel
                mbe_ch = feature.extract_mbe(y[ch, :], sr, nfft,
                                             nb_mel_bands).T
                if mbe is None:
                    mbe = mbe_ch
                else:
                    mbe = np.concatenate((mbe, mbe_ch), 1)