Example #1
 def check_model_for_regression(self, modelparams, filenames):
     audios = []
     srs = []
     for filename in filenames:
         audio, sr = sf.read(filename)
         audios.append(audio)
         srs.append(sr)
     n = len(filenames)
     embeddings0, ts0 = openl3.get_audio_embedding(audios,
                                                   srs,
                                                   batch_size=32,
                                                   **modelparams)
     embeddings1, ts1 = openl3.get_audio_embedding(audios,
                                                   srs,
                                                   batch_size=32,
                                                   **modelparams)
     # This is just a sanity check that openl3
     # gives consistent results; we can remove
     # it later.
     for i in range(n):
         assert np.mean(np.abs(embeddings1[i] - embeddings0[i])) <= 1e-6
         assert np.mean(np.abs(ts1[i] - ts0[i])) <= 1e-6
     embeddings2, ts2 = torchopenl3.get_audio_embedding(audios,
                                                        srs,
                                                        batch_size=32,
                                                        **modelparams)
     # We loosen the comparison tolerance because kapre (used by openl3)
     # and nnAudio (used by torchopenl3) produce a larger mean error.
     # We expect much closer agreement once the model is pretrained.
     for i in range(n):
         assert np.mean(np.abs(embeddings1[i] - embeddings2[i])) <= 2
         assert np.mean(np.abs(ts1[i] - ts2[i])) <= 2
Example #2
import soundfile as sf
import openl3


def feature_extraction_l3(file_name):
    audio, sr = sf.read(file_name)
    emb, ts = openl3.get_audio_embedding(audio,
                                         sr,
                                         content_type='env',
                                         embedding_size=512)
    return emb
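A quick usage sketch for the function above (the file name is a hypothetical placeholder): get_audio_embedding returns one embedding per analysis frame, so a common follow-up is mean-pooling over time to obtain a single clip-level vector.

emb = feature_extraction_l3('example_clip.wav')  # hypothetical input file
clip_vector = emb.mean(axis=0)  # (512,) clip-level embedding
print(clip_vector.shape)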
Example #3
    def check_model_for_regression(self, modelparams, filenames):
        audios = []
        srs = []
        for filename in filenames:
            audio, sr = sf.read(filename)
            audios.append(audio)
            srs.append(sr)
        n = len(filenames)
        embeddings0, ts0 = openl3.get_audio_embedding(audios,
                                                      srs,
                                                      batch_size=32,
                                                      **modelparams)
        embeddings1, ts1 = openl3.get_audio_embedding(audios,
                                                      srs,
                                                      batch_size=32,
                                                      **modelparams)

        # This is just a sanity check that openl3
        # gives consistent results; we can remove
        # it later.
        for i in range(n):
            assert embeddings1[i].shape == embeddings0[i].shape
            assert torch.mean(
                torch.abs(T(embeddings1[i]) - T(embeddings0[i]))) <= 1e-6
            assert torch.mean(torch.abs(T(ts1[i]) - T(ts0[i]))) <= 1e-6
        embeddings2, ts2 = torchopenl3.get_audio_embedding(audios,
                                                           srs,
                                                           batch_size=32,
                                                           sampler="resampy",
                                                           **modelparams)
        # We loosen the comparison tolerance because kapre (used by openl3)
        # and nnAudio (used by torchopenl3) produce a larger mean error.
        # We expect much closer agreement once the model is pretrained.
        for i in range(n):
            print(embeddings1[i].shape, embeddings2[i].shape)
            print(torch.mean(torch.abs(T(embeddings1[i]) - T(embeddings2[i]))))
            print(torch.mean(torch.abs(T(ts1[i]) - T(ts2[i]))))
            assert embeddings1[i].shape == embeddings2[i].shape
            assert torch.mean(
                torch.abs(T(embeddings1[i]) - T(embeddings2[i]))) <= 1e-2
            assert torch.mean(torch.abs(T(ts1[i]) - T(ts2[i]))) <= 1e-6
Example #4
    def calculate(self, file_name):
        import openl3
        audio = self.load_audio(file_name, change_sampling_rate=False)
        emb, ts = openl3.get_audio_embedding(
            audio, self.sr,
            model=self.openl3,
            hop_size=self.sequence_hop_time,
            verbose=False
        )

        return emb
Example #5
def test_get_audio_embedding_basic(input_repr, content_type, embedding_size, frontend, chirp_audio_sr):
    hop_size = 0.1
    tol = 1e-5
    # Make sure all embedding types work fine
    audio, sr = chirp_audio_sr
    emb1, ts1 = openl3.get_audio_embedding(
        audio, sr, input_repr=input_repr, content_type=content_type, embedding_size=embedding_size,
        center=True, hop_size=hop_size, verbose=True, frontend=frontend)
    assert np.all(np.abs(np.diff(ts1) - hop_size) < tol)
    assert emb1.shape[1] == embedding_size
    assert not np.any(np.isnan(emb1))
    K.clear_session()
Example #6
    def extract_feature(self, wave_fp: str):

        x, sr = sf.read(wave_fp)

        # TODO: check x.shape
        # assert (x.shape == (441000,))
        # Pad or trim a single sample so every clip is exactly 10 s long.
        if x.shape[0] == sr * 10 - 1:
            x = np.append(x, 0)
        elif x.shape[0] == sr * 10 + 1:
            x = x[:-1]
        assert (x.shape == (sr * 10, ))  # assumes the audio clips are 10 s long

        emb, ts = openl3.get_audio_embedding(x,
                                             sr,
                                             model=self.model,
                                             hop_size=self.hop_size)

        return np.expand_dims(
            emb, axis=0)  # (1, xx, embedding_size), xx=96 when hop_size=0.1
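A quick check of the frame-count arithmetic in the comment above, using the centered-framing formula exercised by the tests further down this page (the sample rate here is an arbitrary assumption):

sr = 48000  # assumed sample rate
n_frames = 1 + int((sr * 10 + sr // 2 - sr) / float(int(0.1 * sr)))
print(n_frames)  # 96, matching xx=96 when hop_size=0.1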
Example #7
classes = [
    'beach', 'bus', 'cafe/restaurant', 'car', 'city_center', 'forest_path',
    'grocery_store', 'home', 'library', 'metro_station', 'office', 'park',
    'residential_area', 'train', 'tram'
]

input_path = '../../dataset/audio/'
output = '../feat/audio/'

for clas in classes:
    os.makedirs(output + clas, exist_ok=True)  # make sure the output folder exists
    files = os.listdir(input_path + clas)
    for file in files:
        filePath = input_path + clas + '/' + file
        audio, sr = sf.read(filePath)
        #emb, ts = openl3.get_embedding(audio, sr)
        emb, ts = openl3.get_audio_embedding(audio, sr)
        outFileName = output + clas + '/' + file.split('.')[0]
        np.save(outFileName, emb)

input_path = '../../dataset/background/'
output = '../feat/background/'

for clas in classes:
    os.makedirs(output + clas, exist_ok=True)  # make sure the output folder exists
    files = os.listdir(input_path + clas)
    for file in files:
        filePath = input_path + clas + '/' + file
        audio, sr = sf.read(filePath)
        #emb, ts = openl3.get_embedding(audio, sr)
        emb, ts = openl3.get_audio_embedding(audio, sr)
        outFileName = output + clas + '/' + file.split('.')[0]
        np.save(outFileName, emb)
Example #8
import openl3
import soundfile as sf
# import tensorflow as tf
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

if __name__ == "__main__":

    audio, sr = sf.read('/home/hw1-a07/dcase/datasets/TAU-urban-acoustic-scenes-2020-mobile-development/audio/airport-barcelona-0-3-a.wav')
    print(sr)
    print(audio)

    model = openl3.models.load_audio_embedding_model(content_type="music", input_repr="mel256", embedding_size=512)

    emb, ts = openl3.get_audio_embedding(audio, sr, model=model, hop_size=0.1)
    print(emb, ts)

    #openl3-music-mel256-emb512-hop0_1
Example #9
def test_get_audio_embedding(chirp_audio_sr):
    hop_size = 0.1
    tol = 1e-5

    audio, sr = chirp_audio_sr
    emb1, ts1 = openl3.get_audio_embedding(audio, sr,
                                           input_repr="linear", content_type="env", embedding_size=512,
                                           center=True, hop_size=hop_size, verbose=True)
    assert np.all(np.abs(np.diff(ts1) - hop_size) < tol)
    assert emb1.shape[1] == 512
    assert not np.any(np.isnan(emb1))
    K.clear_session()

    # Make sure we can load a model and pass it in
    model = openl3.models.load_audio_embedding_model("linear", "env", 512)
    emb1load, ts1load = openl3.get_audio_embedding(audio, sr, model=model,
                                                   center=True,
                                                   hop_size=hop_size,
                                                   verbose=True)
    assert np.all(np.abs(emb1load - emb1) < tol)
    assert np.all(np.abs(ts1load - ts1) < tol)

    # Make sure that the embeddings are approximately the same with mono and stereo
    audio, sr = sf.read(CHIRP_STEREO_PATH)
    emb2, ts2 = openl3.get_audio_embedding(audio, sr, model=model,
                                           center=True, hop_size=0.1, verbose=True)

    # assert np.all(np.abs(emb1 - emb2) < tol)
    # assert np.all(np.abs(ts1 - ts2) < tol)
    assert not np.any(np.isnan(emb2))

    # Make sure that the embeddings are approximately the same if we resample the audio
    audio, sr = sf.read(CHIRP_44K_PATH)
    emb3, ts3 = openl3.get_audio_embedding(audio, sr, model=model,
                                           center=True, hop_size=0.1, verbose=True)

    # assert np.all(np.abs(emb1 - emb3) < tol)
    # assert np.all(np.abs(ts1 - ts3) < tol)
    assert not np.any(np.isnan(emb3))

    # Make sure empty audio is handled
    audio, sr = sf.read(EMPTY_PATH)
    pytest.raises(OpenL3Error, openl3.get_audio_embedding, audio, sr, model=model,
                  center=True, hop_size=0.1, verbose=True)

    # Make sure user is warned when audio is too short
    audio, sr = sf.read(SHORT_PATH)
    pytest.warns(OpenL3Warning, openl3.get_audio_embedding, audio, sr,
                 model=model, center=False, hop_size=0.1, verbose=True)

    # Make sure short audio can be handled
    emb4, ts4 = openl3.get_audio_embedding(audio, sr, model=model,
                                           center=False, hop_size=0.1, verbose=True)

    assert emb4.shape[0] == 1
    assert emb4.shape[1] == 512
    assert len(ts4) == 1
    assert ts4[0] == 0
    assert not np.any(np.isnan(emb4))

    # Make sure silence is handled
    audio, sr = sf.read(SILENCE_PATH)
    pytest.warns(OpenL3Warning, openl3.get_audio_embedding, audio, sr,
                 model=model, center=True, hop_size=0.1, verbose=True)

    emb5, ts5 = openl3.get_audio_embedding(audio, sr, model=model,
                                           center=True, hop_size=0.1, verbose=True)
    assert emb5.shape[1] == 512
    assert not np.any(np.isnan(emb5))

    # Check for centering
    audio, sr = sf.read(CHIRP_1S_PATH)
    emb6, ts6 = openl3.get_audio_embedding(audio, sr, model=model,
                                           center=True, hop_size=hop_size, verbose=True)
    n_frames = 1 + int((audio.shape[0] + sr//2 - sr) / float(int(hop_size*sr)))
    assert emb6.shape[0] == n_frames

    emb7, ts7 = openl3.get_audio_embedding(audio, sr, model=model,
                                           center=False, hop_size=hop_size, verbose=True)
    n_frames = 1 + int((audio.shape[0] - sr) / float(int(hop_size*sr)))
    assert emb7.shape[0] == n_frames

    # Check for hop size
    hop_size = 0.2
    emb8, ts8 = openl3.get_audio_embedding(audio, sr, model=model,
                                           center=False, hop_size=hop_size, verbose=True)
    n_frames = 1 + int((audio.shape[0] - sr) / float(int(hop_size*sr)))
    assert emb8.shape[0] == n_frames

    # Make sure changing verbosity doesn't break
    openl3.get_audio_embedding(audio, sr, model=model,
                               center=True, hop_size=hop_size, verbose=False)

    # Check batch processing with multiple files with a single sample rate
    audio, sr = sf.read(CHIRP_MONO_PATH)
    hop_size = 0.1
    emb_list, ts_list = openl3.get_audio_embedding([audio, audio], sr,
                                                   model=model, center=True,
                                                   hop_size=hop_size,
                                                   batch_size=4)
    n_frames = 1 + int((audio.shape[0] + sr//2 - sr) / float(int(hop_size*sr)))
    assert len(emb_list) == 2
    assert len(ts_list) == 2
    assert emb_list[0].shape[0] == n_frames
    assert np.allclose(emb_list[0], emb_list[1])
    assert np.allclose(ts_list[0], ts_list[1])

    # Check batch processing with multiple files with individually given sample rates
    emb_list, ts_list = openl3.get_audio_embedding([audio, audio], [sr, sr],
                                                   model=model,
                                                   center=True, hop_size=hop_size,
                                                   batch_size=4)
    n_frames = 1 + int((audio.shape[0] + sr//2 - sr) / float(int(hop_size*sr)))
    assert type(emb_list) == list
    assert type(ts_list) == list
    assert len(emb_list) == 2
    assert len(ts_list) == 2
    assert emb_list[0].shape[0] == n_frames
    assert np.allclose(emb_list[0], emb_list[1])
    assert np.allclose(ts_list[0], ts_list[1])

    # Check batch processing with multiple files with different sample rates
    emb_list, ts_list = openl3.get_audio_embedding([audio, audio], [sr, sr/2],
                                                   model=model,
                                                   center=True, hop_size=hop_size,
                                                   batch_size=4)
    n_frames = 1 + int((audio.shape[0] + sr//2 - sr) / float(int(hop_size*sr)))
    n_frames_2 = 1 + int((audio.shape[0] + sr//4 - sr/2) / float(int(hop_size*sr/2)))
    assert type(emb_list) == list
    assert type(ts_list) == list
    assert len(emb_list) == 2
    assert len(ts_list) == 2
    assert emb_list[0].shape[0] == n_frames
    assert emb_list[1].shape[0] == n_frames_2
    K.clear_session()

    # Make sure invalid arguments don't work
    pytest.raises(OpenL3Error, openl3.get_audio_embedding, "invalid", sr,
                  input_repr="mel256", content_type="music", embedding_size=512,
                  center=True, hop_size=0.1, verbose=True)
    pytest.raises(OpenL3Error, openl3.get_audio_embedding, audio, sr,
                  model="invalid", center=True, hop_size=0.1, verbose=True)
    pytest.raises(OpenL3Error, openl3.get_audio_embedding, audio, [sr, sr],
                  input_repr="mel256", content_type="music", embedding_size=512,
                  center=True, hop_size=0.1, verbose=True)
    pytest.raises(OpenL3Error, openl3.get_audio_embedding, audio, "invalid",
                  input_repr="mel256", content_type="music", embedding_size=512,
                  center=True, hop_size=0.1, verbose=True)
    pytest.raises(OpenL3Error, openl3.get_audio_embedding, audio, -1,
                  input_repr="mel256", content_type="music", embedding_size=512,
                  center=True, hop_size=0.1, verbose=True)
    pytest.raises(OpenL3Error, openl3.get_audio_embedding, audio, sr,
                  input_repr="invalid", content_type="music", embedding_size=512,
                  center=True, hop_size=0.1, verbose=True)
    pytest.raises(OpenL3Error, openl3.get_audio_embedding, audio, sr,
                  input_repr="mel256", content_type="invalid", embedding_size=512,
                  center=True, hop_size=0.1, verbose=True)
    pytest.raises(OpenL3Error, openl3.get_audio_embedding, audio, sr,
                  input_repr="mel256", content_type="invalid", embedding_size=42,
                  center=True, hop_size=0.1, verbose=True)
    pytest.raises(OpenL3Error, openl3.get_audio_embedding, audio, sr,
                  input_repr="mel256", content_type="music", embedding_size="invalid",
                  center=True, hop_size=0.1, verbose=True)
    pytest.raises(OpenL3Error, openl3.get_audio_embedding, audio, sr,
                  input_repr="mel256", content_type="music", embedding_size=512,
                  center=True, hop_size=0, verbose=True)
    pytest.raises(OpenL3Error, openl3.get_audio_embedding, audio, sr,
                  input_repr="mel256", content_type="music", embedding_size=512,
                  center=True, hop_size=-1, verbose=True)
    pytest.raises(OpenL3Error, openl3.get_audio_embedding, audio, sr,
                  input_repr="mel256", content_type="music", embedding_size=512,
                  center=True, hop_size=0.1, verbose=-1)
    pytest.raises(OpenL3Error, openl3.get_audio_embedding, audio, sr,
                  input_repr="mel256", content_type="music", embedding_size=512,
                  center='invalid', hop_size=0.1, verbose=True)
    pytest.raises(OpenL3Error, openl3.get_audio_embedding, np.ones((10, 10, 10)), sr,
                  input_repr="mel256", content_type="music", embedding_size=512,
                  center=True, hop_size=0.1, verbose=True)
    pytest.raises(OpenL3Error, openl3.get_audio_embedding, audio, sr,
                  frontend="invalid", center=True, hop_size=0.1, verbose=True)
Example #10
    def preprocess(self, batch_size=16):
        """
        Outputs: Writes to disk the openl3 embedding pickle object for each sample.
        Optionally, it will output the entire matched X and Y numpy pickle objects if label path is provided
        -
        """
        normalizer = Normalizer()
        tmp_output_folder = ""
        if self.video_folder.endswith(".zip"):
            # Unzips files to a temp directory
            tmp_output_folder = self.output_folder.rstrip('/') + "_tmp"
            print(f"Unzipping files to temp dir {tmp_output_folder}...")
            Path(f"{tmp_output_folder}").mkdir(parents=True, exist_ok=True)
            with zipfile.ZipFile(self.video_folder, 'r') as zip_ref:
                zip_ref.extractall(tmp_output_folder)
            print("Finished unzipping files")
        else:
            tmp_output_folder = self.video_folder
            print("Skipping unzipping files as input is a folder")

        Path(f"{self.output_folder}/audio-pickle/").mkdir(parents=True,
                                                          exist_ok=True)

        # Strip the audio from video and store as .wav file
        video_files = sorted(glob.glob(tmp_output_folder + '/*.mp4'))
        video_files_split = np.array_split(np.asarray(video_files),
                                           len(video_files) // batch_size)

        target_labels = []

        if self.label_path is not None:
            targets = []
            target_labels = np.genfromtxt(self.label_path,
                                          delimiter=' ',
                                          dtype='str')

        sr = 0
        all_x = []

        maxlen = int(self.max_len // self.hop_size + 1)

        for i in range(0, len(video_files_split)):

            audio_reads = []

            for f in video_files_split[i]:
                newname = os.path.basename(f)
                output_wav_file = newname + 'extracted_audio.wav'
                ffmpeg_extract_audio(f, "/tmp/" + output_wav_file)
                if self.label_path is not None:
                    target_index = np.where(
                        target_labels[:, 0] == newname[:-4])[0]
                    target_index = int(target_index)
                    target = int(target_labels[:, 1][target_index]) - 1
                    targets.append(target)
                audio_read, sr = sf.read("/tmp/" + output_wav_file)
                audio_reads.append(audio_read)
                print(f"Reading file {output_wav_file} ...")

            X_arr, ts_list = openl3.get_audio_embedding(audio_reads,
                                                        sr,
                                                        batch_size=15,
                                                        hop_size=self.hop_size)

            X = tf.keras.preprocessing.sequence.pad_sequences(X_arr,
                                                              maxlen=maxlen)
            X = np.asarray(X, dtype='float32')

            if i == 0:
                all_x = X
                all_x = np.asarray(all_x, dtype='float32')
            else:
                all_x = np.concatenate((all_x, X), axis=0)

            print(all_x.shape)

        all_x_norm = all_x

        for i in range(0, len(all_x_norm)):
            all_x_norm[i] = normalizer.fit_transform(all_x_norm[i])

        for i, f in enumerate(video_files):
            file_name = os.path.basename(f)
            with open(
                    f"{self.output_folder}/audio-pickle/{file_name}-openl3.pkl",
                    "wb") as f_out:
                pickle.dump(all_x_norm[i], f_out)

        if self.label_path is not None:
            with open(f"{self.output_folder}/audio-pickle-all-X-openl3.pkl",
                      "wb") as f_out:
                pickle.dump(all_x_norm, f_out)

            targets = np.asarray(targets)
            with open(f"{self.output_folder}/audio-pickle-all-Y-openl3.pkl",
                      "wb") as f_out:
                pickle.dump(targets, f_out)

        if self.output_file is not None:
            print(f"Starting to zip files to {self.output_file}")

            def zipdir(path, ziph):
                for root, dirs, files in os.walk(path):
                    folder = root[len(path):]
                    for file in files:
                        ziph.write(join(root, file), join(folder, file))

            zipf = zipfile.ZipFile(self.output_file, 'w', zipfile.ZIP_DEFLATED)
            zipdir(self.output_folder, zipf)
            zipf.close()
            print(f"Done zipping files to {self.output_file}")

        print("Done!")
Example #11
audio_tsv = []
label_tsv = []
emb_tsv = []

audio_list, sr_list = [], []
for path in tqdm(paths, desc='Loading audio files ...'):
    audio, sr = sf.read(path)
    audio_list.append(audio)
    sr_list.append(sr)
    filename = Path(path).name
    label = Path(Path(path).parent).stem
    label_tsv.append([ontology_lookup[label]['name']])
    audio_tsv.append([f'{label}/{filename}'])

emb_list, _ = openl3.get_audio_embedding(audio_list,
                                         sr_list,
                                         content_type='env',
                                         hop_size=0.5)
for emb in tqdm(emb_list, desc="Averaging embeddings ..."):
    emb = np.mean(emb, axis=0).tolist()
    emb_tsv.append(emb)

with open(f'{OUTPUTDIR}/emb.tsv', 'w') as f:
    writer = csv.writer(f, delimiter='\t')
    for emb in emb_tsv:
        writer.writerow(emb)
with open(f'{OUTPUTDIR}/label.tsv', 'w') as f:
    writer = csv.writer(f, delimiter='\t')
    for label in label_tsv:
        writer.writerow(label)
with open(f'{OUTPUTDIR}/audio.tsv', 'w') as f:
    writer = csv.writer(f, delimiter='\t')
    for audio_path in audio_tsv:
        writer.writerow(audio_path)
Example #12
base_dir = ('/green-projects/project-sonyc_redhook/workspace/share/'
            'truck_audio/redhook_truck_audio/10s')

# Count the files first so progress can be reported against the total.
total = 0
for folder in os.listdir(base_dir):
    for file in os.listdir(os.path.join(base_dir, folder)):
        total += 1

embedding_list = []  # collected (timestamp, [emb, ts]) pairs
count = 0
for folder in os.listdir(base_dir):
    if int(folder[8:10]) < 14:
        for file in os.listdir(os.path.join(base_dir, folder)):
            audio_timestamp = int(file.split(".")[0])
            audio, sr = sf.read(os.path.join(base_dir, folder, file))
            emb, ts = openl3.get_audio_embedding(audio, sr, content_type='env', embedding_size=512)
            embedding = [emb, ts]
            embedding_list.append((audio_timestamp, embedding))
            count += 1
            print('done with ' + str(count) + ' out of ' + str(total))
            sys.stdout.flush()

print('done with all embeddings')
sys.stdout.flush()

with open('embedding_list.pickle', 'wb') as f:
    pickle.dump(embedding_list, f)
print('done dumping embedding list')
sys.stdout.flush()
Example #13
    def calculate_embeddings(self,
                             max_items_per_class: Optional[int] = 1000,
                             max_classes: Optional[int] = 1000,
                             class_filter: Optional[Union[List, None]] = None,
                             model: Optional[Union[Model, None]] = None,
                             input_repr: Optional[str] = 'mel256',
                             content_type: Optional[str] = 'music',
                             embedding_size: Optional[int] = 6144,
                             center: Optional[bool] = True,
                             hop_size: Optional[float] = 0.1,
                             batch_size: Optional[int] = 32,
                             verbose: Optional[bool] = True) -> None:
        """
        Initializes the calculation process of the OpenL3 embeddings.
        :param max_items_per_class:
               To speed up the calculation process, the amount of processed
               samples can be restricted. If None, all the samples in
               self.audio_paths_by_class will be processed.

        :param max_classes:
               To speed up the calculation process, the amount of classes can
               be restricted. If None, all the found classes will be processed.

        :param class_filter:
               If not None, only classes found in the given filter list will
               be loaded. The class filter can contain superclass or subclass
               labels.

        :param model:
               A custom model for calculating the embeddings. More information
               can be found in the OpenL3 docs. (get_audio_embedding)

        :param input_repr:
               The input representation of the sample. Can be linear, mel128
               or mel256. More information can be found in the OpenL3 docs.
               (get_audio_embedding)

        :param content_type:
               The content type of the samples to be processed. Can be music or
               env. More information can be found in the OpenL3 docs.
               (get_audio_embedding)

        :param embedding_size:
               The size of the calculated embeddings. Can be 512 or 6144.
               More information can be found in the OpenL3 docs.
               (get_audio_embedding)

        :param center:
               The location of the returned timestamps. More information can be
               found in the OpenL3 docs. (get_audio_embedding)

        :param hop_size:
               The hop size used to calculate the embeddings. More information
               can be found in the OpenL3 docs. (get_audio_embedding)

        :param batch_size:
               The number of samples that are fed to the model at once. More
               information can be found in the OpenL3 docs.
               (get_audio_embedding)

        :param verbose:
               The amount of information printed on the screen during the
               calculation procedure.

        :return: None
        """

        if self.openl3settings is None:
            self.openl3settings = {
                'input_repr': input_repr,
                'content_type': content_type,
                'embedding_size': embedding_size,
                'center': center,
                'hop_size': hop_size
            }

        # Initialize embedding container
        all_embeddings = OpenL3EmbeddingPackageWrapper()

        # Class counter keeps track of how many classes have been processed.
        class_counter = 0
        for class_label in list(self.audio_paths_by_class.keys()):
            # Stop once the requested number of classes has been processed
            # (checking at the top avoids processing one extra class).
            if max_classes is not None and class_counter >= max_classes:
                break
            if class_filter is not None and class_label not in class_filter:
                self.log(f"Skipping class {class_label}")
                continue
            self.log(f"Processing class {class_label}")

            # Openl3 will process these lists to get the embeddings.
            audio_list = []
            sr_list = []

            # The package list keeps track of all the packages before being
            # added to the all_embeddings container.
            package_list = []

            # Load audio samples and respective sample rates to lists
            self.log("Loading audio...")
            counter = 0
            for audio_path in self.audio_paths_by_class[class_label]:
                # Stop once the requested number of items has been loaded
                # (checking at the top avoids loading one extra sample).
                if max_items_per_class is not None and \
                   counter >= max_items_per_class:
                    break
                audio, sr = lbr.load(audio_path, sr=None)

                # Important metadata needed to play the audio later on when
                # clicked on the plot.
                metadata = {
                    'class': class_label,
                    'sample_id': class_label + '_' + str(counter),
                    'raw_audio_path': audio_path,
                    'original_sr': sr,
                    'openl3settings': self.openl3settings
                }

                # A container package is initialized for each sample
                package = OpenL3EmbeddingPackage(embeddings=None,
                                                 timestamps=None,
                                                 metadata=metadata)

                audio_list.append(audio)
                sr_list.append(sr)
                package_list.append(package)

                counter += 1

            self.log("Computing embeddings...")

            # Here the embeddings are calculated with the OpenL3 model specified
            # in the arguments.
            emb_list, ts_list \
                = openl3.get_audio_embedding(audio_list, sr_list,
                                             model=model,
                                             input_repr=input_repr,
                                             content_type=content_type,
                                             embedding_size=embedding_size,
                                             center=center,
                                             hop_size=hop_size,
                                             batch_size=batch_size,
                                             verbose=verbose)

            for counter, embeddings in enumerate(emb_list):
                package_list[counter].set_embeddings(embeddings)
                all_embeddings.add_package(package_list[counter])

            class_counter += 1

        # The container holds all the computed embeddings.
        self.embedding_wrapper = all_embeddings
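For reference, a minimal sketch of the raw OpenL3 call this method wraps, using the same defaults documented above (the input file is a hypothetical placeholder):

import librosa as lbr
import openl3

audio, sr = lbr.load('sample.wav', sr=None)  # hypothetical input file
emb, ts = openl3.get_audio_embedding(audio, sr,
                                     input_repr='mel256',
                                     content_type='music',
                                     embedding_size=6144,
                                     center=True,
                                     hop_size=0.1,
                                     batch_size=32,
                                     verbose=True)
print(emb.shape)  # (n_frames, 6144)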
Example #14
list_of_npy_files = []
for root, dirs, files in os.walk(data_write_dir):
    path = root.split(os.sep)
    for file in files:
        if file.endswith(".npy"):
#             print(os.path.basename(os.path.join(root, file)))
            list_of_npy_files.append(file.split('.')[0])
    
# print(list_of_npy_files)
model = openl3.models.load_audio_embedding_model(input_repr="mel128", content_type="music",
                                                 embedding_size=6144)

for audio_file_path in list_of_file_paths:
    
    file_name = os.path.basename(audio_file_path)
    
    file_id = file_name.split('.')[0]
    
    if file_id not in list_of_npy_files:
        
        audio, sr = sf.read(audio_file_path)

#         emb, ts = openl3.get_audio_embedding(audio, sr, center = False)
        emb, ts = openl3.get_audio_embedding(audio, sr, center=False, model=model)
        new_emb_file_name = file_id + ".npy"
        new_emb_file_path = os.path.join(data_write_dir, new_emb_file_name)

        with open(new_emb_file_path, 'wb+') as f:
            np.save(f, emb)
    else:
        print(file_id + '.npy already exists') 
Example #15
def cal_deltas(X_in):
    X_out = (X_in[:, 2:, :] - X_in[:, :-2, :]) / 10.0
    X_out = X_out[:, 1:-1, :] + (X_in[:, 4:, :] - X_in[:, :-4, :]) / 5.0
    return X_out
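A quick shape check for cal_deltas (synthetic input; the two differencing steps trim a total of four frames from the time axis):

X = np.random.rand(1, 100, 64)  # (batch, time, features), synthetic data
print(cal_deltas(X).shape)      # (1, 96, 64)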


for i in range(len(wavpath)):
    stereo, fs = sound.read(file_path + wavpath[i], stop=SampleDuration * sr)
    #logmel_data = np.zeros((NumFreqBins, NumTimeBins, num_channel), 'float32')
    #logmel_data[:,:, 0]= librosa.feature.melspectrogram(stereo[:], sr=sr, n_fft=NumFFTPoints, hop_length=HopLength, n_mels=NumFreqBins, fmin=0.0, fmax=sr/2, htk=True, norm=None)

    emb, ts = openl3.get_audio_embedding(stereo,
                                         sr,
                                         content_type="env",
                                         input_repr="mel256",
                                         embedding_size=512,
                                         hop_size=0.02,
                                         verbose=0)

    #logmel_data = np.log(logmel_data+1e-8)

    #deltas = cal_deltas(logmel_data)
    #deltas_deltas = cal_deltas(deltas)

    #feat_data = np.concatenate((logmel_data[:,4:-4,:], deltas[:,2:-2,:], deltas_deltas), axis=2)
    feat_data = emb
    feature_data = {
        'feat_data': feat_data,
    }