def check_model_for_regression(self, modelparams, filenames):
    """Verify torchopenl3 reproduces openl3 embeddings on the given files.

    Loads every file, embeds the batch twice with openl3 (a determinism
    sanity check) and once with torchopenl3, then asserts the mean
    absolute differences stay within the expected tolerances.
    """
    loaded = [sf.read(name) for name in filenames]
    audios = [waveform for waveform, _ in loaded]
    srs = [rate for _, rate in loaded]
    n = len(filenames)

    embeddings0, ts0 = openl3.get_audio_embedding(
        audios, srs, batch_size=32, **modelparams)
    embeddings1, ts1 = openl3.get_audio_embedding(
        audios, srs, batch_size=32, **modelparams)

    # Sanity check that openl3 itself is deterministic across calls;
    # can be removed later.
    for i in range(n):
        assert np.mean(np.abs(embeddings1[i] - embeddings0[i])) <= 1e-6
        assert np.mean(np.abs(ts1[i] - ts0[i])) <= 1e-6

    embeddings2, ts2 = torchopenl3.get_audio_embedding(
        audios, srs, batch_size=32, **modelparams)

    # The tolerance is loose because the kapre (openl3) and nnAudio
    # (torchopenl3) front-ends differ; expected to tighten once the
    # model is properly pretrained/ported.
    for i in range(n):
        assert np.mean(np.abs(embeddings1[i] - embeddings2[i])) <= 2
        assert np.mean(np.abs(ts1[i] - ts2[i])) <= 2
def feature_extraction_l3(file_name):
    """Compute the 512-d OpenL3 'env' embedding for one audio file.

    Timestamps returned by openl3 are discarded; only the embedding
    matrix is returned.
    """
    waveform, sample_rate = sf.read(file_name)
    embedding, _timestamps = openl3.get_audio_embedding(
        waveform, sample_rate, content_type='env', embedding_size=512)
    return embedding
def check_model_for_regression(self, modelparams, filenames):
    """Verify torchopenl3 reproduces openl3 embeddings on the given files.

    Embeds the batch twice with openl3 (a determinism sanity check) and
    once with torchopenl3 (resampy sampler), then compares per-item
    shapes and mean absolute differences.
    """
    audios = []
    srs = []
    for filename in filenames:
        audio, sr = sf.read(filename)
        audios.append(audio)
        srs.append(sr)
    n = len(filenames)

    embeddings0, ts0 = openl3.get_audio_embedding(
        audios, srs, batch_size=32, **modelparams)
    embeddings1, ts1 = openl3.get_audio_embedding(
        audios, srs, batch_size=32, **modelparams)

    # Sanity check that openl3 itself is deterministic; can be dropped
    # later.
    for i in range(n):
        # Fixed: compare the i-th item instead of hard-coded items 0/1,
        # which crashed for fewer than two files and re-checked the same
        # pair on every iteration.
        assert embeddings1[i].shape == embeddings0[i].shape
        assert torch.mean(
            torch.abs(T(embeddings1[i]) - T(embeddings0[i]))) <= 1e-6
        assert torch.mean(torch.abs(T(ts1[i]) - T(ts0[i]))) <= 1e-6

    embeddings2, ts2 = torchopenl3.get_audio_embedding(
        audios, srs, batch_size=32, sampler="resampy", **modelparams)

    # Embedding tolerance is looser (1e-2) because the kapre (openl3)
    # and nnAudio (torchopenl3) front-ends differ slightly; expected to
    # tighten once the model is properly pretrained/ported.
    # (Leftover debug prints removed.)
    for i in range(n):
        assert embeddings1[i].shape == embeddings2[i].shape
        assert torch.mean(
            torch.abs(T(embeddings1[i]) - T(embeddings2[i]))) <= 1e-2
        assert torch.mean(torch.abs(T(ts1[i]) - T(ts2[i]))) <= 1e-6
def calculate(self, file_name):
    """Return OpenL3 embeddings for *file_name* using this object's model.

    Audio is loaded at its native sampling rate; timestamps from openl3
    are discarded.
    """
    import openl3

    waveform = self.load_audio(file_name, change_sampling_rate=False)
    embeddings, _timestamps = openl3.get_audio_embedding(
        waveform,
        self.sr,
        model=self.openl3,
        hop_size=self.sequence_hop_time,
        verbose=False,
    )
    return embeddings
def test_get_audio_embedding_basic(input_repr, content_type, embedding_size,
                                   frontend, chirp_audio_sr):
    """Every embedding-type combination produces sane output for a chirp."""
    hop = 0.1
    eps = 1e-5
    audio, sr = chirp_audio_sr

    emb, ts = openl3.get_audio_embedding(
        audio, sr,
        input_repr=input_repr,
        content_type=content_type,
        embedding_size=embedding_size,
        center=True,
        hop_size=hop,
        verbose=True,
        frontend=frontend)

    # Timestamps advance by exactly one hop, the embedding width matches
    # the requested size, and nothing is NaN.
    assert np.all(np.abs(np.diff(ts) - hop) < eps)
    assert emb.shape[1] == embedding_size
    assert not np.any(np.isnan(emb))

    K.clear_session()
def extract_feature(self, wave_fp: str):
    """Embed one ~10 s wav file with OpenL3.

    Returns the embedding with a leading batch axis:
    (1, frames, embedding_size) — e.g. frames=96 when hop_size=0.1.
    """
    samples, sr = sf.read(wave_fp)
    expected_len = sr * 10
    # Tolerate clips that are one sample short or long of exactly 10 s.
    if samples.shape[0] == expected_len - 1:
        samples = np.append(samples, 0)
    elif samples.shape[0] == expected_len + 1:
        samples = samples[:-1]
    assert (samples.shape == (expected_len, ))  # audio assumed to be 10 s
    emb, _ts = openl3.get_audio_embedding(samples, sr, model=self.model,
                                          hop_size=self.hop_size)
    return np.expand_dims(emb, axis=0)
classes = [
    'beach', 'bus', 'cafe/restaurant', 'car', 'city_center', 'forest_path',
    'grocery_store', 'home', 'library', 'metro_station', 'office', 'park',
    'residential_area', 'train', 'tram'
]


def _embed_class_folders(input_path, output):
    """Embed every file under input_path/<class> with OpenL3 and save each
    embedding to output/<class>/<file stem>.npy."""
    for clas in classes:
        files = os.listdir(input_path + clas)
        for file in files:
            filePath = input_path + clas + '/' + file
            audio, sr = sf.read(filePath)
            emb, ts = openl3.get_audio_embedding(audio, sr)
            outFileName = output + clas + '/' + file.split('.')[0]
            np.save(outFileName, emb)


# The two passes previously duplicated the loop body verbatim; they now
# share one helper.
_embed_class_folders('../../dataset/audio/', '../feat/audio/')
_embed_class_folders('../../dataset/background/', '../feat/background/')
import openl3
import soundfile as sf
# import tensorflow as tf
import os

# Force CPU execution; the GPU is hidden from TensorFlow.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

if __name__ == "__main__":
    # Sanity-run: embed one DCASE clip with the music/mel256/512 model.
    wav_path = ('/home/hw1-a07/dcase/datasets/TAU-urban-acoustic-scenes-2020-mobile-development/audio/airport-barcelona-0-3-a.wav')
    audio, sr = sf.read(wav_path)
    print(sr)
    print(audio)
    model = openl3.models.load_audio_embedding_model(
        content_type="music", input_repr="mel256", embedding_size=512)
    emb, ts = openl3.get_audio_embedding(audio, sr, model=model, hop_size=0.1)
    print(emb, ts)
    # openl3-music-mel256-emb512-hop0_1
def test_get_audio_embedding(chirp_audio_sr):
    """End-to-end checks of openl3.get_audio_embedding.

    Covers: basic output sanity, passing a preloaded model, stereo and
    resampled input, empty/short/silent audio, centering, hop size,
    verbosity, batch processing (single/per-file/mixed sample rates),
    and rejection of invalid arguments.
    """
    hop_size = 0.1
    tol = 1e-5
    audio, sr = chirp_audio_sr
    emb1, ts1 = openl3.get_audio_embedding(audio, sr, input_repr="linear",
                                           content_type="env",
                                           embedding_size=512, center=True,
                                           hop_size=hop_size, verbose=True)
    # Timestamps advance by one hop, width matches, no NaNs.
    assert np.all(np.abs(np.diff(ts1) - hop_size) < tol)
    assert emb1.shape[1] == 512
    assert not np.any(np.isnan(emb1))
    K.clear_session()

    # Make sure we can load a model and pass it in
    model = openl3.models.load_audio_embedding_model("linear", "env", 512)
    emb1load, ts1load = openl3.get_audio_embedding(audio, sr, model=model,
                                                   center=True,
                                                   hop_size=hop_size,
                                                   verbose=True)
    assert np.all(np.abs(emb1load - emb1) < tol)
    assert np.all(np.abs(ts1load - ts1) < tol)

    # Make sure that the embeddings are approximately the same with mono
    # and stereo
    audio, sr = sf.read(CHIRP_STEREO_PATH)
    emb2, ts2 = openl3.get_audio_embedding(audio, sr, model=model,
                                           center=True, hop_size=0.1,
                                           verbose=True)
    # assert np.all(np.abs(emb1 - emb2) < tol)
    # assert np.all(np.abs(ts1 - ts2) < tol)
    assert not np.any(np.isnan(emb2))

    # Make sure that the embeddings are approximately the same if we
    # resample the audio
    audio, sr = sf.read(CHIRP_44K_PATH)
    emb3, ts3 = openl3.get_audio_embedding(audio, sr, model=model,
                                           center=True, hop_size=0.1,
                                           verbose=True)
    # assert np.all(np.abs(emb1 - emb3) < tol)
    # assert np.all(np.abs(ts1 - ts3) < tol)
    assert not np.any(np.isnan(emb3))

    # Make sure empty audio is handled
    audio, sr = sf.read(EMPTY_PATH)
    pytest.raises(OpenL3Error, openl3.get_audio_embedding, audio, sr,
                  model=model, center=True, hop_size=0.1, verbose=True)

    # Make sure user is warned when audio is too short
    audio, sr = sf.read(SHORT_PATH)
    pytest.warns(OpenL3Warning, openl3.get_audio_embedding, audio, sr,
                 model=model, center=False, hop_size=0.1, verbose=True)

    # Make sure short audio can be handled (single frame at t=0)
    emb4, ts4 = openl3.get_audio_embedding(audio, sr, model=model,
                                           center=False, hop_size=0.1,
                                           verbose=True)
    assert emb4.shape[0] == 1
    assert emb4.shape[1] == 512
    assert len(ts4) == 1
    assert ts4[0] == 0
    assert not np.any(np.isnan(emb4))

    # Make sure silence is handled (warns, but still embeds)
    audio, sr = sf.read(SILENCE_PATH)
    pytest.warns(OpenL3Warning, openl3.get_audio_embedding, audio, sr,
                 model=model, center=True, hop_size=0.1, verbose=True)
    emb5, ts5 = openl3.get_audio_embedding(audio, sr, model=model,
                                           center=True, hop_size=0.1,
                                           verbose=True)
    assert emb5.shape[1] == 512
    assert not np.any(np.isnan(emb5))

    # Check for centering: centered input gains sr//2 padding in the
    # frame-count formula.
    audio, sr = sf.read(CHIRP_1S_PATH)
    emb6, ts6 = openl3.get_audio_embedding(audio, sr, model=model,
                                           center=True, hop_size=hop_size,
                                           verbose=True)
    n_frames = 1 + int((audio.shape[0] + sr//2 - sr) / float(int(hop_size*sr)))
    assert emb6.shape[0] == n_frames
    emb7, ts7 = openl3.get_audio_embedding(audio, sr, model=model,
                                           center=False, hop_size=hop_size,
                                           verbose=True)
    n_frames = 1 + int((audio.shape[0] - sr) / float(int(hop_size*sr)))
    assert emb7.shape[0] == n_frames

    # Check for hop size
    hop_size = 0.2
    emb8, ts8 = openl3.get_audio_embedding(audio, sr, model=model,
                                           center=False, hop_size=hop_size,
                                           verbose=True)
    n_frames = 1 + int((audio.shape[0] - sr) / float(int(hop_size*sr)))
    assert emb8.shape[0] == n_frames

    # Make sure changing verbosity doesn't break
    openl3.get_audio_embedding(audio, sr, model=model, center=True,
                               hop_size=hop_size, verbose=False)

    # Check batch processing with multiple files with a single sample rate
    audio, sr = sf.read(CHIRP_MONO_PATH)
    hop_size = 0.1
    emb_list, ts_list = openl3.get_audio_embedding([audio, audio], sr,
                                                   model=model, center=True,
                                                   hop_size=hop_size,
                                                   batch_size=4)
    n_frames = 1 + int((audio.shape[0] + sr//2 - sr) / float(int(hop_size*sr)))
    assert len(emb_list) == 2
    assert len(ts_list) == 2
    assert emb_list[0].shape[0] == n_frames
    assert np.allclose(emb_list[0], emb_list[1])
    assert np.allclose(ts_list[0], ts_list[1])

    # Check batch processing with multiple files with individually given
    # sample rates
    emb_list, ts_list = openl3.get_audio_embedding([audio, audio], [sr, sr],
                                                   model=model, center=True,
                                                   hop_size=hop_size,
                                                   batch_size=4)
    n_frames = 1 + int((audio.shape[0] + sr//2 - sr) / float(int(hop_size*sr)))
    assert type(emb_list) == list
    assert type(ts_list) == list
    assert len(emb_list) == 2
    assert len(ts_list) == 2
    assert emb_list[0].shape[0] == n_frames
    assert np.allclose(emb_list[0], emb_list[1])
    assert np.allclose(ts_list[0], ts_list[1])

    # Check batch processing with multiple files with different sample rates
    emb_list, ts_list = openl3.get_audio_embedding([audio, audio], [sr, sr/2],
                                                   model=model, center=True,
                                                   hop_size=hop_size,
                                                   batch_size=4)
    n_frames = 1 + int((audio.shape[0] + sr//2 - sr) / float(int(hop_size*sr)))
    n_frames_2 = 1 + int((audio.shape[0] + sr//4 - sr/2) / float(int(hop_size*sr/2)))
    assert type(emb_list) == list
    assert type(ts_list) == list
    assert len(emb_list) == 2
    assert len(ts_list) == 2
    assert emb_list[0].shape[0] == n_frames
    assert emb_list[1].shape[0] == n_frames_2
    K.clear_session()

    # Make sure invalid arguments don't work
    pytest.raises(OpenL3Error, openl3.get_audio_embedding, "invalid", sr,
                  input_repr="mel256", content_type="music",
                  embedding_size=512, center=True, hop_size=0.1, verbose=True)
    pytest.raises(OpenL3Error, openl3.get_audio_embedding, audio, sr,
                  model="invalid", center=True, hop_size=0.1, verbose=True)
    pytest.raises(OpenL3Error, openl3.get_audio_embedding, audio, [sr, sr],
                  input_repr="mel256", content_type="music",
                  embedding_size=512, center=True, hop_size=0.1, verbose=True)
    pytest.raises(OpenL3Error, openl3.get_audio_embedding, audio, "invalid",
                  input_repr="mel256", content_type="music",
                  embedding_size=512, center=True, hop_size=0.1, verbose=True)
    pytest.raises(OpenL3Error, openl3.get_audio_embedding, audio, -1,
                  input_repr="mel256", content_type="music",
                  embedding_size=512, center=True, hop_size=0.1, verbose=True)
    pytest.raises(OpenL3Error, openl3.get_audio_embedding, audio, sr,
                  input_repr="invalid", content_type="music",
                  embedding_size=512, center=True, hop_size=0.1, verbose=True)
    pytest.raises(OpenL3Error, openl3.get_audio_embedding, audio, sr,
                  input_repr="mel256", content_type="invalid",
                  embedding_size=512, center=True, hop_size=0.1, verbose=True)
    pytest.raises(OpenL3Error, openl3.get_audio_embedding, audio, sr,
                  input_repr="mel256", content_type="invalid",
                  embedding_size=42, center=True, hop_size=0.1, verbose=True)
    pytest.raises(OpenL3Error, openl3.get_audio_embedding, audio, sr,
                  input_repr="mel256", content_type="music",
                  embedding_size="invalid", center=True, hop_size=0.1,
                  verbose=True)
    pytest.raises(OpenL3Error, openl3.get_audio_embedding, audio, sr,
                  input_repr="mel256", content_type="music",
                  embedding_size=512, center=True, hop_size=0, verbose=True)
    pytest.raises(OpenL3Error, openl3.get_audio_embedding, audio, sr,
                  input_repr="mel256", content_type="music",
                  embedding_size=512, center=True, hop_size=-1, verbose=True)
    pytest.raises(OpenL3Error, openl3.get_audio_embedding, audio, sr,
                  input_repr="mel256", content_type="music",
                  embedding_size=512, center=True, hop_size=0.1, verbose=-1)
    pytest.raises(OpenL3Error, openl3.get_audio_embedding, audio, sr,
                  input_repr="mel256", content_type="music",
                  embedding_size=512, center='invalid', hop_size=0.1,
                  verbose=True)
    pytest.raises(OpenL3Error, openl3.get_audio_embedding,
                  np.ones((10, 10, 10)), sr, input_repr="mel256",
                  content_type="music", embedding_size=512, center=True,
                  hop_size=0.1, verbose=True)
    pytest.raises(OpenL3Error, openl3.get_audio_embedding, audio, sr,
                  frontend="invalid", center=True, hop_size=0.1, verbose=True)
def preprocess(self, batch_size=16):
    """
    Outputs: Writes to disk the openl3 embedding pickle object for each
    sample. Optionally, it will output the entire matched X and Y numpy
    pickle objects if label path is provided.

    :param batch_size: number of videos whose audio is extracted/read per
        chunk. NOTE(review): the openl3 call itself uses a fixed
        batch_size of 15, as in the original implementation — confirm
        whether it should track this parameter.
    """
    normalizer = Normalizer()

    # Resolve the input folder: unzip archives into "<output>_tmp",
    # otherwise use the given folder directly.
    tmp_output_folder = ""
    if self.video_folder.endswith(".zip"):
        # Unzips files to a temp directory
        tmp_output_folder = self.output_folder.rstrip('/') + "_tmp"
        print(f"Unzipping files to temp dir {tmp_output_folder}...")
        Path(f"{tmp_output_folder}").mkdir(parents=True, exist_ok=True)
        with zipfile.ZipFile(self.video_folder, 'r') as zip_ref:
            zip_ref.extractall(tmp_output_folder)
        print("Finished unzipping files")
    else:
        tmp_output_folder = self.video_folder
        print("Skipping unzipping files as input is a folder")

    Path(f"{self.output_folder}/audio-pickle/").mkdir(parents=True,
                                                      exist_ok=True)

    # Strip the audio from video and store as .wav file
    video_files = sorted(glob.glob(tmp_output_folder + '/*.mp4'))
    video_files_split = np.array_split(np.asarray(video_files),
                                       len(video_files) // batch_size)

    target_labels = []
    if self.label_path is not None:
        targets = []
        target_labels = np.genfromtxt(self.label_path, delimiter=' ',
                                      dtype='str')

    sr = 0
    all_x = []
    maxlen = int(self.max_len // self.hop_size + 1)
    for i in range(0, len(video_files_split)):
        audio_reads = []
        for f in video_files_split[i]:
            newname = os.path.basename(f)
            output_wav_file = newname + 'extracted_audio.wav'
            ffmpeg_extract_audio(f, "/tmp/" + output_wav_file)
            if self.label_path is not None:
                # Labels are 1-based in the label file; stored 0-based.
                target_index = np.where(
                    target_labels[:, 0] == newname[:-4])[0]
                target_index = int(target_index)
                target = int(target_labels[:, 1][target_index]) - 1
                targets.append(target)
            audio_read, sr = sf.read("/tmp/" + output_wav_file)
            audio_reads.append(audio_read)
            print(f"Reading file {output_wav_file} ...")
        X_arr, ts_list = openl3.get_audio_embedding(audio_reads, sr,
                                                    batch_size=15,
                                                    hop_size=self.hop_size)
        # Pad/truncate every clip to a fixed number of frames.
        X = tf.keras.preprocessing.sequence.pad_sequences(X_arr,
                                                          maxlen=maxlen)
        X = np.asarray(X, dtype='float32')
        if i == 0:
            all_x = X
            all_x = np.asarray(all_x, dtype='float32')
        else:
            all_x = np.concatenate((all_x, X), axis=0)
        print(all_x.shape)

    # Per-sample normalization (in place on the stacked array).
    all_x_norm = all_x
    for i in range(0, len(all_x_norm)):
        all_x_norm[i] = normalizer.fit_transform(all_x_norm[i])

    # BUGFIX: this loop previously reused the stale index `i` left over
    # from the normalization loop above, so every per-file pickle
    # contained the LAST sample's embedding. enumerate() pairs each file
    # with its own row (sorted video_files order matches the
    # concatenation order of the splits).
    for i, f in enumerate(video_files):
        file_name = os.path.basename(f)
        with open(
                f"{self.output_folder}/audio-pickle/{file_name}-openl3.pkl",
                "wb") as f_out:
            pickle.dump(all_x_norm[i], f_out)

    if self.label_path is not None:
        with open(f"{self.output_folder}/audio-pickle-all-X-openl3.pkl",
                  "wb") as f_out:
            pickle.dump(all_x_norm, f_out)
        targets = np.asarray(targets)
        with open(f"{self.output_folder}/audio-pickle-all-Y-openl3.pkl",
                  "wb") as f_out:
            pickle.dump(targets, f_out)

    if self.output_file is not None:
        print(f"Starting to zip files to {self.output_file}")

        def zipdir(path, ziph):
            # Archive every file under `path`, keeping paths relative.
            for root, dirs, files in os.walk(path):
                folder = root[len(path):]
                for file in files:
                    ziph.write(join(root, file), join(folder, file))

        zipf = zipfile.ZipFile(self.output_file, 'w', zipfile.ZIP_DEFLATED)
        zipdir(self.output_folder, zipf)
        zipf.close()
        print(f"Done zipping files to {self.output_file}")
    print("Done!")
audio_tsv = []
label_tsv = []
emb_tsv = []
audio_list, sr_list = [], []

# Load every clip and record its projector label/path rows.
for path in tqdm(paths, desc='Loading audio files ...'):
    samples, rate = sf.read(path)
    audio_list.append(samples)
    sr_list.append(rate)
    filename = Path(path).name  # computed for parity; not used below
    label = Path(Path(path).parent).stem
    label_tsv.append([ontology_lookup[label]['name']])
    audio_tsv.append([f'{label}/(unknown)'])

# One batched embedding call; each clip is reduced to its time-mean vector.
emb_list, _ = openl3.get_audio_embedding(
    audio_list, sr_list, content_type='env', hop_size=0.5)
for emb in tqdm(emb_list, desc="Creating embeddings ..."):
    emb_tsv.append(np.mean(emb, axis=0).tolist())

# Write the three tab-separated files the embedding projector expects.
with open(f'{OUTPUTDIR}/emb.tsv', 'w') as f:
    writer = csv.writer(f, delimiter='\t')
    for row in emb_tsv:
        writer.writerow(row)

with open(f'{OUTPUTDIR}/label.tsv', 'w') as f:
    writer = csv.writer(f, delimiter='\t')
    for row in label_tsv:
        writer.writerow(row)

with open(f'{OUTPUTDIR}/audio.tsv', 'w') as f:
    writer = csv.writer(f, delimiter='\t')
    for row in audio_tsv:
        writer.writerow(row)
# Root of the 10-second truck-audio clips; previously this path was
# repeated inline four times.
BASE_10S_DIR = ('/green-projects/project-sonyc_redhook/workspace/share/'
                'truck_audio/redhook_truck_audio/10s')

# Count every clip up front so progress can be reported against a total.
total = 0
for folder in os.listdir(BASE_10S_DIR):
    total += len(os.listdir(BASE_10S_DIR + '/' + folder))

count = 0
for folder in os.listdir(BASE_10S_DIR):
    # folder[8:10] is presumably the hour field of a timestamped folder
    # name; only folders before 14 are processed — TODO confirm.
    if int(folder[8:10]) < 14:
        for file in os.listdir(BASE_10S_DIR + '/' + folder):
            audio_timestamp = int(file.split(".")[0])
            audio, sr = sf.read(BASE_10S_DIR + '/' + folder + '/' + file)
            emb, ts = openl3.get_audio_embedding(audio, sr,
                                                 content_type='env',
                                                 embedding_size=512)
            embedding = [emb, ts]
            # embedding_list is defined earlier in the file.
            embedding_list.append((audio_timestamp, embedding))
            count += 1
            print('done with ' + str(count) + ' out of ' + str(total))
            sys.stdout.flush()

print('done with all embeddings')
sys.stdout.flush()

with open('embedding_list.pickle', 'wb') as f:
    pickle.dump(embedding_list, f)
print('done dumping embedding list')
sys.stdout.flush()
def calculate_embeddings(self,
                         max_items_per_class: Optional[int] = 1000,
                         max_classes: Optional[int] = 1000,
                         class_filter: Optional[Union[List, None]] = None,
                         model: Optional[Union[Model, None]] = None,
                         input_repr: Optional[str] = 'mel256',
                         content_type: Optional[str] = 'music',
                         embedding_size: Optional[int] = 6144,
                         center: Optional[bool] = True,
                         hop_size: Optional[float] = 0.1,
                         batch_size: Optional[int] = 32,
                         verbose: Optional[bool] = True) -> None:
    """
    Initializes the calculation process of the OpenL3 embeddings.

    :param max_items_per_class: To speed up the calculation process, the
        amount of processed samples can be restricted. If None, all the
        samples in self.audio_paths_by_class will be processed.
        NOTE(review): the break is checked before the counter increments,
        so up to max_items_per_class + 1 samples are loaded per class —
        confirm whether an exact cap is intended.
    :param max_classes: To speed up the calculation process, the amount of
        classes can be restricted. If None, all the found classes will be
        processed. NOTE(review): same off-by-one as max_items_per_class.
    :param class_filter: If not None, only classes found in the given
        filter list will be loaded. The class filter can contain
        superclass or subclass labels.
    :param model: A custom model for calculating the embeddings. More
        information can be found in the OpenL3 docs. (get_audio_embedding)
    :param input_repr: The input representation of the sample. Can be
        linear, mel128 or mel256. More information can be found in the
        OpenL3 docs. (get_audio_embedding)
    :param content_type: The content type of the samples to be processed.
        Can be music or env. More information can be found in the OpenL3
        docs. (get_audio_embedding)
    :param embedding_size: The size of the calculated embeddings. Can be
        512 or 6144. More information can be found in the OpenL3 docs.
        (get_audio_embedding)
    :param center: The location of the returned timestamps. More
        information can be found in the OpenL3 docs. (get_audio_embedding)
    :param hop_size: The hop size used to calculate the embeddings. More
        information can be found in the OpenL3 docs. (get_audio_embedding)
    :param batch_size: The number of samples that are fed to the model at
        once. More information can be found in the OpenL3 docs.
        (get_audio_embedding)
    :param verbose: The amount of information printed on the screen during
        the calculation procedure.
    :return: None (results are stored in self.embedding_wrapper)
    """
    # Record the settings only on the first run; later calls keep the
    # originally stored settings even if different values are passed.
    if self.openl3settings is None:
        self.openl3settings = {
            'input_repr': input_repr,
            'content_type': content_type,
            'embedding_size': embedding_size,
            'center': center,
            'hop_size': hop_size
        }

    # Initialize embedding container
    all_embeddings = OpenL3EmbeddingPackageWrapper()
    # Class counter keeps track of how many classes have been processed.
    class_counter = 0
    for class_label in list(self.audio_paths_by_class.keys()):
        if class_filter is not None and class_label not in class_filter:
            self.log(f"Skipping class {class_label}")
            continue
        self.log(f"Processing class {class_label}")
        # Openl3 will process these lists to get the embeddings.
        audio_list = []
        sr_list = []
        # The package list keeps track of all the packages before being
        # added to the all_embeddings container.
        package_list = []
        # Load audio samples and respective sample rates to lists
        self.log("Loading audio...")
        counter = 0
        for audio_path in self.audio_paths_by_class[class_label]:
            # sr=None keeps each file's native sampling rate.
            audio, sr = lbr.load(audio_path, sr=None)
            # Important metadata needed to play the audio later on when
            # clicked on the plot.
            metadata = {
                'class': class_label,
                'sample_id': class_label + '_' + str(counter),
                'raw_audio_path': audio_path,
                'original_sr': sr,
                'openl3settings': self.openl3settings
            }
            # A container package is initialized for each sample.
            # NOTE(review): timestamps stay None — ts_list below is
            # never written into the packages; confirm if intended.
            package = OpenL3EmbeddingPackage(embeddings=None,
                                            timestamps=None,
                                            metadata=metadata)
            audio_list.append(audio)
            sr_list.append(sr)
            package_list.append(package)
            if max_items_per_class is not None and \
                    counter >= max_items_per_class:
                break
            counter += 1
        self.log("Computing embeddings...")
        # Here the embeddings are calculated with the OpenL3 model
        # specified in the arguments.
        emb_list, ts_list \
            = openl3.get_audio_embedding(audio_list, sr_list,
                                         model=model,
                                         input_repr=input_repr,
                                         content_type=content_type,
                                         embedding_size=embedding_size,
                                         center=center,
                                         hop_size=hop_size,
                                         batch_size=batch_size,
                                         verbose=verbose)
        # Attach each result to its package and collect it.
        counter = 0
        for embeddings in emb_list:
            package_list[counter].set_embeddings(embeddings)
            all_embeddings.add_package(package_list[counter])
            counter += 1
        if max_classes is not None and class_counter >= max_classes:
            break
        class_counter += 1
    # The container holds all the computed embeddings.
    self.embedding_wrapper = all_embeddings
# Collect the stems of embeddings already written so they are not
# recomputed.
list_of_npy_files = []
for root, dirs, files in os.walk(data_write_dir):
    for file in files:
        if file.endswith(".npy"):
            list_of_npy_files.append(file.split('.')[0])

model = openl3.models.load_audio_embedding_model(
    input_repr="mel128", content_type="music", embedding_size=6144)

for audio_file_path in list_of_file_paths:
    file_id = os.path.basename(audio_file_path).split('.')[0]
    if file_id in list_of_npy_files:
        print(file_id + '.npy already exists')
        continue
    # Embed without centering and cache the result as <stem>.npy.
    audio, sr = sf.read(audio_file_path)
    emb, ts = openl3.get_audio_embedding(audio, sr, center=False,
                                         model=model)
    out_path = os.path.join(data_write_dir, file_id + ".npy")
    with open(out_path, 'wb+') as f:
        np.save(f, emb)
def cal_deltas(X_in):
    """Delta features along axis 1 of a 3-D array.

    Combines a scaled 2-frame and 4-frame central difference, trimming
    the narrow term so both align; output is 4 frames shorter than the
    input along axis 1 (presumably the time axis — confirm).
    """
    narrow = (X_in[:, 2:, :] - X_in[:, :-2, :]) / 10.0
    wide = (X_in[:, 4:, :] - X_in[:, :-4, :]) / 5.0
    return narrow[:, 1:-1, :] + wide


# Embed each wav clip with OpenL3 (replaces the earlier log-mel +
# deltas pipeline, which is no longer computed).
for wav_name in wavpath:
    stereo, fs = sound.read(file_path + wav_name, stop=SampleDuration * sr)
    emb, ts = openl3.get_audio_embedding(stereo, sr,
                                         content_type="env",
                                         input_repr="mel256",
                                         embedding_size=512,
                                         hop_size=0.02,
                                         verbose=0)
    feat_data = emb
    feature_data = {
        'feat_data': feat_data,
    }