def test_channels(x, sr, num_channels, feature_set, feature_level):
    """Check multi-channel extraction against single-channel extraction.

    The signal is first processed by a default (single-channel) extractor.
    A second extractor configured with ``num_channels`` must reject the
    unmodified signal with ``RuntimeError`` and, once the signal has been
    repeated ``num_channels`` times along axis 0, must return the mono
    features once per channel, side by side in the columns.
    """
    # Reference result from an extractor created without ``num_channels``.
    mono_extractor = opensmile.Smile(feature_set, feature_level)
    y_mono = mono_extractor.process_signal(x, sr)

    # Extractor that expects ``num_channels`` channels.
    fex = opensmile.Smile(
        feature_set,
        feature_level,
        num_channels=num_channels,
    )

    # The unmodified signal does not match the configured channel count.
    with pytest.raises(RuntimeError):
        fex.process_signal(x, sr)

    # Repeat the signal along axis 0 so it matches the extractor.
    x = np.repeat(x, num_channels, axis=0)
    y = fex.process_signal(x, sr)

    # Same number of rows, and ``num_channels`` times as many columns.
    assert y_mono.shape[0] == y.shape[0]
    assert y_mono.shape[1] * fex.num_channels == y.shape[1]

    # Every block of ``num_features`` columns must equal the mono features.
    for c in range(num_channels):
        first_col = c * fex.num_features
        last_col = (c + 1) * fex.num_features
        np.testing.assert_equal(
            y.values[:, first_col:last_col],
            y_mono.values,
        )
def test_default(tmpdir, feature_set, feature_level):
    """Compare the Python wrapper with the ``SMILExtract`` command line tool.

    For a feature set from ``gemaps_family`` combined with the delta level,
    creating the extractor is expected to raise ``ValueError`` and nothing
    else is checked. Otherwise the features extracted from
    ``pytest.WAV_FILE`` must match the CSV written by ``SMILExtract`` for
    the same configuration (values, number of features, feature names).
    """
    deltas = feature_level == opensmile.FeatureLevel.LowLevelDescriptors_Deltas
    is_gemaps = feature_set in gemaps_family

    if is_gemaps and deltas:
        # deltas not available
        with pytest.raises(ValueError):
            opensmile.Smile(feature_set, feature_level)
        return

    # Create the extractor; deprecated sets must emit a ``UserWarning``.
    if feature_set in deprecated_feature_sets:
        with pytest.warns(UserWarning):
            fex = opensmile.Smile(feature_set, feature_level)
    else:
        fex = opensmile.Smile(feature_set, feature_level)

    # Extract features from file through the Python API.
    y = fex.process_file(pytest.WAV_FILE)

    # Pick the source/sink configs handed to SMILExtract.
    source_config_file = os.path.join(
        fex.default_config_root,
        opensmile.config.FILE_INPUT_CONFIG,
    )
    sink_config_name = (
        opensmile.config.FILE_OUTPUT_CONFIG_NO_LLD_DE
        if is_gemaps
        else opensmile.config.FILE_OUTPUT_CONFIG
    )
    sink_config_file = os.path.join(fex.default_config_root, sink_config_name)
    output_file = os.path.join(tmpdir, f'{feature_level.value}.csv')

    # Run SMILExtract on the same file.
    command = (
        f'{pytest.SMILEXTRACT} '
        f'-C {fex.config_path} '
        f'-source {source_config_file} '
        f'-I {pytest.WAV_FILE} '
        f'-sink {sink_config_file} '
        f'-{feature_level.value}_csv_output {output_file}'
    )
    os.system(command)

    # Read the CSV and compare; the first CSV column is not a feature.
    df = pd.read_csv(output_file, sep=';')
    np.testing.assert_allclose(df.values[:, 1:], y.values, rtol=1e-6, atol=0)
    assert fex.num_features == len(df.columns) - 1
    assert fex.feature_names == list(df.columns[1:])
def _extract_opensmile_features(audio, sr, features_set):
    """Extract openSMILE functionals from an in-memory signal.

    :param audio: signal passed unchanged to ``Smile.process_signal``
    :param sr: sampling rate of ``audio``
    :param features_set: feature set handed to ``opensmile.Smile``
    :return: whatever ``Smile.process_signal`` returns for the signal
    """
    functionals_level = opensmile.FeatureLevel.Functionals
    extractor = opensmile.Smile(
        feature_set=features_set,
        feature_level=functionals_level,
    )
    features = extractor.process_signal(audio, sr)
    return features
def __init__(self, config: Optional[Any] = None):
    """Build the openSMILE extractor described by ``config``.

    After the parent initializer has run, ``self.config`` supplies the
    feature set (an ``opensmile.FeatureSet`` member name given as a
    string, or a value passed through untouched), the feature level and
    the remaining ``opensmile.Smile`` options.

    Raises:
        AssertionError: if ``is_module_available("opensmile")`` is false.
    """
    super().__init__(config=config)

    opensmile_installed = is_module_available("opensmile")
    assert (
        opensmile_installed
    ), 'To use opensmile extractors, please "pip install opensmile" first.'

    # Imported lazily so the module is only required when actually used.
    import opensmile

    # Strings are looked up by name in the FeatureSet enum.
    if not isinstance(self.config.feature_set, str):
        self.feature_set = self.config.feature_set
    else:
        self.feature_set = opensmile.FeatureSet[self.config.feature_set]

    self.feature_level = opensmile.FeatureLevel(self.config.feature_level)

    smile_kwargs = dict(
        feature_set=self.feature_set,
        feature_level=self.feature_level,
        sampling_rate=self.config.sampling_rate,
        options=self.config.options,
        loglevel=self.config.loglevel,
        logfile=self.config.logfile,
        channels=self.config.channels,
        mixdown=self.config.mixdown,
        resample=self.config.resample,
        num_workers=self.config.num_workers,
        verbose=self.config.verbose,
    )
    self.smileExtractor = opensmile.Smile(**smile_kwargs)
def extract_gemaps_features(audio_filename):
    """Extract windowed eGeMAPSv01b features from an audio file.

    The file is read with ``soundfile``, 2-D input is reduced to the mean
    of its first two channels, and the signal is cut into consecutive,
    non-overlapping 100 ms windows. Each window is processed on its own and
    every resulting row is repeated twice, so the output has one row per
    50 ms step.

    :param audio_filename: path of the audio file to read
    :return: 2-D array ``(2 * n_windows, n_features)``; an empty 1-D array
        when no window could be extracted
    """
    audio, sampling_rate = soundfile.read(audio_filename)

    # Convert 2 channel input to 1 channel
    if audio.ndim == 2:
        audio = (audio[:, 0] + audio[:, 1]) / 2

    feature_extractor = opensmile.Smile(
        feature_set=opensmile.FeatureSet.eGeMAPSv01b)

    # Ideally we would want to use 50 ms windows, but they are too short for
    # opensmile. Therefore we use 100 ms windows instead, then upsample the
    # feature vector to the same length.
    step = int(0.1 * sampling_rate)

    # NOTE(review): the range stops before ``len(audio) - step``, so the
    # trailing partial window - and the last full window when the length is
    # an exact multiple of ``step`` - is not processed. Kept unchanged.
    windows = [
        feature_extractor.process_signal(
            audio[start:start + step], sampling_rate).to_numpy()
        for start in range(0, len(audio) - step, step)
    ]

    if not windows:
        # Same result as before for audio without a single usable window.
        return np.empty(0)

    # Stack the per-window rows along the window axis. A bare ``squeeze()``
    # also removed that axis when there was exactly one window, after which
    # the upsampling indexed feature values instead of frames.
    # Assumes each window yields a single row - TODO confirm.
    features = np.concatenate(windows, axis=0)

    # Upsample so that we have the same length as we would with 50 ms
    # windows: rows 0, 0, 1, 1, ...
    return np.repeat(features, 2, axis=0)
def gemaps_time_featurize(wav_file):
    """Collect eGeMAPSv01b features of a file at every feature level.

    The low-level descriptors, their deltas and the functionals are
    extracted in that order. Each column of each result contributes one
    entry to ``features`` (the column values as a list) and one entry to
    ``labels`` (the column name).

    :param wav_file: path of the audio file to process
    :return: tuple ``(features, labels)`` of two equally long lists
    """
    labels = []
    features = []

    feature_levels = (
        opensmile.FeatureLevel.LowLevelDescriptors,
        opensmile.FeatureLevel.LowLevelDescriptors_Deltas,
        opensmile.FeatureLevel.Functionals,
    )
    for feature_level in feature_levels:
        try:
            smile = opensmile.Smile(
                feature_set=opensmile.FeatureSet.eGeMAPSv01b,
                feature_level=feature_level,
            )
        except ValueError:
            # openSMILE refuses feature levels a set does not provide
            # (presumably the delta level for the GeMAPS family - confirm
            # against the installed version). Skip that level instead of
            # aborting the whole extraction.
            continue

        frame = smile.process_file(wav_file)
        for column in frame:
            features.append(list(frame[column]))
            labels.append(column)

    return features, labels
def __init__(self, logfile="./log_opensmile"):
    """Init method for OpenSmileExtractor."""
    super().__init__(logfile=logfile)

    # The functionals config is given relative to the current working
    # directory, so this local path might cause trouble when the process is
    # started from somewhere else.
    smile_options = {
        "frameModeFunctionalsConf":
            "./data/custom_FrameModeFunctionals.conf.inc",
    }

    # Create the functionals extractor here
    self.smile = opensmile.Smile(
        feature_set=opensmile.FeatureSet.eGeMAPSv02,
        feature_level=opensmile.FeatureLevel.Functionals,
        options=smile_options,
    )
def opensmileTrial():
    """Run the custom openSMILE config on an excerpt of ``audio/1s.wav``.

    Prints the shape of the loaded signal and then the features computed
    for the first 4096 entries along axis 1 of the signal.
    """
    signal, sampling_rate = audiofile.read("audio/1s.wav", always_2d=True)

    extractor = opensmile.Smile(
        feature_set='conf/alqudah_live.conf',
        feature_level='features',
        num_channels=2,
    )

    print(signal.shape)

    excerpt = signal[:, :4096]
    result = extractor.process_signal(excerpt, sampling_rate)
    print(result)
def liveTest():
    """Play ``audio/mono.wav`` and extract features chunk by chunk.

    The wave file is played through a PyAudio output stream. For every
    buffer requested by the stream, the callback reads the frames from the
    file, hands the scaled samples to the ``C`` collector, extracts
    openSMILE features from them and stores the features in the collector.
    When playback has finished, everything is closed and the collector
    plots and saves what it gathered.
    """
    wav = wave.open("audio/mono.wav", 'rb')

    print("samplewidth, nchannels, framerate, nframes")
    print(wav.getsampwidth())  # 2
    print(wav.getnchannels())  # 2 (now - 1)
    print(wav.getframerate())  # 44100, sampling frequency
    print(wav.getnframes())  # 441000 -> 10sec audio (True)

    extractor = opensmile.Smile(
        feature_set='conf/alqudah_live.conf',
        feature_level='features',
        num_channels=1,  # wf.getnchannels()
    )
    collector = C()
    audio = pyaudio.PyAudio()

    def scaled_samples(raw):
        # Interpret the bytes as int16 samples and divide by 2 ** 15.
        return np.frombuffer(raw, dtype="int16") / pow(2, 15)

    def callback(in_data, frame_count, time_info, status):
        # Frames returned here are what the output stream plays next.
        data = wav.readframes(frame_count)
        collector.inc(scaled_samples(data))
        collector.print()
        features = extractor.process_signal(
            scaled_samples(data),
            wav.getframerate(),
        )
        collector.appendFeatures(features)
        return (data, pyaudio.paContinue)

    # todo: set the format manually like "format=pyaudio.paInt16"
    stream = audio.open(
        format=audio.get_format_from_width(wav.getsampwidth()),
        channels=wav.getnchannels(),
        rate=wav.getframerate(),
        output=True,
        stream_callback=callback,
        frames_per_buffer=int(wav.getframerate() / 10),  # 0.1sec
    )
    stream.start_stream()

    # Poll until the stream reports that it is no longer active.
    while stream.is_active():
        time.sleep(0.1)
    print("Done")

    stream.stop_stream()
    stream.close()
    wav.close()
    audio.terminate()

    collector.plot()
    collector.saveCSV()
def audioFeature():
    """Extract eGeMAPSv01b functionals for every file of the MELD dev split.

    Each file is first processed with a 6-channel extractor. When that
    raises ``RuntimeError``, the 2-channel extractor is used instead and
    its result is re-indexed to the combined column list, filling missing
    columns with 0. Files failing with any other exception are skipped.
    The results, keyed by file name, are pickled to
    ``dict/audio_features_dev.p``; progress is printed every 100 processed
    files and the elapsed time at the end.
    """
    started = time.time()

    smile2 = opensmile.Smile(feature_set=opensmile.FeatureSet.eGeMAPSv01b,
                             feature_level=opensmile.FeatureLevel.Functionals,
                             num_channels=2)
    smile6 = opensmile.Smile(feature_set=opensmile.FeatureSet.eGeMAPSv01b,
                             feature_level=opensmile.FeatureLevel.Functionals,
                             num_channels=6)

    directory_in_str = "MELD.Raw/dev_splits_complete"
    vec_528 = smile6.process_file(directory_in_str + "/dia1_utt5.mp4")
    vec_176 = smile2.process_file(directory_in_str + "/dia101_utt0.mp4")

    # Ordered union of both column sets: the 2-channel columns first, then
    # the columns only the 6-channel result has. Replaces
    # ``DataFrame.append``, which no longer exists in pandas 2.x.
    col_list = vec_176.columns.tolist()
    known_columns = set(col_list)
    col_list.extend(
        column for column in vec_528.columns
        if column not in known_columns
    )

    X_dict = {}
    processed = 0
    directory = os.fsencode(directory_in_str)
    for entry in sorted(os.listdir(directory), key=lambda s: s.lower()):
        filename = os.fsdecode(entry)
        # Plain concatenation keeps the path passed to openSMILE unchanged.
        path = directory_in_str + "/" + filename
        try:
            feature_vec = smile6.process_file(path)
        except RuntimeError:
            # Fall back to the 2-channel extractor and pad its columns.
            feature_vec = smile2.process_file(path)
            feature_vec = feature_vec.reindex(columns=col_list, fill_value=0)
        except Exception as exc:
            # Best effort: skip files that cannot be processed, but let
            # KeyboardInterrupt / SystemExit through.
            print(f"skipping {filename}: {exc}")
            continue

        X_dict[filename] = feature_vec
        processed += 1
        if processed % 100 == 0:
            print(processed)

    # The context manager closes (and flushes) the pickle file.
    with open('dict/audio_features_dev.p', 'wb') as pickle_file:
        pickle.dump(X_dict, pickle_file)

    print(time.time() - started)
def verbose_opensmile():
    """Print the feature names of every feature set / feature level pair.

    Iterates over all members of ``opensmile.FeatureSet`` and
    ``opensmile.FeatureLevel``. Combinations for which the extractor cannot
    be created, or whose feature names cannot be read, are reported with
    the error message instead of being dropped silently.
    """
    for fset in opensmile.FeatureSet:
        print("========================================================")
        print(fset)
        for level in opensmile.FeatureLevel:
            print("==========================")
            print(level)
            try:
                smile = opensmile.Smile(
                    feature_set=fset,
                    feature_level=level,
                )
                feature_names = smile.feature_names
            except Exception as exc:
                # Still best effort, but no longer swallows
                # KeyboardInterrupt / SystemExit and says what went wrong.
                print(f"not available: {exc}")
            else:
                print(feature_names)
def __init__(self, config_path="./config.yaml"):
    """Init method for the Searcher.

    Loads the configuration from ``config_path`` and then sets up, in this
    order: dataset paths and Elasticsearch settings, the podcast metadata,
    the reranking tokenizer and model, the openSMILE functionals extractor
    and the YAMNet model. The environment variables ``PODCAST_PATH`` and
    ``YAMNET_PATH`` are read to locate data files.
    """
    super().__init__()

    # Load the configuration
    cfg = OmegaConf.load(config_path)
    self.dataset_path = cfg.dataset_path
    self.audio_path = os.path.join(cfg.dataset_path, "podcasts-audio")
    # URL of Elasticsearch to query
    self.es_url = cfg.search_es_url
    # Number of segments to request from Elasticsearch
    self.es_num = cfg.search_es_num
    # Hardcoded sample rate of all podcast audio
    self.sample_rate = 44100

    # Load the podcast metadata
    self.metadata = load_metadata(self.dataset_path)

    # Set up the reranking model
    self.rerank_tokenizer = AutoTokenizer.from_pretrained(
        cfg.search_rerank_model,
        use_fast=True,
        cache_dir=cfg.search_cache_dir,
    )
    self.rerank_model = AutoModelForSequenceClassification.from_pretrained(
        cfg.search_rerank_model,
        cache_dir=cfg.search_cache_dir,
    )
    self.rerank_model.to("cpu", non_blocking=True)
    self.rerank_max_seq_len = 512

    # Set up the openSMILE extractor
    functionals_conf = os.path.join(
        os.getenv("PODCAST_PATH"),
        "data/custom_FrameModeFunctionals.conf.inc",
    )
    self.smile = opensmile.Smile(
        feature_set=opensmile.FeatureSet.eGeMAPSv02,
        feature_level=opensmile.FeatureLevel.Functionals,
        options={"frameModeFunctionalsConf": functionals_conf},
    )

    # Set up the YAMNet model
    yamnet_cfg = yamnet_params.Params(
        sample_rate=self.sample_rate,
        patch_hop_seconds=0.48,
    )
    class_map_path = os.path.join(
        os.getenv("YAMNET_PATH"), "yamnet_class_map.csv")
    self.yamnet_classes = yamnet_model.class_names(class_map_path)
    self.yamnet_model = yamnet_model.yamnet_frames_model(yamnet_cfg)
    weights_path = os.path.join(os.getenv("PODCAST_PATH"), "data/yamnet.h5")
    self.yamnet_model.load_weights(weights_path)
def extract_opensmile_features_from_audio_sequence(
        data: Union[np.ndarray, str],
        sample_rate: Optional[int] = None,
        feature_type: str = 'LLD') -> np.ndarray:
    """Extracts opensmile features from audio sequence represented either by ndarray or path.
    https://github.com/audeering/opensmile-python

    :param data: np.ndarray or str
        Can be ndarray - already loaded sound data or str - path to data
    :param sample_rate: Optional[int]
        Sample rate is needed if data in ndarray format is provided
    :param feature_type: str
        'LLD' - ComParE_2016 low-level descriptors,
        'Compare_2016_functionals' (alias 'Functionals') - ComParE_2016
        functionals, 'EGEMAPS' - eGeMAPSv02 functionals
    :return: np.ndarray
        extracted features
    :raises AttributeError: if feature_type is not supported or data is
        neither an ndarray nor a str
    """
    # 'Compare_2016_functionals' is the advertised name, 'Functionals' the
    # one the dispatch below historically checked; both are accepted now.
    functionals_types = ('Compare_2016_functionals', 'Functionals')
    supported_feature_types = ('LLD',) + functionals_types + ('EGEMAPS',)
    if feature_type not in supported_feature_types:
        raise AttributeError('feature_type must the value from %s. Got %s.' %
                             (supported_feature_types, feature_type))

    # configure opensmile to extract desirable features
    if feature_type == 'LLD':
        feature_level = opensmile.FeatureLevel.LowLevelDescriptors
        feature_set = opensmile.FeatureSet.ComParE_2016
    elif feature_type in functionals_types:
        feature_level = opensmile.FeatureLevel.Functionals
        feature_set = opensmile.FeatureSet.ComParE_2016
    else:  # 'EGEMAPS' - the only value left after validation
        feature_level = opensmile.FeatureLevel.Functionals
        feature_set = opensmile.FeatureSet.eGeMAPSv02

    # create opensmile Extractor
    smile = opensmile.Smile(
        feature_set=feature_set,
        feature_level=feature_level,
    )

    # check if data is a valid type and if yes, extract features properly
    if isinstance(data, str):
        extracted_features = smile.process_file(data).values
    elif isinstance(data, np.ndarray):
        # check if audio data is one-channel, then reshape it to 1D array
        # (the requirement of opensmile)
        if data.ndim == 2 and data.shape[1] == 1:
            data = data.reshape((-1, ))
        extracted_features = smile.process_signal(
            data, sampling_rate=sample_rate).values
    else:
        raise AttributeError('Data should be either ndarray or str. Got %s.' %
                             (type(data)))
    return extracted_features
def get_feature_list(data_frame, audio_featurizer="mel_spectogram",
                     text_featurizer="sentence_embeddings",
                     checkpoint="roberta-large-nli-stsb-mean-tokens"):
    """Build model inputs, encoded labels and the model for a data frame.

    For every row of ``data_frame`` (columns ``audio_file_path``, ``tag``
    and ``text`` are read) acoustic features are extracted with the chosen
    audio featurizer; all texts are embedded with a sentence transformer.

    :param data_frame: frame with one sample per row
    :param audio_featurizer: ``"mel_spectogram"`` or ``"opensmile"``
    :param text_featurizer: only ``"sentence_embeddings"`` is supported
    :param checkpoint: sentence-transformer checkpoint; ``None`` selects
        ``'roberta-large-nli-stsb-mean-tokens'``
    :return: ``([X_textual, X_acoustic], Y, label_encoder, model)``
    :raises ValueError: if a featurizer name is not supported
    """
    # Fail early: unsupported names used to end in a NameError /
    # UnboundLocalError only after all features had been extracted.
    if audio_featurizer not in ("mel_spectogram", "opensmile"):
        raise ValueError(
            "Unsupported audio_featurizer: %r" % (audio_featurizer,))
    if text_featurizer != "sentence_embeddings":
        raise ValueError(
            "Unsupported text_featurizer: %r" % (text_featurizer,))

    use_opensmile = audio_featurizer == "opensmile"
    if use_opensmile:
        import opensmile
        smile = opensmile.Smile(
            feature_set=opensmile.FeatureSet.ComParE_2016,
            feature_level=opensmile.FeatureLevel.Functionals,
        )

    X_acoustic = []
    Y = []
    texts = []
    for i in range(len(data_frame)):
        row = data_frame.iloc[i]
        if use_opensmile:
            # Read the path from the ``data_frame`` argument; the previous
            # code used the unrelated global ``model_train_DataFrame``.
            y = smile.process_file(row['audio_file_path'])
            X_acoustic.append([y.iloc[0][col] for col in y.columns])
        else:
            raw_mel = extract_mel_spectogram(row['audio_file_path'], mel=True)
            X_acoustic.append(raw_mel)
        Y.append(row['tag'])
        texts.append(row['text'])

    from sentence_transformers import SentenceTransformer
    if checkpoint is None:
        checkpoint = 'roberta-large-nli-stsb-mean-tokens'
    embedder = SentenceTransformer(checkpoint)
    X_textual = embedder.encode(texts)

    # NOTE(review): if this is Keras' ``pad_sequences``, its default dtype
    # is int32, which would truncate float features - confirm and consider
    # passing ``dtype='float32'``.
    X_acoustic_padded = pad_sequences(X_acoustic, padding='post')

    le = LabelEncoder()
    Y = le.fit_transform(Y)

    # Shapes handed to ``get_model``: ``(None, width)`` for samples with
    # more than one dimension, otherwise the plain length.
    if len(X_acoustic_padded[0].shape) > 1:
        X_acoustic_shape = (None, X_acoustic_padded[0].shape[1])
    else:
        X_acoustic_shape = (X_acoustic_padded[0].shape[0])
    print(X_acoustic_shape)

    if len(X_textual[0].shape) > 1:
        X_textual_shape = (None, X_textual[0].shape)
    else:
        X_textual_shape = (X_textual[0].shape[0])

    # Mel spectrograms are treated as sequences, openSMILE functionals as a
    # flat vector.
    model = get_model(X_textual_shape, X_acoustic_shape,
                      n_classes=len(le.classes_),
                      acoustic_sequence=not use_opensmile)

    return [np.array(X_textual), np.array(X_acoustic_padded)], np.array(Y), le, model
def test_signal(file, feature_set, feature_level):
    """Check that processing a signal matches processing its file.

    Also checks that a 10-sample excerpt triggers a ``UserWarning`` and
    yields a frame that contains only missing values.
    """
    # create feature extractor
    fex = opensmile.Smile(feature_set, feature_level)

    # extract from numpy array
    x, sr = audiofile.read(file, always_2d=True)
    y = fex.process_signal(x, sr)
    y_file = fex.process_file(file)

    # only the first 10 samples of the first row of the signal
    with pytest.warns(UserWarning):
        y_empty = fex.process_signal(x[0, :10], sr)

    # assertions
    assert fex.feature_names == y.columns.to_list()
    np.testing.assert_equal(y.values, y_file.values)
    # ``all(frame)`` iterates the column labels and is therefore always
    # true for non-empty names; test the actual values instead.
    assert y_empty.isna().to_numpy().all()
def mid_term_feat_extraction(wav_file_path):
    """Combine averaged mid-term features with eGeMAPSv01b functionals.

    :param wav_file_path: path of the audio file to analyse
    :return: single-row data frame holding the averaged mid-term features
        followed by the openSMILE functionals, or ``None`` when the
        sampling rate is 0, the signal has fewer samples than a fifth of
        the sampling rate, or the mid-term features contain NaN/inf
    """
    sampling_rate, signal = audioBasicIO.read_audio_file(wav_file_path)
    if sampling_rate == 0:
        print('Sampling rate not correct.')
        return None

    signal = audioBasicIO.stereo_to_mono(signal)
    if signal.shape[0] < float(sampling_rate) / 5:
        print("The duration of the audio is too short.")
        return None

    # Window and step sizes in seconds, converted to samples below.
    mid_window, mid_step, short_window, short_step = 0.5, 0.5, 0.05, 0.05
    mid_window_samples = round(mid_window * sampling_rate)
    mid_step_samples = round(mid_step * sampling_rate)
    short_window_samples = round(sampling_rate * short_window)
    short_step_samples = round(sampling_rate * short_step)

    mid_features, _, mid_feature_names = MidTermFeatures.mid_feature_extraction(
        signal, sampling_rate,
        mid_window_samples, mid_step_samples,
        short_window_samples, short_step_samples)

    # long term averaging of mid-term statistics
    mid_features = np.transpose(mid_features).mean(axis=0)

    # Reject feature vectors containing NaN or infinite values.
    if np.isnan(mid_features).any() or np.isinf(mid_features).any():
        return None

    mid_dict = dict(zip(mid_feature_names, mid_features))
    mid_df = pd.DataFrame([mid_dict.values()], columns=mid_dict.keys())

    # Smile library audio extraction
    smile = opensmile.Smile(
        feature_set=opensmile.FeatureSet.eGeMAPSv01b,
        feature_level=opensmile.FeatureLevel.Functionals,
    )
    smile_features = smile.process_signal(signal, sampling_rate)

    # Move the index into columns and drop the first two columns.
    smile_df = pd.DataFrame(smile_features).reset_index().iloc[:, 2:]

    return pd.concat([mid_df, smile_df], axis=1)
def test_files(num_files, feature_set, feature_level, num_workers):
    """Check that ``process_files`` equals repeated ``process_file`` calls.

    The same file is passed ``num_files`` times; the combined result must
    equal the single-file result concatenated ``num_files`` times.
    """
    extractor = opensmile.Smile(
        feature_set,
        feature_level,
        num_workers=num_workers,
    )

    # Reference: features of the file processed on its own.
    y_file = extractor.process_file(pytest.WAV_FILE)

    # Same file, ``num_files`` times, in one call.
    file_list = [pytest.WAV_FILE for _ in range(num_files)]
    y_files = extractor.process_files(file_list)

    expected = np.concatenate([y_file for _ in range(num_files)])
    np.testing.assert_equal(expected, y_files.values)
def test_custom(config, level):
    """Check an extractor created from a custom config file.

    Verifies the config name and path reported by the extractor, the
    consistency of its feature names, and that processing the file gives
    the same frame as processing the loaded signal with ``file`` set.
    """
    extractor = opensmile.Smile(config, level)

    # Result when openSMILE reads the file itself.
    from_file = extractor.process_file(pytest.WAV_FILE)

    # Result when the signal is loaded first and passed as an array.
    signal, sampling_rate = audiofile.read(pytest.WAV_FILE)
    from_array = extractor.process_signal(
        signal,
        sampling_rate,
        file=pytest.WAV_FILE,
    )

    expected_name = audeer.basename_wo_ext(config)
    assert extractor.config_name == expected_name
    expected_path = audeer.safe_path(config)
    assert extractor.config_path == expected_path
    assert extractor.num_features == len(extractor.feature_names)
    assert extractor.feature_names == from_file.columns.to_list()
    pd.testing.assert_frame_equal(from_file, from_array)
'6d8998ea8704af6685a50219c9dd3747', '2f8ddcfcb5a4419b16c533808ed9c38e', '57e7a29c01c11136336257b324b7a3af', 'f0132034f1bc1a891841d6e8bbb6eb1a', 'd50f6c0da77b2223568ad042ea035a5e', '28081878fb0d5dbaac595fb031bb4e43', '5b67a12483bb2bc7eed1b214398bac9c', 'ae54788ccd8cc8b67e121f49e747f548', '9a3614ec10dcaeb5ee53088d939bbd6b', '56a336e76f530ef57ae1e7f0a156e8c0', '3ec0eea351847a9def135aa2b322637f', '9ea4000febe9fab7c8560437103192c6', '43bdf492e2b6bda5ec869dc003de81c1', '1e996fca7223c00c4c76b1f3b7dc1eaa', '2eef8025f3f03c9ad7532249808ab4f8' ] file = accepted[0] filepath = "./response1/" + file + ".wav" signal, sampling_rate = audiofile.read(filepath, always_2d=True) smile = opensmile.Smile( feature_set=opensmile.FeatureSet.ComParE_2016, # feature_set=opensmile.FeatureSet.eGeMAPSv02, feature_level=opensmile.FeatureLevel.Functionals, ) # print(smile.feature_names) data = smile.process_signal(signal, sampling_rate) data.insert(loc=0, column='key', value=file) # %% ''' FOLLOWING FILES ''' ######################################################### for i in range(1, len(accepted)):
# Connect to the broker and start the client's loop
# (presumably an MQTT client created earlier in the file - confirm).
client.connect(broker_address, broker_port, keepalive)
client.loop_start() #start the loop

# %% reload saved model
# load model from file
with open('../Network/model_smile_it.json', 'r') as json_file:
    loaded_json = json_file.read()
# TCN is passed as a custom object so the saved architecture can be rebuilt
model = model_from_json(loaded_json, custom_objects={'TCN': TCN})
# restore weights
model.load_weights('../Network/weights_smile_it.h5')

# %% Pre-process input
# Config for opensmile feature set
smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
    feature_level=opensmile.FeatureLevel.LowLevelDescriptors,
)

def input_prep(data, smile):
    """Turn a raw signal into a min-max scaled array of shape (1, 296, 25).

    data: signal handed to ``smile.process_signal`` with a hard-coded
        sampling rate of 44100.
    smile: extractor to use (shadows the module-level ``smile``).

    Returns the filled ``(1, 296, 25)`` array.
    """
    X_smile = np.empty(shape=(1, 296, 25))
    df_x = smile.process_signal(data, 44100)
    # A new scaler is fitted on every call, so scaling is per input.
    scaler = MinMaxScaler()
    # Assumes the extracted frame has 296 rows and 25 columns; other shapes
    # that do not broadcast into the target make this assignment fail -
    # TODO confirm against the audio chunk length used by the caller.
    X_smile[0, :, :] = scaler.fit_transform(df_x.values)
    return X_smile

# %% Identify the audio devices of the system
p = pyaudio.PyAudio()
# Host API with index 0 and the number of devices it reports
info = p.get_host_api_info_by_index(0)
numdevices = info.get('deviceCount')
'-o', required=True, help='output feature file') parser.add_argument('--feat', '-x', required=True, choices=['eGeMAPS', 'xvector'], help='Feature type') args = parser.parse_args() file_list = get_files(args.file) if args.feat == 'eGeMAPS': smile = opensmile.Smile( feature_set=opensmile.FeatureSet.eGeMAPSv02, feature_level=opensmile.FeatureLevel.Functionals) # write 1 example to include header get_feat_smile(file_list[0], args.out, smile, header=True) # generate iterable with arguments to func iterable = zip(file_list[1:], repeat(args.out), repeat(smile)) func = get_feat_smile elif args.feat == 'xvector': get_feat_xvector(file_list[0], args.out, header=True) iterable = zip(file_list[1:], repeat(args.out)) func = get_feat_xvector