def parseAllFeatures(self, indices, filenames):
    returnList = []
    returnLabels = []
    tot = np.zeros(len(indices))
    num = 0
    for el in filenames:
        classname = el.split('/')[-1].strip()
        # print (el, classname)
        try:
            [Fs, x] = audioBasicIO.readAudioFile(el)
        except ValueError:
            continue
        F = None
        if len(x.shape) == 1:
            F = audioFeatureExtraction.stFeatureExtraction(
                x, Fs, 0.050 * Fs, 0.025 * Fs)
        else:
            F = audioFeatureExtraction.stFeatureExtraction(
                x[:, 0], Fs, 0.050 * Fs, 0.025 * Fs)
        tot += np.mean(F[indices, :], axis=1)
        num += 1
        returnList.append(F[indices, :].T)
        if classname[0] == 'a':
            returnLabels.append(0)
        elif classname[0] == 'd':
            returnLabels.append(1)
        elif classname[0] == 'f':
            returnLabels.append(2)
        elif classname[0] == 'h':
            returnLabels.append(3)
        elif classname[0] == 'n':
            returnLabels.append(4)
        elif classname[0:2] == 'sa':
            returnLabels.append(5)
        else:
            returnLabels.append(6)
    returnListLength = len(returnList)
    random.seed(13921)
    shuffledIndices = random.sample(range(returnListLength), returnListLength)
    shuffledReturnList = [returnList[i] for i in shuffledIndices]
    shuffledReturnLabels = [returnLabels[i] for i in shuffledIndices]
    return shuffledReturnList, shuffledReturnLabels

def extract_dataset(self, data, nb_samples, dataset, save=True):
    f_global = []

    i = 0
    for (x, Fs) in data:
        # 34D short-term feature
        f = audioFeatureExtraction.stFeatureExtraction(x, Fs, globalvars.frame_size * Fs,
                                                       globalvars.step * Fs)

        # Harmonic ratio and pitch, 2D
        hr_pitch = audioFeatureExtraction.stFeatureSpeed(x, Fs, globalvars.frame_size * Fs,
                                                         globalvars.step * Fs)
        f = np.append(f, hr_pitch.transpose(), axis=0)

        # Z-normalized
        f = stats.zscore(f, axis=0)

        f = f.transpose()
        f_global.append(f)

        sys.stdout.write("\033[F")
        i = i + 1
        print("Extracting features " + str(i) + '/' + str(nb_samples) + " from data set...")

    f_global = sequence.pad_sequences(f_global,
                                      maxlen=globalvars.max_len,
                                      dtype='float32',
                                      padding='post',
                                      value=globalvars.masking_value)

    if save:
        print("Saving features to file...")
        cPickle.dump(f_global, open(dataset + '_features.p', 'wb'))

    return f_global

def extract(x, sr=16000):
    f_global = []

    # 34D short-term feature
    f = audioFeatureExtraction.stFeatureExtraction(x, sr, globalvars.frame_size * sr,
                                                   globalvars.step * sr)

    # for pyAudioAnalysis versions that support Python 3 (tuple return)
    if type(f) is tuple:
        f = f[0]

    # Harmonic ratio and pitch, 2D
    hr_pitch = audioFeatureExtraction.stFeatureSpeed(
        x, sr, globalvars.frame_size * sr, globalvars.step * sr)
    f = np.append(f, hr_pitch.transpose(), axis=0)

    # Z-normalized
    f = stats.zscore(f, axis=0)

    f = f.transpose()
    f_global.append(f)

    f_global = sequence.pad_sequences(f_global,
                                      maxlen=globalvars.max_len,
                                      dtype='float32',
                                      padding='post',
                                      value=globalvars.masking_value)

    return f_global

def extract_features(path_file, frame_size=25e-3, frame_stride=10e-3):
    """Combine logmel and frame-level ST features extracted using the pyAudioAnalysis library.

    Output: 40 + 22 = 62-dim logmel + ST features
    """
    [sample_rate, signal] = audioBasicIO.readAudioFile(path_file)
    frame_length, frame_step = frame_size * sample_rate, frame_stride * sample_rate  # Convert from seconds to samples
    # signal_length = len(emphasized_signal)
    frame_length = int(round(frame_length))
    frame_step = int(round(frame_step))
    st_features = audioFeatureExtraction.stFeatureExtraction(
        signal, sample_rate, frame_length, frame_step)
    filter_banks = extract_logmel(path_file,
                                  frame_size=25e-3,
                                  frame_stride=10e-3,
                                  normalize=False)
    st_features = np.transpose(st_features)  # transpose to make frame_count the x-axis
    st_features = np.delete(st_features, np.s_[8:21], axis=1)  # delete the MFCCs
    if st_features.shape[0] - filter_banks.shape[0] == 1:
        st_features = st_features[:-1, :]
    # print (st_features.shape[0], filter_banks.shape[0])
    features = np.c_[st_features, filter_banks]
    features -= (np.mean(features, axis=0) + 1e-8)
    return features

def get_st_features(signal, rate, window_step=0.025, window_length=0.05):
    """Computes all 34 short-term features for each window in a given signal.

    Parameters
    ----------
    signal : numpy array
        All sample points for the audio signal.
        Can be any type of number.
    rate : int
        Sample rate of the audio signal, in Hz.
    window_step : float
        Time step between each successive window, in seconds.
        Default: 0.025 (25 ms)
    window_length : float
        Length of each window, in seconds.
        Should generally be greater than window_step to allow for overlap between frames.
        Default: 0.05 (50 ms)

    Returns
    -------
    features : numpy array
        NumPy array of size (number of windows) * 34.
        Each row in features contains all the features for a single frame.
    feature_names : [str]
        Names of each feature located at the specified index.
    """
    sample_step = int(rate * window_step)
    sample_length = int(rate * window_length)
    (features, feature_names) = audioFeatureExtraction.stFeatureExtraction(
        signal, rate, sample_length, sample_step)
    return features, feature_names

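# A minimal usage sketch for get_st_features (not from the original sources), assuming a
# hypothetical mono WAV file "speech.wav" and that scipy is available for reading it.
from scipy.io import wavfile

rate, signal = wavfile.read("speech.wav")  # sample rate in Hz, raw samples
features, feature_names = get_st_features(signal, rate)
print(len(feature_names), "features per window; feature array shape:", features.shape)
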
def load_validation_set():
    """
    Output
        a tuple of features: (fft features, mfcc features, mean-std features)
    Description
        Extracts three types of features from the validation set.
    """
    ffts = dict()
    mfccs = dict()
    mean_stds = dict()
    for i in validation_ids:
        path = './validation/validation.{i}.wav'.format(i=i)
        _, X = read_wav(path)
        # FFT
        fft = np.array(abs(sp.fft(X)[:1000]))
        ffts.update({i: fft})
        # MFCC
        ceps, mspec, spec = mfcc(X)
        num_ceps = len(ceps)
        x = np.mean(ceps[int(num_ceps * 1 / 10):int(num_ceps * 9 / 10)], axis=0)
        mfccs.update({i: x})
        # Mean-Std
        [Fs, x] = audioBasicIO.readAudioFile(path)
        F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050 * Fs, 0.025 * Fs)
        mean_std = []
        for f in F:
            mean_std.extend([f.mean(), f.std()])
        mean_stds.update({i: np.array(mean_std)})
    return (ffts, mfccs, mean_stds)

def pitchProc(self):
    print('pitchProc = ' + (self.fname))
    [Fs, x] = audioBasicIO.readAudioFile(self.fname)
    info = audioFeatureExtraction.stFeatureExtraction(
        x, Fs, 0.050 * Fs, 0.025 * Fs)
    print(len(x))
    return info[0][1]

def classifyEmotion(filePath):
    print("[INFO] Loading sound file")
    [Fs, x] = audioBasicIO.readAudioFile(filePath)
    x = audioBasicIO.stereo2mono(x)
    features, _ = audioFeatureExtraction.stFeatureExtraction(
        x, Fs, FRAME_SIZE * Fs, FRAME_SIZE / 2 * Fs)
    inputArray = np.expand_dims(features, axis=2)

    first_layer = model.get_layer(index=0)
    required_input_shape = first_layer.get_config()['batch_input_shape'][1:]

    # Adjust input to match the required shape
    if required_input_shape[1] > inputArray.shape[1]:
        zerosArray = np.zeros(
            (required_input_shape[0],
             required_input_shape[1] - inputArray.shape[1], 1),
            dtype=inputArray.dtype)
        inputArray = np.concatenate((inputArray, zerosArray), axis=1)
    else:
        inputArray = inputArray[:, :required_input_shape[1], :]

    print("[INFO] classifying sound...")
    proba = model.predict(np.expand_dims(inputArray, axis=0))[0]
    idx = np.argmax(proba)
    label = lb.classes_[idx]

    label_with_predictions = {}
    for i in range(len(proba)):
        label_with_predictions[lb.classes_[i]] = proba[i]

    print("[INFO] Probabilities:", label_with_predictions)
    print("[INFO] Prediction {}".format(label))

    return label

def extract_features3(self, Fs, x):
    x = audioBasicIO.stereo2mono(x)  # necessary conversion for pyAudioAnalysis
    # print len(x)  # they must be 24k samples
    # coef = int(np.floor(len(x)/48000))
    # x = x[range(0,len(x),6)]
    # print len(x)
    # Fs = 16000
    features = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.05 * Fs, 0.025 * Fs)
    if len(features) == 0:
        features = np.zeros((34, 2))
    features_mean = np.mean(features, axis=1)
    features_std = np.std(features, axis=1)
    features_kurtosis = kurtosis(features, axis=1)
    features_skew = skew(features, axis=1)
    vec4moments = np.append(
        np.append(np.append(features_mean, features_std), features_kurtosis),
        features_skew)
    result = np.asarray(vec4moments).reshape(len(vec4moments), -1).transpose()
    # print(np.shape(result))
    # features_complete = np.append(features_complete, features, axis=0)
    return result  # vec4moments  # _complete

def extract_features(dataset):
    data = dataset.data
    nb_samples = len(dataset.targets)
    frame_size = dataset.frame_size
    step = dataset.step

    f_global = []

    i = 0
    for (x, Fs) in data:
        # 34D short-term feature
        f = audioFeatureExtraction.stFeatureExtraction(x, Fs, frame_size * Fs, step * Fs)

        # for pyAudioAnalysis versions that support Python 3 (tuple return)
        if type(f) is tuple:
            f = f[0]

        # Harmonic ratio and pitch, 2D
        hr_pitch = audioFeatureExtraction.stFeatureSpeed(x, Fs, frame_size * Fs, step * Fs)
        f = np.append(f, hr_pitch.transpose(), axis=0)

        # Z-normalized
        f = stats.zscore(f, axis=0)

        f = f.transpose()
        f = np.mean(f, axis=0)
        f_global.append(f)

        sys.stdout.write("\033[F")
        i = i + 1
        print("\t Extracting features " + str(i) + '/' + str(nb_samples) + " from data set...")

    return f_global

def extract_features(path):
    print 'extract feature of test set'
    test_pkl = 'test34.pkl'
    if os.path.isfile(test_pkl):
        [test_set, list_filenames] = cPickle.load(open(test_pkl, 'rb'))
        return test_set, list_filenames

    test_set = []
    list_filenames = sorted(os.listdir(path))
    for filename in list_filenames:
        path_to_file = os.path.join(path, filename)
        [rate, sig] = audioBasicIO.readAudioFile(path_to_file)
        if (rate == -1 and sig == -1):
            # convert to wav
            # command = "ffmpeg -i {}".format(path_to_file)
            extension = os.path.splitext(filename)[-1]
            new_file = path_to_file.replace(extension, '.wav')
            command = "ffmpeg -i {} {}".format(path_to_file, new_file)
            os.system(command)
            [rate, sig] = audioBasicIO.readAudioFile(new_file)
            os.system('rm {}'.format(path_to_file))  # remove old file not in *.wav format
        if sig.ndim >= 2:  # merge multiple channels into a mono channel
            sig = np.mean(sig, axis=1)
        features = audioFeatureExtraction.stFeatureExtraction(
            sig, rate, win * rate, step * rate)
        features = features.reshape((features.shape[1], -1))
        test_set.append(features)

    cPickle.dump([test_set, list_filenames], open(test_pkl, 'wb'), -1)
    return test_set, list_filenames

def pitchProc2(self, results_dict):
    print("pitchProc2")
    [Fs, x] = audioBasicIO.readAudioFile(self.fname)
    info = audioFeatureExtraction.stFeatureExtraction(
        x, Fs, 0.050 * Fs, 0.025 * Fs)
    results_dict["pitch"] = info[0][1]
    return info[0][1]

def newFeatures(inFile, isVAD):
    # read audio data from file
    # [Fs, x] = aIO.readAudioFile(inFile)
    x, Fs = librosa.load(inFile, sr=16000)  # sf.read(inFile)

    # window and overlap size
    win = int(0.025 * Fs)
    step = int(0.010 * Fs)

    # get short-time features
    Feats = aFE.stFeatureExtraction(x, Fs, win, step)

    if isVAD:
        energy = Feats[1]
        thv = energy.mean() * 0.1
        i_speechs = np.where(energy > thv)[0]
        Feats = Feats[:, i_speechs]

    # saveFeats(Feats)
    Feats = np.transpose(Feats[8:21, :])
    newFeat = []
    for row in range(len(Feats)):
        newFeat.append(Feats[row, :])

    return newFeat

def predict_genre(song_path):
    optimal_training_features = load(OPTIMAL_TRAINING_FEATURES_DIR)
    training_data = load(OPTIMAL_TRAINING_FEATURES_DATA_DIR).item()
    X_train = training_data['X_train']
    y_train = training_data['y_train']
    x_test = []

    Fs, x = audioBasicIO.readAudioFile(song_path)
    F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050 * Fs, 0.025 * Fs)

    features_to_extract = sorted(optimal_training_features, key=itemgetter('feature_id'))
    for feature in features_to_extract:
        feature_index = feature['feature_id'] - 1
        feature_prop = feature['feature_prop']
        feature_data = None
        if feature_prop == 'min':
            feature_data = min(F[feature_index, :])
        elif feature_prop == 'mean':
            feature_data = mean(F[feature_index, :])
        elif feature_prop == 'max':
            feature_data = max(F[feature_index, :])
        x_test.append(feature_data)

    X_train_scale, x_test_scale = scale_data_multiple(X_train, x_test)
    prediction = k_nearest_neighbor.k_nearest_neighbor(X_train_scale, y_train,
                                                       x_test_scale, training_data['k'])
    return prediction

def getClassIDsToClipFramesMFCCs(classIDsToClipsMap):
    classToMFCCsOfClips = {}
    for classID in classIDsToClipsMap:
        classToMFCCsOfClips[classID] = []
        for singleEventAudioClip in classIDsToClipsMap[classID]:
            # stFeatureExtraction returns a matrix that looks like:
            """
                   zcr | energy | energy_entropy | .... | mfcc_0 | mfcc_1 | ... | ... |
            win0  value   value      ----
            win1  value    ----
            .
            .
            .
            """
            # so featuresMatrix[0] is a list of the zcr for each window, and featuresMatrix[0][0]
            # would be the zcr for the very first window.
            # We care about featuresMatrix[8:21] for the 13 MFCCs.
            featuresMatrix, featureNames = fe.stFeatureExtraction(
                singleEventAudioClip, sf, window_size, step_size)

            mfccsForEachWindow = []
            # for each window, loop through indices 8-20 to get all of its MFCCs into one list,
            # and compile all of that into a bigger list
            for i in range(0, len(featuresMatrix[8])):
                mfccsForOneWindow = []
                for j in range(8, 21):
                    mfccsForOneWindow.append(featuresMatrix[j][i])
                mfccsForEachWindow.append(mfccsForOneWindow)

            # now we have a list that looks like [[mfcc0, mfcc1, mfcc2, ...], [mfcc0, ...], ...]
            # for each window in one clip
            classToMFCCsOfClips[classID].append(mfccsForEachWindow)

    return classToMFCCsOfClips

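# A shorter equivalent of the nested loops above (added for illustration), assuming
# featuresMatrix is the (n_features x n_windows) array returned by stFeatureExtraction:
# rows 8..20 hold the 13 MFCCs, so slicing and transposing yields one
# [mfcc_0, ..., mfcc_12] list per window.
import numpy as np

def mfccs_per_window(featuresMatrix):
    return np.asarray(featuresMatrix)[8:21, :].T.tolist()
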
def extractFeatures():
    files = []
    print "Getting example names..."
    with open('./labels') as f:
        for line in f:
            files.append("_".join(line.split("\t")[0].split("_")[:-1]) + '.wav')
        f.close()
    print "Walking through files to extract features..."
    for dirpath, dirnames, filenames in os.walk(CLIPS_DIR_PATH):
        for f in filenames:
            if f in files:
                print "Extracting for", f
                path = './clips/clips/' + f
                audiofile = AudioSegment.from_file(path)
                data = np.fromstring(audiofile._data, np.int16)
                Fs = audiofile.frame_rate
                x = []
                for chn in xrange(audiofile.channels):
                    x.append(data[chn::audiofile.channels])
                x = np.array(x).T
                if x.ndim == 2:
                    if x.shape[1] == 1:
                        x = x.flatten()
                try:
                    features = audioFeatureExtraction.stFeatureExtraction(
                        x, Fs, 0.050 * Fs, 0.025 * Fs).T
                    np.save(FEATURE_PICKLES + f, features)
                except ValueError as e:
                    print e

def process_request():
    data = request.get_json(force=True)
    feature_type = data['feat_type']
    assert feature_type in ['raw', 'mfcc', 'all']
    model_input = data['m_in']
    y = pd.read_json(data['feat']).as_matrix().squeeze()

    # get the sampled raw signal
    if feature_type == 'raw':
        sr = data['sr']
        if model_input == 'mfcc':
            feat = librosa.feature.mfcc(y=y, sr=int(sr), n_mfcc=40).T  # (S, 40)
            feat = th.FloatTensor(feat).unsqueeze(0)
        elif model_input == 'all':
            all_feats, f_names = audioFeatureExtraction.stFeatureExtraction(
                y, sr, 2048, 512)
            feat = th.FloatTensor(all_feats.T).unsqueeze(0)
    # get the mfcc directly
    elif feature_type == 'mfcc':
        feat = th.FloatTensor(y).view(-1, 40).unsqueeze(0)
    elif feature_type == 'all':
        feat = th.FloatTensor(y).view(-1, 34).unsqueeze(0)

    # load model
    model = SER(h_size=200, feat_size=feat.size(2), class_num=4, dropout=0.)
    # model.cuda()
    model.eval()
    # model.load_state_dict(th.load('checkpoint/model.pt'))
    pred = model(feat, [feat.size(1)], None, None)
    pred = pred.max(dim=1)[1].item()

    return str(pred)

def __init__(self, filename):
    [Fs, x] = audioBasicIO.readAudioFile(filename)
    F = audioFeatureExtraction.stFeatureExtraction(
        np.mean(x, axis=1) if x.ndim == 2 else x, Fs, 0.050 * Fs, 0.025 * Fs)
    # print (F[0][1])
    self.input_from_audio = F[0][1]

def build_MFCC_for_one_sound_slice(folder, sound_slice):
    '''
    Builds the MFCC coefficients, given the sound_slice and the folder containing it.

    :param folder: str, the folder name where sound_slice lives
    :param sound_slice: str, must be in .wav format (not .mp3) in order to apply pyAudioAnalysis
    :return: array
    '''
    sound_slice_fullname = os.path.join(folder, sound_slice)
    [Fs, x] = audioBasicIO.readAudioFile(sound_slice_fullname)
    F, f_names = audioFeatureExtraction.stFeatureExtraction(
        x, Fs, 0.050 * Fs, 0.025 * Fs)
    G = resample_matrix(F, num=15)  # see the definition of resample_matrix above
    # feat_list = []
    # feat_list.append(G)

    '''Below, we extract the MFCCs from the features extracted and then resampled above.
    For each i, MFCC_list[i] gives, for the i-th audio, all 13 feature vectors, each of them
    the 15-dimensional resampled values of the original feature vectors.'''
    MFCC_list = []
    # for i in range(len(feat_list)):
    #     MFCC = feat_list[i][8:21, :]  # 9th to 21st features are the MFCC coeffs
    MFCC = G[8:21, :]  # 8th to 20th (0-indexed) features are the MFCC coeffs
    MFCC_flat = np.ndarray.flatten(MFCC)  # flattening the array, but are we destroying the time-series structure?
    MFCC_flattened_as_list = list(MFCC_flat)  # MFCC_flat was an np array, so convert to a list to avoid dims like [1, foo, bar]
    # MFCC_list.append(MFCC_flat_as_list)
    # MFCC_array = np.asarray(MFCC_list)
    return MFCC_flattened_as_list

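# A hedged usage sketch for build_MFCC_for_one_sound_slice (not part of the original code),
# assuming a hypothetical directory "sound_slices" of .wav slices; each call returns the
# flattened 13 x 15 resampled MFCC block described in the docstring above.
import os
import numpy as np

slice_folder = "sound_slices"  # hypothetical directory
mfcc_rows = [build_MFCC_for_one_sound_slice(slice_folder, f)
             for f in sorted(os.listdir(slice_folder)) if f.endswith(".wav")]
mfcc_array = np.asarray(mfcc_rows)  # shape: (num_slices, 13 * 15)
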
def extract_mfcc_features(filename):
    print(filename)
    [Fs, x] = audioBasicIO.readAudioFile(filename)
    x = audioBasicIO.stereo2mono(x)
    F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050 * Fs, 0.025 * Fs)
    mfcc = F[8:21, :]  # 13 MFCC features
    return mfcc

def beatExtractionWrapper(wav_file, plot):
    if not os.path.isfile(wav_file):
        raise Exception("Input audio file not found!")
    [fs, x] = audioBasicIO.readAudioFile(wav_file)
    F, _ = aF.stFeatureExtraction(x, fs, 0.050 * fs, 0.050 * fs)
    bpm, ratio = aF.beatExtraction(F, 0.050, plot)
    print("Beat: {0:d} bpm ".format(int(bpm)))
    print("Ratio: {0:.2f} ".format(ratio))

def read_files(files):
    X = []
    for fn in tqdm(files):
        y, sr = librosa.load(fn, sr=8000)
        y = preprocess(y)
        features = audioFeatureExtraction.stFeatureExtraction(y, sr, 0.10 * sr, 0.05 * sr)
        X.extend(features)
    return X

def featureExtractor(fileName):
    [Fs, x] = audioBasicIO.readAudioFile(fileName)
    Features = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.001 * Fs, 0.0003 * Fs)
    MFCCs = []
    for index in range(len(Features)):
        MFCCs.append(float(np.mean(Features[index])))
    return MFCCs

def beatExtractionWrapper(wavFileName, plot):
    if not os.path.isfile(wavFileName):
        raise Exception("Input audio file not found!")
    [Fs, x] = audioBasicIO.readAudioFile(wavFileName)
    F = aF.stFeatureExtraction(x, Fs, 0.050 * Fs, 0.050 * Fs)
    BPM, ratio = aF.beatExtraction(F, 0.050, plot)
    print("Beat: {0:d} bpm ".format(int(BPM)))
    print("Ratio: {0:.2f} ".format(ratio))

def getStVectorPerWav(wavFile, stWin, stStep):
    # given a wav, get entire sT features
    [Fs, x] = getTotalAudio([wavFile])
    ShortTermFeatures = aF.stFeatureExtraction(x, Fs, stWin * Fs, stStep * Fs)
    [featuresNormSS, MEANSS, STDSS] = aT.normalizeFeatures(
        [ShortTermFeatures])  # normalize to 0-mean 1-std
    [X, y] = featureListToVectors([featuresNormSS])
    return X, y, Fs

def _get_feature(file_path):
    """Use pyAudioAnalysis to get the feature names and their corresponding values."""
    # [fs, x] = audioBasicIO.readAudioFile(file_path)
    sr, x = read(file_path)
    x = audioBasicIO.stereo2mono(x)
    f, f_names = audioFeatureExtraction.stFeatureExtraction(x, sr, 0.050 * sr, 0.025 * sr)
    return f_names, [np.mean(fm) for fm in f]

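# A minimal usage sketch for _get_feature (illustrative only), assuming a hypothetical
# WAV path; it pairs each feature name with the clip-level mean returned above.
names, values = _get_feature("example.wav")
for name, value in zip(names, values):
    print(name, value)
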
def extract_mfcc(signal: np.ndarray,
                 sample_rate: int = 44100,
                 window: float = 0.5,
                 stride: float = 0.25):
    feats, f_names = audioFeatureExtraction.stFeatureExtraction(
        signal, sample_rate, sample_rate * window, stride * sample_rate)
    return feats.T, f_names

def preProcess(fileName):
    [Fs, x] = audioBasicIO.readAudioFile(fileName)
    if (len(x.shape) > 1 and x.shape[1] == 2):
        x = np.mean(x, axis=1, keepdims=True)
    else:
        x = x.reshape(x.shape[0], 1)
    F, f_names = audioFeatureExtraction.stFeatureExtraction(
        x[:, 0], Fs, 0.050 * Fs, 0.025 * Fs)
    return (f_names, F)

def get_desired_features(self, int_audio_frames):
    feature_vector = fe.stFeatureExtraction(int_audio_frames, self.sample_rate,
                                            self.frame_size_samples,
                                            self.frame_step_samples)
    output = []
    for desired_index in self.desired_features_indices:
        output.append(feature_vector[desired_index])
    return np.array(output)

def analyze_audio_response(filename):
    # This function analyzes the yes/no response of a .wav file and returns a
    # 34 x (number of time frames, depending on how long the response is) matrix
    from pyAudioAnalysis import audioBasicIO
    from pyAudioAnalysis import audioFeatureExtraction
    import matplotlib.pyplot as plt
    [Fs, x] = audioBasicIO.readAudioFile(filename)
    F = audioFeatureExtraction.stFeatureExtraction(x[:, 0], Fs, Fs * 0.05, Fs * 0.025)
    return F

def extract_features(self, file_path):
    [Fs, x] = audioBasicIO.readAudioFile(file_path)
    x = audioBasicIO.stereo2mono(x)  # necessary conversion for pyAudioAnalysis
    features = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.05 * Fs, 0.025 * Fs)
    features = np.mean(features, axis=1)
    features = np.asarray(features).reshape(len(features), -1).transpose()
    # features_complete = np.append(features_complete, features, axis=0)
    return features  # _complete

def process_mp3_files():
    files = read_input()
    os.system("touch test.wav")
    for mp3_file in files:
        mean_value = []
        sound = AudioSegment.from_mp3(mp3_file)
        sound.export("test.wav", format="wav")
        # print mp3_file
        [Fs, x] = audioBasicIO.readAudioFile("test.wav")
        F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050 * Fs, 0.025 * Fs)
        for i in range(len(F)):
            mean_value.append(numpy.mean(F[i]))
        compute_emotion(mean_value)

def add_audio_feature_extraction(Fs, x, label_id, features, labels):
    """
    Input
        Fs: frequency
        x: signal
        label_id: label (genre) id
        features: array of features
        labels: array of labels
    Description
        Extracts a bunch of features listed here
        (https://github.com/tyiannak/pyAudioAnalysis/wiki/3.-Feature-Extraction)
        from x and appends them to features.
    """
    F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050 * Fs, 0.025 * Fs)
    features.append(F)
    labels.append(label_id)

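# A hedged driver sketch for add_audio_feature_extraction (not from the original code),
# assuming hypothetical (wav_path, genre_id) pairs and the readAudioFile API used above.
features, labels = [], []
for wav_path, genre_id in [("blues/track01.wav", 0), ("rock/track01.wav", 1)]:
    [Fs, x] = audioBasicIO.readAudioFile(wav_path)
    add_audio_feature_extraction(Fs, x, genre_id, features, labels)
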
def showFeatures(name):
    print("processing - " + name)
    [Fs, x] = audioBasicIO.readAudioFile(name)
    # print(x)
    F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.50 * Fs, 0.25 * Fs)
    # print(x.size, Fs, 0.50 * Fs, 0.25 * Fs)

    # a = F[0, :]
    # numpy.savetxt("foo.csv", a, delimiter=",")

    # plt.subplot(3, 1, 1)
    # plt.plot(F[0, :])
    # plt.xlabel('Frame no')
    # plt.ylabel('ZCR')
    #
    # plt.subplot(3, 1, 2)
    # plt.plot(F[1, :])
    # plt.xlabel('Frame no')
    # plt.ylabel('Energy')
    #
    # plt.subplot(3, 1, 3)
    # plt.plot(F[3, :])
    # plt.xlabel('Frame no')
    # plt.ylabel('SC')
    #
    # plt.show()

    # items = ' '.join(map(str, a))
    # print(items)
    # print("--", F[0, :])

    vec = [
        F[0, :].mean(), F[1, :].mean(), F[4, :].mean(), F[5, :].mean(),
        F[6, :].mean(), F[7, :].mean(), F[0, :].std(), F[1, :].std(),
        F[4, :].std(), F[5, :].std(), F[6, :].std(), F[7, :].std()
    ]
    vecstr = ' '.join(map(str, vec))
    print("vector in audio.py : ", vecstr)
    melfeat = melfeature(F)
    # chromafeat = chromafeature(F)
    return vecstr + " " + melfeat

def main(argv):
    if argv[1] == "-shortTerm":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.clock()
            F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050 * Fs, 0.050 * Fs)
            t2 = time.clock()
            perTime1 = duration / (t2 - t1)
            print "short-term feature extraction: {0:.1f} x realtime".format(perTime1)
    elif argv[1] == "-classifyFile":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.clock()
            aT.fileClassification("diarizationExample.wav", "svmSM", "svm")
            t2 = time.clock()
            perTime1 = duration / (t2 - t1)
            print "Mid-term feature extraction + classification \t {0:.1f} x realtime".format(perTime1)
    elif argv[1] == "-mtClassify":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.clock()
            [flagsInd, classesAll, acc] = aS.mtFileClassification("diarizationExample.wav",
                                                                  "svmSM", "svm", False, '')
            t2 = time.clock()
            perTime1 = duration / (t2 - t1)
            print "Fix-sized classification - segmentation \t {0:.1f} x realtime".format(perTime1)
    elif argv[1] == "-hmmSegmentation":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.clock()
            aS.hmmSegmentation('diarizationExample.wav', 'hmmRadioSM', False, '')
            t2 = time.clock()
            perTime1 = duration / (t2 - t1)
            print "HMM-based classification - segmentation \t {0:.1f} x realtime".format(perTime1)
    elif argv[1] == "-silenceRemoval":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.clock()
            [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav")
            segments = aS.silenceRemoval(x, Fs, 0.050, 0.050, smoothWindow=1.0,
                                         Weight=0.3, plot=False)
            t2 = time.clock()
            perTime1 = duration / (t2 - t1)
            print "Silence removal \t {0:.1f} x realtime".format(perTime1)
    elif argv[1] == "-thumbnailing":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.readAudioFile("scottish.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.clock()
            [A1, A2, B1, B2, Smatrix] = aS.musicThumbnailing(x1, Fs1, 1.0, 1.0, 15.0)  # find thumbnail endpoints
            t2 = time.clock()
            perTime1 = duration1 / (t2 - t1)
            print "Thumbnail \t {0:.1f} x realtime".format(perTime1)
    elif argv[1] == "-diarization-noLDA":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.readAudioFile("diarizationExample.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.clock()
            aS.speakerDiarization("diarizationExample.wav", 4, LDAdim=0, PLOT=False)
            t2 = time.clock()
            perTime1 = duration1 / (t2 - t1)
            print "Diarization \t {0:.1f} x realtime".format(perTime1)
    elif argv[1] == "-diarization-LDA":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.readAudioFile("diarizationExample.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.clock()
            aS.speakerDiarization("diarizationExample.wav", 4, PLOT=False)
            t2 = time.clock()
            perTime1 = duration1 / (t2 - t1)
            print "Diarization \t {0:.1f} x realtime".format(perTime1)

raise Exception("Dimension of the covariance matrix and data should match") invcov = cov.T mean = np.reshape(mean, (1, n)) x = x - (np.ones((d, 1))*mean).T fact = np.sum(((np.dot(invcov, x))*x), axis = 1) y = np.exp(-0.5*fact) y = np.divide(y, math.pow((2*math.pi), n)*np.std(cov)) return y # feature extraction from the library pyAudioAnalysis attribute = audioFeatureExtraction.stFeatureExtraction(x, Fs, SIZE_OF_WINDOW, SIZE_OF_STEP) # relationship between the similarity and the timestamp in the audio relation = [[1 for col in range(2)] for row in range(attribute.shape[1]/BLOCK_STEP)] while (END_OF_FILE == 0): if (FIRST_PAIR == 1): # for the first pari of the block block_i_index_start = 0 block_i_index_end = BLOCK_SIZE block_i_attribute = getMFCCs(block_i_index_start, block_i_index_end) block_i_mean = np.mean(block_i_attribute, axis=1) block_i_cov = np.cov(block_i_attribute) block_i_log_like = np.log(gauss(block_i_attribute, mean=block_i_mean, cov=block_i_cov))
def features(filename, tag):
    signal, sampfreq = lr.load(filename)
    features = afe.stFeatureExtraction(signal, sampfreq, 0.050 * sampfreq, 0.025 * sampfreq)
    return signal, sampfreq, features

def train_classifier():
    data_set = []
    for file in os.listdir("training_dataset/unhappy"):
        temp = []
        mean_value = []
        if file.endswith(".mp3"):
            # print "training_dataset/unhappy/"+file
            sound = AudioSegment.from_mp3("training_dataset/unhappy/" + file)
            sound.export("test.wav", format="wav")
            [Fs, x] = audioBasicIO.readAudioFile("test.wav")
            F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.05 * Fs, 0.025 * Fs)
            for i in range(len(F)):
                temp.append(numpy.mean(F[i]))
            mean_value.append(temp)
            mean_value.append(1)
            data_set.append(mean_value)
    for file in os.listdir("training_dataset/happy"):
        temp = []
        mean_value = []
        if file.endswith(".mp3"):
            # print "training_dataset/happy/"+file
            sound = AudioSegment.from_mp3("training_dataset/happy/" + file)
            sound.export("test.wav", format="wav")
            [Fs, x] = audioBasicIO.readAudioFile("test.wav")
            F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.05 * Fs, 0.025 * Fs)
            for i in range(len(F)):
                temp.append(numpy.mean(F[i]))
            mean_value.append(temp)
            mean_value.append(2)
            data_set.append(mean_value)
    for file in os.listdir("training_dataset/angry"):
        temp = []
        mean_value = []
        if file.endswith(".mp3"):
            # print "training_dataset/angry/"+file
            sound = AudioSegment.from_mp3("training_dataset/angry/" + file)
            sound.export("test.wav", format="wav")
            [Fs, x] = audioBasicIO.readAudioFile("test.wav")
            F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.05 * Fs, 0.025 * Fs)
            for i in range(len(F)):
                temp.append(numpy.mean(F[i]))
            mean_value.append(temp)
            mean_value.append(3)
            data_set.append(mean_value)
    for file in os.listdir("training_dataset/neutral"):
        temp = []
        mean_value = []
        if file.endswith(".mp3"):
            # print "training_dataset/neutral/"+file
            sound = AudioSegment.from_mp3("training_dataset/neutral/" + file)
            sound.export("test.wav", format="wav")
            [Fs, x] = audioBasicIO.readAudioFile("test.wav")
            F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.05 * Fs, 0.025 * Fs)
            for i in range(len(F)):
                temp.append(numpy.mean(F[i]))
            mean_value.append(temp)
            mean_value.append(4)
            data_set.append(mean_value)

    x = []
    y = []
    for i in range(len(data_set)):
        x.append(data_set[i][0])
        y.append(data_set[i][1])

    clf = RandomForestClassifier(n_estimators=30, max_features=6, max_depth=None,
                                 min_samples_split=1, bootstrap=True)
    clf = clf.fit(x, y)
    f2 = open("classifier.pickle", "wb")
    pickle.dump(clf, f2)
    f2.close()

from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
import matplotlib.pyplot as plt

[Fs1, x1] = audioBasicIO.readAudioFile("happy.wav")
[Fs2, x2] = audioBasicIO.readAudioFile("sad.wav")
# Fs is the sampling frequency
# x is the raw sample data

th = 100  # fixed feature length
k12 = (len(x1) - 800) / th / float(Fs1)
k22 = (len(x2) - 800) / th / float(Fs2)

F1, f_names1 = audioFeatureExtraction.stFeatureExtraction(x1, Fs1, 0.05 * Fs1, k12 * Fs1)
F2, f_names2 = audioFeatureExtraction.stFeatureExtraction(x2, Fs2, 0.05 * Fs2, k22 * Fs2)

# stFeatureExtraction(signal, fs, win, step):
#   signal: the input signal samples
#   fs:     the sampling freq (in Hz)
#   win:    the short-term window size (in samples)
#   step:   the short-term window step (in samples)
'''
here, window size = 0.05*Fs = 0.05*16000 = 800
step size = 0.025*Fs = 0.025*16000 = 400
we can get n frames from a signal of length 23776:
400*n + 800 = 23776  ->  n = 57.44, i.e. 58 as below
F.shape = (34, 58)
'''

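# A small helper (added for illustration) reflecting the frame-count arithmetic in the
# comment above: with a window of `win` samples and a hop of `step` samples, a signal of
# n_samples yields roughly floor((n_samples - win) / step) + 1 short-term frames.
def approx_num_frames(n_samples, win, step):
    return int((n_samples - win) // step) + 1

# e.g. approx_num_frames(23776, 800, 400) == 58, matching F.shape = (34, 58) above
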
def main(path):
    ds = Dataset(path)
    loader = Loader(path + "/train/", 32, 16)
    X = []
    y = []
    Z = []
    ii = 0
    for p in ds.trainTracks():
        f = p.split("/")
        name = f[len(f) - 1]
        labelTeller = loader.loadLabelsForSoundfile(name)
        [Fs, x] = audioBasicIO.readAudioFile(p)
        F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.032 * Fs, 0.016 * Fs)
        G = zip(*F)
        N = 0
        if len(G) > labelTeller.tellNoOfAllBlocks():
            N = labelTeller.tellNoOfAllBlocks()
        else:
            N = len(G)
        for i in range(N):
            Z.append([G[i], labelTeller.tell(i)])
        # i = 0
        # for w in ds.windows(x, 44100, 1410, 705):
        #     mf = mfcc(w)
        #     row = [i]
        #     Z.append([mf[0], labelTeller.tell(i)])
        #     i = i + 1
        print p + " " + str(ii) + "/61"
        ii = ii + 1

    print "shuffle"
    random.shuffle(Z)
    Z = zip(*Z)

    NN = 20000
    L = NN
    R = NN
    FINAL = [[], []]
    for i in range(len(Z[0])):
        if Z[1][i] == "sing" and L > 0:
            L = L - 1
            FINAL[0].append(Z[0][i])
            FINAL[1].append(Z[1][i])
        if Z[1][i] == "nosing" and R > 0:
            R = R - 1
            FINAL[0].append(Z[0][i])
            FINAL[1].append(Z[1][i])

    clf = svm.SVC(cache_size=2000)
    print "######### " + str(len(Z[0]))
    clf.fit(FINAL[0], FINAL[1])

    loader = Loader(path + "/test/", 32, 16)
    print "Loading test"
    for p in ds.validationTracks():
        X = []
        y = []
        f = p.split("/")
        name = f[len(f) - 1]
        labelTeller = loader.loadLabelsForSoundfile(name)
        i = 0
        [Fs, x] = audioBasicIO.readAudioFile(p)
        F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.032 * Fs, 0.016 * Fs)
        G = zip(*F)
        N = 0
        if len(G) > labelTeller.tellNoOfAllBlocks():
            N = labelTeller.tellNoOfAllBlocks()
        else:
            N = len(G)
        for i in range(N):
            X.append(G[i])
            y.append(labelTeller.tell(i))
        print "Starting prediction " + p
        Y = clf.predict(X)
        ok = 0
        al = 0
        for i in range(len(y)):
            if y[i] == Y[i]:
                ok = ok + 1
            al = al + 1
        print ok / float(al)

from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
import matplotlib.pyplot as plt

[Fs, x] = audioBasicIO.readAudioFile("../audio_data/doremi.wav")
print Fs
print len(x)

# using a frame size of 50 msecs and a frame step of 25 msecs (50% overlap)
F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050 * Fs, 0.025 * Fs)
"""
stFeatureExtraction

This function implements the short-term windowing process. For each short-term
window a set of features is extracted. This results in a sequence of feature
vectors, stored in a numpy matrix.

ARGUMENTS
    signal: the input signal samples
    Fs:     the sampling freq (in Hz)
    Win:    the short-term window size (in samples)
    Step:   the short-term window step (in samples)
RETURNS
    stFeatures: a numpy array (numOfFeatures x numOfShortTermWindows)
"""
print len(F)

plt.subplot(2, 1, 1)
plt.plot(F[0, :])
plt.xlabel('Frame no')
plt.ylabel('ZCR')

plt.subplot(2, 1, 2)
plt.plot(F[1, :])
plt.xlabel('Frame no')
plt.ylabel('Energy')

plt.show()