import numpy as np
import pandas
from scipy.signal import butter, filtfilt

# uf (audio utilities), fa (feature extraction), ccc2 and the module-level
# parameters referenced below (frames_per_annotation, frames_delay,
# TARGET_DELAY, SEQ_LENGTH, ref_mean, ref_std, feats_per_valence,
# latent_extractor, valence_model) are defined elsewhere in this project.

def preprocess_datapoint(input_sound, input_annotation):
    '''
    Generate the predictors (STFT features) and target (valence sequence)
    for one sound file of the OMG dataset.
    '''
    sr, samples = uf.wavread(input_sound)  #read audio
    e_samples = uf.preemphasis(samples, sr)  #apply preemphasis
    feats = fa.extract_features(e_samples)  #extract features
    annotation = pandas.read_csv(input_annotation)  #read annotations
    annotation = annotation.values
    annotation = np.reshape(annotation, annotation.shape[0])
    annotated_frames = int(len(annotation) * frames_per_annotation)
    feats = feats[:annotated_frames]  #discard non-annotated final frames
    annotation = annotation[TARGET_DELAY:]  #shift annotations back by TARGET_DELAY
    feats = feats[:-frames_delay]  #trim the matching delay from the features to keep alignment
    return feats, annotation
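
# For reference, a minimal sketch of what uf.preemphasis is assumed to do:
# a standard first-order high-pass filter, y[n] = x[n] - coeff * x[n-1],
# applied before spectral analysis. The coefficient value and the exact role
# of sr are assumptions, not the project's actual implementation.
def _preemphasis_sketch(samples, coeff=0.97):
    samples = np.asarray(samples, dtype=float)
    #y[0] = x[0]; y[n] = x[n] - coeff * x[n-1] for n >= 1
    return np.append(samples[0], samples[1:] - coeff * samples[:-1])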

def extract_LLD_datapoint(input_sound, input_annotation):
    '''
    Load one audio file and compute the model's last latent dimension.
    '''
    sr, samples = uf.wavread(input_sound)  #load audio
    e_samples = uf.preemphasis(samples, sr)  #apply preemphasis
    predictors = fa.extract_features(e_samples)  #compute power-law spectrum
    #normalize by training mean and std
    predictors = np.subtract(predictors, ref_mean)
    predictors = np.divide(predictors, ref_std)
    final_vec = np.array([])
    #load target
    target = pandas.read_csv(input_annotation)
    target = target.values
    target = np.reshape(target, (target.shape[0]))
    #compute last latent dim for every complete sequence
    start = 0
    while start < (len(target) - SEQ_LENGTH):
        start_features = int(start * frames_per_annotation)
        stop_features = int((start + SEQ_LENGTH) * frames_per_annotation)
        predictors_temp = predictors[start_features:stop_features]
        predictors_temp = predictors_temp.reshape(1, predictors_temp.shape[0], predictors_temp.shape[1])
        features_temp = latent_extractor([predictors_temp])
        features_temp = np.reshape(features_temp, (SEQ_LENGTH, feats_per_valence))
        if final_vec.shape[0] == 0:
            final_vec = features_temp
        else:
            final_vec = np.concatenate((final_vec, features_temp), axis=0)
        print('Progress: ' + str(int(100 * (final_vec.shape[0] / float(len(target))))) + '%')
        start += SEQ_LENGTH
    #compute last latent dim for the trailing frames
    predictors_temp = predictors[-int(SEQ_LENGTH * frames_per_annotation):]
    predictors_temp = predictors_temp.reshape(1, predictors_temp.shape[0], predictors_temp.shape[1])
    features_temp = latent_extractor([predictors_temp])
    features_temp = np.reshape(features_temp, (SEQ_LENGTH, feats_per_valence))
    missing_samples = len(target) - final_vec.shape[0]
    last_vec = features_temp[-missing_samples:]  #keep only the frames not yet covered
    final_vec = np.concatenate((final_vec, last_vec), axis=0)
    return final_vec
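
# A minimal sketch of how the latent_extractor global used above could be
# built, assuming a Keras model whose penultimate layer holds the last latent
# representation. The layer indices are illustrative assumptions; the actual
# extractor is defined elsewhere in the project.
def _build_latent_extractor_sketch(model):
    from keras import backend as K
    #K.function maps the model input to the chosen intermediate output;
    #calling the result with [batch] returns a list of output arrays,
    #matching the latent_extractor([predictors_temp]) call pattern above
    return K.function([model.layers[0].input], [model.layers[-2].output])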

def predict_datapoint(input_sound, input_annotation):
    '''
    Load one audio file and predict its continuous valence.
    '''
    sr, samples = uf.wavread(input_sound)  #load audio
    e_samples = uf.preemphasis(samples, sr)  #apply preemphasis
    predictors = fa.extract_features(e_samples)  #compute power-law spectrum
    #normalize by training mean and std
    predictors = np.subtract(predictors, ref_mean)
    predictors = np.divide(predictors, ref_std)
    #load target
    target = pandas.read_csv(input_annotation)
    target = target.values
    target = np.reshape(target, (target.shape[0]))
    final_pred = []
    #compute prediction for every complete sequence
    start = 0
    while start < (len(target) - SEQ_LENGTH):
        start_features = int(start * frames_per_annotation)
        stop_features = int((start + SEQ_LENGTH) * frames_per_annotation)
        predictors_temp = predictors[start_features:stop_features]
        predictors_temp = predictors_temp.reshape(1, predictors_temp.shape[0], predictors_temp.shape[1])
        prediction = valence_model.predict(predictors_temp)
        for i in range(prediction.shape[1]):
            final_pred.append(prediction[0][i])
        perc = int(float(start) / (len(target) - SEQ_LENGTH) * 100)
        print('Computing prediction: ' + str(perc) + '%')
        start += SEQ_LENGTH
    #compute prediction for the trailing frames
    predictors_temp = predictors[-int(SEQ_LENGTH * frames_per_annotation):]
    predictors_temp = predictors_temp.reshape(1, predictors_temp.shape[0], predictors_temp.shape[1])
    prediction = valence_model.predict(predictors_temp)
    missing_samples = len(target) - len(final_pred)
    reverse_index = np.add(list(reversed(range(missing_samples))), 1)
    for i in reverse_index:  #append only the predictions not yet covered
        final_pred.append(prediction[0][-i])
    final_pred = np.array(final_pred)
    '''
    #grid-search the best prediction shift
    shifted_cccs = []
    time = np.add(1, range(200))
    print('Computing best optimization parameters')
    for i in time:
        t = target.copy()
        p = final_pred.copy()
        t = t[i:]
        p = p[:-i]
        temp_ccc = ccc2(t, p)
        shifted_cccs.append(temp_ccc)
    best_shift = np.argmax(shifted_cccs)
    best_ccc = np.max(shifted_cccs)
    if best_shift > 0:
        best_target = target[best_shift:]
        best_pred = final_pred[:-best_shift]
    else:
        best_target = target
        best_pred = final_pred
    #grid-search the best Butterworth filter parameters
    test_freqs = []
    test_orders = []
    test_cccs = []
    freqs = np.arange(0.01, 0.95, 0.01)
    orders = np.arange(1, 10, 1)
    print('Finding best optimization parameters...')
    for freq in freqs:
        for order in orders:
            test_signal = best_pred.copy()
            b, a = butter(order, freq, 'low')
            filtered = filtfilt(b, a, test_signal)
            temp_ccc = ccc2(best_target, filtered)
            test_freqs.append(freq)
            test_orders.append(order)
            test_cccs.append(temp_ccc)
    best_filter = np.argmax(test_cccs)
    best_order = test_orders[best_filter]
    best_freq = test_freqs[best_filter]
    '''
    #POSTPROCESSING
    #rescale predictions from [0, 1] to [-1, 1]
    final_pred = np.multiply(final_pred, 2.)
    final_pred = np.subtract(final_pred, 1.)
    #apply f_trick (match the training targets' mean and std)
    ann_folder = '../dataset/Training/Annotations'
    target_mean, target_std = uf.find_mean_std(ann_folder)
    final_pred = uf.f_trick(final_pred, target_mean, target_std)
    #apply low-pass Butterworth filter to smooth the prediction
    b, a = butter(3, 0.01, 'low')
    final_pred = filtfilt(b, a, final_pred)
    ccc = ccc2(final_pred, target)  #compute concordance correlation coefficient
    print('CCC = ' + str(ccc))
    '''
    plt.plot(target)
    plt.plot(final_pred, alpha=0.7)
    plt.legend(['target', 'prediction'])
    plt.show()
    '''
    return ccc
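
# For reference, a minimal sketch of the metric ccc2 is assumed to compute:
# Lin's concordance correlation coefficient, the standard metric for
# continuous emotion prediction,
#   CCC = 2*cov(x, y) / (var(x) + var(y) + (mean(x) - mean(y))**2)
def _ccc_sketch(x, y):
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    x_mean, y_mean = np.mean(x), np.mean(y)
    covariance = np.mean((x - x_mean) * (y - y_mean))
    return 2. * covariance / (np.var(x) + np.var(y) + (x_mean - y_mean) ** 2)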