def predict_datapoint(input_sound, input_annotation):
    '''
    loads one audio file and predicts its continuous valence
    '''
    sr, samples = uf.wavread(input_sound)  #load audio
    e_samples = uf.preemphasis(samples, sr)  #apply preemphasis
    predictors = fa.extract_features(e_samples)  #compute power law spectrum
    #normalize by training mean and std
    predictors = np.subtract(predictors, ref_mean)
    predictors = np.divide(predictors, ref_std)
    #load target annotations
    target = pandas.read_csv(input_annotation)
    target = target.values
    target = np.reshape(target, (target.shape[0]))
    final_pred = []
    #compute prediction until last frame
    start = 0
    while start < (len(target) - SEQ_LENGTH):
        start_features = int(start * frames_per_annotation)
        stop_features = int((start + SEQ_LENGTH) * frames_per_annotation)
        predictors_temp = predictors[start_features:stop_features]
        predictors_temp = predictors_temp.reshape(1, predictors_temp.shape[0], predictors_temp.shape[1])
        #predictors_temp = predictors_temp.reshape(1, predictors_temp.shape[0], predictors_temp.shape[1], 1)
        prediction = valence_model.predict(predictors_temp)
        for i in range(prediction.shape[1]):
            final_pred.append(prediction[0][i])
        perc = int(float(start) / (len(target) - SEQ_LENGTH) * 100)
        print "Computing prediction: " + str(perc) + "%"
        start += SEQ_LENGTH
    #compute prediction for the last (partial) window
    predictors_temp = predictors[-int(SEQ_LENGTH * frames_per_annotation):]
    predictors_temp = predictors_temp.reshape(1, predictors_temp.shape[0], predictors_temp.shape[1])
    prediction = valence_model.predict(predictors_temp)
    missing_samples = len(target) - len(final_pred)
    #last_prediction = prediction[0][-missing_samples:]
    reverse_index = np.add(list(reversed(range(missing_samples))), 1)
    for i in reverse_index:
        final_pred.append(prediction[0][-i])
    final_pred = np.array(final_pred)
    '''
    #compute best prediction shift
    shifted_cccs = []
    time = np.add(1, range(200))
    print "Computing best optimization parameters"
    for i in time:
        t = target.copy()
        p = final_pred.copy()
        t = t[i:]
        p = p[:-i]
        #print t.shape, p.shape
        temp_ccc = ccc2(t, p)
        shifted_cccs.append(temp_ccc)
    best_shift = np.argmax(shifted_cccs)
    best_ccc = np.max(shifted_cccs)
    if best_shift > 0:
        best_target = target[best_shift:]
        best_pred = final_pred[:-best_shift]
    else:
        best_target = target
        best_pred = final_pred
    #print 'LEN BEST PRED: ' + str(len(best_pred))
    #compute best parameters for the filter
    test_freqs = []
    test_orders = []
    test_cccs = []
    freqs = np.arange(0.01, 0.95, 0.01)
    orders = np.arange(1, 10, 1)
    print "Finding best optimization parameters..."
    for freq in freqs:
        for order in orders:
            test_signal = best_pred.copy()
            b, a = butter(order, freq, 'low')
            filtered = filtfilt(b, a, test_signal)
            temp_ccc = ccc2(best_target, filtered)
            test_freqs.append(freq)
            test_orders.append(order)
            test_cccs.append(temp_ccc)
    best_filter = np.argmax(test_cccs)
    best_order = test_orders[best_filter]
    best_freq = test_freqs[best_filter]
    '''
    #POSTPROCESSING
    #normalize between -1 and 1
    final_pred = np.multiply(final_pred, 2.)
    final_pred = np.subtract(final_pred, 1.)
    #apply f_trick
    ann_folder = '../dataset/Training/Annotations'
    target_mean, target_std = uf.find_mean_std(ann_folder)
    final_pred = uf.f_trick(final_pred, target_mean, target_std)
    #apply butterworth filter
    b, a = butter(3, 0.01, 'low')
    final_pred = filtfilt(b, a, final_pred)
    ccc = ccc2(final_pred, target)  #compute ccc
    print "CCC = " + str(ccc)
    '''
    plt.plot(target)
    plt.plot(final_pred, alpha=0.7)
    plt.legend(['target', 'prediction'])
    plt.show()
    '''
    return ccc
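
# Sketch (assumption, not part of this repo): uf.preemphasis() used above comes from a
# helper module that is not included in this excerpt, and its exact definition is unknown.
# Pre-emphasis is conventionally a first-order high-pass difference,
# y[n] = x[n] - k * x[n-1] with k around 0.95-0.97, applied before spectral feature
# extraction to boost high frequencies. A minimal stand-in with a hypothetical name
# (the real helper also takes the sample rate, which this sketch ignores):
import numpy as np

def preemphasis_sketch(samples, coefficient=0.97):
    # keep the first sample, subtract a scaled copy of the previous sample from the rest
    return np.append(samples[0], samples[1:] - coefficient * samples[:-1])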
end = end + num_frames
label_slice = labels[start:end]
video_slice = video_data[start:end]
audio_slice = audio_data[start * frames_per_annotation:end * frames_per_annotation]
if not np.array_equal(label_slice, annotations):
    print '{} label slice and annotations do not match! Num annotations different: {}'.format(
        name, (label_slice != annotations).sum())
    # raise Exception('{} label slice and annotations do not match!'.format(name))
predictions = predict_datapoint(audio_slice, video_slice, label_slice)
print predictions[:10], predictions[-10:]
target_mean = np.mean(train_labels)
target_std = np.std(train_labels)
final_pred = uf.f_trick(predictions, target_mean, target_std)
#apply butterworth filter
b, a = butter(1, 0.004, 'low')
final_pred = filtfilt(b, a, final_pred)
# output to csv file
preds = {'valence': final_pred}
df = pd.DataFrame(preds, columns=['valence'])
# change this folder for different models
df.to_csv(model_output_path + name + '.csv', index=None, header=True)
ccc = ccc2(label_slice, final_pred)
print '{} ccc {}'.format(name, ccc)
cccs.append(ccc)
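
# Note on the smoothing step above (standard SciPy behaviour, not repo-specific):
# butter(1, 0.004, 'low') designs a first-order low-pass Butterworth filter whose
# cutoff is expressed as a fraction of the Nyquist frequency, and filtfilt applies it
# forward and backward so the smoothed valence curve is not phase-shifted relative to
# the annotations. Standalone usage with a made-up signal for illustration:
import numpy as np
from scipy.signal import butter, filtfilt

noisy_valence = np.random.uniform(-1., 1., 1000)  # placeholder prediction sequence
b, a = butter(1, 0.004, 'low')                    # same design as in the script above
smoothed_valence = filtfilt(b, a, noisy_valence)  # zero-phase low-pass smoothing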
audio_gen_val = uf.audio_generator(speech_valid_x, validation_target, SEQ_LENGTH,
                                   batch_size, frames_per_annotation)
print 'Dataset successfully loaded'
print 'Getting predictions...'
predictions = valence_model.predict_generator(
    audio_gen_val.generate_no_shuffle(), steps=audio_gen_val.stp_per_epoch)
predictions = predictions.reshape(predictions.shape[0])
# apply f_trick
ann_folder = '../dataset/Training/Annotations'
target_mean, target_std = uf.find_mean_std(ann_folder)
predictions = uf.f_trick(predictions, target_mean, target_std)
#apply butterworth filter
b, a = butter(3, 0.01, 'low')
predictions = filtfilt(b, a, predictions)
print predictions
print validation_target
ccc = ccc2(predictions, validation_target[15:])  #compute ccc
print "CCC = " + str(ccc)
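
# Sketch (assumption, not part of this repo): ccc2() used throughout these scripts is a
# helper that is not shown in this excerpt. Since the concordance correlation coefficient
# is the usual metric for continuous valence prediction, it presumably computes Lin's CCC;
# a minimal reference version is given below under a hypothetical name so it does not
# clash with the real helper:
import numpy as np

def ccc_sketch(x, y):
    x_mean, y_mean = np.mean(x), np.mean(y)
    covariance = np.mean((x - x_mean) * (y - y_mean))
    # Lin's CCC: 2*cov(x, y) / (var(x) + var(y) + (mean(x) - mean(y))**2)
    return 2. * covariance / (np.var(x) + np.var(y) + (x_mean - y_mean) ** 2)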