def setup_names_and_labels(meta_data,px):
    """
    Function to get recording based labels for the recordings in px

    INPUT:
    ======
    meta_data: numpy array containing all meta data with names in first
               column and labels in last column and one header line
    px:        List with all recordings in audio dataset

    OUTPUT:
    =======
    px:   List of patient names (as returned by fix_patient_names), with one
          entry for every recording in the input px that contains that name
    y_px: List of float labels, aligned element-wise with the returned px
    """
    # Skip the header row: names come from the first column and labels
    # from the last column.
    meta_px = fix_patient_names(meta_data[1:, 0])

    # Build a real list of floats. On Python 3, map() returns a one-shot
    # iterator that cannot be indexed, so the labels must be materialised
    # before they are paired with the names.
    meta_y_px = [float(label) for label in meta_data[1:, -1]]

    matched_px = []
    y_px = []
    for name, label in zip(meta_px, meta_y_px):
        for recording in px:
            # Substring match: a patient contributes one entry for every
            # recording whose name contains the patient name.
            if name in recording:
                matched_px.append(name)
                y_px.append(label)

    return matched_px, y_px
def extract_features(dir,coughs_list,meta_data,N_SPLITS):
    """
    Build a feature table with one row per segment of every cough in a
    recording

    INPUT:
    ======
    dir:         Path of the recording directory; its last path component
                 is used as the recording name
    coughs_list: Sequence of cough events, each one is passed to split_cough
    meta_data:   Table indexable by "StudyNum" (patient names) and
                 "TBResult" (labels)
    N_SPLITS:    Number of splits, forwarded unchanged to split_cough

    OUTPUT:
    =======
    df: pandas DataFrame with the columns returned by get_colNames(). Each
        row holds the recording name, kurtosis, zero crossing rate, log
        energy, the axis-1 mean of calc_MFCC_D_A and the label. The frame
        is empty (columns only) when no rows were produced.

    RAISES:
    =======
    ValueError: if a row has to be built but no patient name from meta_data
                occurs in the recording name (no label is available)
    """
    recording_name = os.path.basename(os.path.normpath(dir))

    # Look up the label of the patient this recording belongs to.
    names = fix_patient_names(meta_data["StudyNum"])
    Labels = meta_data["TBResult"]
    y = None
    label_found = False
    for n in names:
        if n in recording_name:
            # If several names match, the last match wins; the label is
            # read at the first position where that name occurs.
            y = Labels[names.index(n)]
            label_found = True

    # File with the cepstral means, needed by calc_MFCC_D_A.
    f_mean = dir + "/" + recording_name + "_mean.txt"

    columns_list = get_colNames()

    # One DataFrame per cough, concatenated once at the end. This replaces
    # the repeated DataFrame.append calls (removed in pandas 2.0).
    cough_frames = []
    for cough_event in coughs_list:
        # The second return value of split_cough is not needed here.
        segments, _ = split_cough(cough_event, N_SPLITS)

        feature_vecs = []
        for seg in segments:
            if not label_found:
                # Fail with a clear message instead of an unbound-variable
                # error when the label is first needed.
                raise ValueError(
                    "No patient name from meta_data matches recording "
                    + repr(recording_name)
                )

            # Average along axis 1 so that each row of the calc_MFCC_D_A
            # output collapses to a single value.
            MFCC_D_A = np.mean(calc_MFCC_D_A(seg, f_mean), axis=1)
            kurt = kurtosis(seg)
            zcr = zero_crossing_rate_BruteForce(seg)
            LogE = get_LogEnergy(seg)

            vec = [recording_name, kurt, zcr, LogE]
            vec.extend(MFCC_D_A)
            vec.append(y)
            feature_vecs.append(vec)

        if not feature_vecs:
            # np.vstack raises on an empty list; a cough that produced no
            # segments simply contributes no rows.
            continue

        # NOTE(review): each row starts with the recording name (a string),
        # so np.vstack presumably yields a string-typed array and the
        # numeric features reach the DataFrame as text - confirm that the
        # callers expect this.
        feature_matrix = np.vstack(feature_vecs)
        cough_frames.append(pd.DataFrame(feature_matrix, columns=columns_list))

    if not cough_frames:
        # pd.concat raises on an empty list; keep returning an empty table
        # that still carries the expected columns.
        return pd.DataFrame(columns=columns_list)

    # Like the original append chain, the per-cough row indices are kept
    # (no ignore_index).
    return pd.concat(cough_frames)