def train_mean_std(disp=True):
    """Compute the per-sensor mean and standard deviation over the training data ONLY.

    Iterates over every training CSV, accumulates each file's per-column mean
    and variance, then averages the accumulators across files.

    NOTE(review): averaging per-file means/variances is only exact when every
    file holds the same number of samples, and the pooled variance ignores the
    spread *between* file means -- treat the resulting std as an approximation.

    Parameters
    ----------
    disp : bool
        When True, print per-file progress.

    Returns
    -------
    (mean, std) : tuple of np.ndarray
        Per-sensor mean and standard deviation, each of shape (32,).
        (Previously the values were only printed; returning them lets callers
        use the result programmatically -- backward compatible.)
    """
    csvlist = io.get_file_list(mode='train', fullpath=True)
    pif = lambda msg: printflush(msg) if disp else None
    # Sum all the means and variances together, file by file
    mean = np.zeros(32, dtype=DTYPE)
    var = np.zeros(32, dtype=DTYPE)
    for fullpath in csvlist:
        t0 = time()
        fpath, fname = os.path.split(fullpath)
        # Drop the first column (index/time stamp), keep the 32 sensor columns
        data = pd.read_csv(fullpath).values[:, 1:]
        pif('Processing ' + fname + ' -- ' + str(data.shape[0]) + ' samples...')
        mean += np.mean(data, axis=0, dtype=DTYPE)
        var += np.var(data, axis=0, dtype=DTYPE)
        pif("\b" + "%.3f" % (time() - t0) + " s\n")
    # Divide by # of datasets
    dataset_count = len(csvlist)
    mean /= dataset_count
    var /= dataset_count
    # Sqrt the variance
    std = np.sqrt(var)
    # Print the repr so the values can be pasted into the codebase
    # (presumably as utils.SENSOR_MEAN / utils.SENSOR_STD -- confirm)
    print(repr(mean))
    print(repr(std))
    return mean, std
def preprocess_all_mk2(mode='train', disp=True):
    """Preprocesses all the data. Mean cancellation by subtracting SENSOR_MEAN
    and scaling with SENSOR_STD."""
    file_paths = io.get_file_list(mode=mode, fullpath=True)
    pif = lambda msg: printflush(msg) if disp else None
    pif('MK2 preprocessing for ' + mode + ' data\n')
    for csv_path in file_paths:
        start = time()
        _, base_name = os.path.split(csv_path)
        # Drop the first column (index/time stamp), keep the sensor columns
        samples = pd.read_csv(csv_path).values[:, 1:]
        pif('Processing ' + base_name + ' -- ' + str(samples.shape[0]) + ' samples...')
        # Center each sensor on the training-set mean...
        samples -= utils.SENSOR_MEAN
        # ...and scale by the training-set standard deviation
        samples /= utils.SENSOR_STD
        # Strip the '.csv' extension and tag the output with the pipeline name
        np.save(csv_path[:-4] + '_mk2', samples)
        pif("%.3f" % (time() - start) + " s\n")
def preprocess_all_mk1(norm_wind=None, div_factor=300, mode='train', disp=True):
    """Preprocesses all the data. Simply scales the data with div_factor, then
    applies running zeromean."""
    file_paths = io.get_file_list(mode=mode, fullpath=True)
    pif = lambda msg: printflush(msg) if disp else None
    pif('MK1 preprocessing')
    for csv_path in file_paths:
        start = time()
        _, base_name = os.path.split(csv_path)
        # Drop the first column (index/time stamp), keep the sensor columns
        samples = pd.read_csv(csv_path).values[:, 1:]
        pif('Processing ' + base_name + ' -- ' + str(samples.shape[0]) + ' samples...')
        # Scale the raw sensor values down by a constant factor
        samples /= float(div_factor)
        # Window defaults to the whole recording when no explicit size is given
        wind = norm_wind if norm_wind is not None else samples.shape[0]
        normalized = utils.running_zeromean(samples, wind, axis=0)
        pif("\b" + "%.3f" % (time() - start) + " s\n")
        # Tag the output filename with the window size used
        wind_tag = 'FULL' if wind == samples.shape[0] else str(wind)
        np.save(csv_path[:-4] + '_mk1_norm' + wind_tag, normalized)
def preprocess_all_mk0(norm_wind=None, nperseg=256, mode='train', max_freq_count=10, disp=True):
    """Preprocesses all the data. Appends the `max_freq_count` principal
    frequency components of each *previous* `nperseg`-sample window to the
    running-zeromean-normalized data.

    Fixes over the previous revision: removed the dead `final_index` variable
    (assigned but never read, and a NameError risk when the loop body never
    ran) and the commented-out spectrogram-saving code.

    Parameters
    ----------
    norm_wind : int or None
        Running-zeromean window size; None means use the full file length.
    nperseg : int
        Spectrogram segment length (boxcar window).
    mode : str
        Which file list to process (e.g. 'train').
    max_freq_count : int
        Number of principal frequencies appended per sample.
    disp : bool
        When True, print per-file progress.
    """
    csvlist = io.get_file_list(mode=mode, fullpath=True)
    pif = lambda msg: printflush(msg) if disp else None
    pif('MK0 preprocessing')
    for fullpath in csvlist:
        t0 = time()
        fpath, fname = os.path.split(fullpath)
        # Drop the first column (index/time stamp), keep the sensor columns
        data = pd.read_csv(fullpath).values[:, 1:]
        pif('Processing ' + fname + ' -- ' + str(data.shape[0]) + ' samples...')
        # Get the spectrograph
        f, t, sxx = utils.spectrogram(data, window='boxcar', nperseg=nperseg)
        # N principal frequencies (a normalized index) per spectrogram window
        max_freqs = principal_frequencies(sxx, max_freq_count)
        # Blow up the per-window frequencies to match the sample count.
        # A zero row is inserted at the front so samples in window k receive
        # the frequencies of the *previous* window.
        repeated_max_freqs = np.zeros((data.shape[0], max_freq_count), dtype=max_freqs.dtype)
        max_freqs = np.insert(max_freqs, 0, np.zeros((1, max_freqs.shape[1])), axis=0)
        for k in range(max_freqs.shape[0] - 1):
            repeated_max_freqs[k*nperseg:(k+1)*nperseg, :] = np.tile(max_freqs[k, :], (nperseg, 1))
        # Execute the running mean; window defaults to the whole recording
        wind = norm_wind if norm_wind is not None else data.shape[0]
        norm_data = utils.running_zeromean(data, wind, axis=0)
        pif("\b" + "%.3f" % (time() - t0) + " s\n")
        # Concatenate normalized data with the repeated frequency features
        final_data = np.append(norm_data, repeated_max_freqs, axis=1)
        str_wind = 'FULL' if wind == data.shape[0] else str(wind)
        final_fname = fullpath[:-4] + '_mk0' + '_W' + str(nperseg) + '_norm' + str_wind
        np.save(final_fname, final_data)
def preprocess_all_mk3(mode='train', wind=3, butter_order=4, disp=True):
    """Preprocesses all the data.

    Mean cancellation by subtracting SENSOR_MEAN and scaling with SENSOR_STD;
    an MA filter is used to reduce the impact of high frequency noise, then
    each frequency band in utils.FREQUENCY_BANDS is extracted with a
    Butterworth band-pass filter and the bands are concatenated column-wise.

    Parameters
    ----------
    mode : str
        Which file list to process (e.g. 'train').
    wind : int
        Moving-average window length.
    butter_order : int
        Butterworth filter order. NOTE(review): currently unused -- it is
        never passed to utils.butter_apply; confirm whether it should be.
    disp : bool
        When True, print per-file progress.
    """
    csvlist = io.get_file_list(mode=mode, fullpath=True)
    pif = lambda msg: printflush(msg) if disp else None
    pif('MK3 preprocessing for ' + mode + ' data\n')
    for fullpath in csvlist:
        t0 = time()
        fpath, fname = os.path.split(fullpath)
        # Drop the first column (index/time stamp), keep the sensor columns
        data = pd.read_csv(fullpath).values[:, 1:]
        pif('Processing ' + fname + ' -- ' + str(data.shape[0]) + ' samples...')
        # Removes the mean of each sensor
        data -= utils.SENSOR_MEAN
        # Scale the data with the standard deviation from the training data
        data /= utils.SENSOR_STD
        # Moving average, to remove outliers
        data = utils.mov_avg(data, wind, axis=0)
        # Band-pass the data into each brain-wave frequency band.
        # BUG FIX: dict.itervalues() is Python 2 only and raises
        # AttributeError on Python 3 (this file uses the py3 print function);
        # .values() behaves identically here on both versions.
        brain_list = []
        freq_mask = 0
        for flo, fhi in utils.FREQUENCY_BANDS.values():
            brain_list.append(utils.butter_apply(data, low=flo, high=fhi))
            # NOTE(review): freq_mask keeps only the *last* band, and
            # `flo + fhi*10` looks suspicious (maybe `(flo + fhi) * 10`?).
            # Kept as-is apart from the iterator fix -- confirm intent.
            freq_mask = int(round(flo + fhi*10))
        del data  # Free some memory before the concatenation!
        final_data = np.concatenate(brain_list, axis=1)
        # Save preprocessed data and print stuff to console
        str_wind = 'FULL' if wind == final_data.shape[0] else str(wind)
        final_fname = fullpath[:-4] + '_mk3_wind' + str_wind + '_fmask' + str(freq_mask)
        np.save(final_fname, final_data)
        del brain_list, final_data  # Free some memory for the next datafile
        pif("%.3f" % (time() - t0) + " s\n")