# Module-level context assumed by this excerpt (defined elsewhere in the
# script): `args`, `datasets`, `g` (shared-array module), `patients`,
# `data_path`, `fft_width`, `overlap`, `floor`/`ceil` (frequency bin bounds),
# `include_userdata`, and `userdata`.
import gc
import math
import os
import sys

import numpy as np
import psutil
import hickle as hkl
from fractions import gcd  # Python 2; in Python 3 use math.gcd


def read_train_data(dataset, k_normal_val, k_normal_train, k_seizure_val,
                    k_seizure_train):
    global train_counter_seizure
    global val_counter_seizure
    global train_counter_normal
    global val_counter_normal
    global patients
    print "read data and preprocess (fft and slicing)"
    channels = patients[dataset.user]
    print "read in channels", channels
    path = data_path + '/' + dataset.set_name + '/' + dataset.base_name
    print path

    # Read in normal (interictal) hours.
    is_train_index = get_train_val_split(k_normal_train, k_normal_val)
    no_normal = k_normal_val + k_normal_train
    for i in xrange(no_normal):
        print "normal i", i
        sys.stdout.flush()
        # One hour is made up of six 10-minute segments; file indices are
        # 1-based, hence i * 6 + 1.
        data_1h = read_data_1h(path, '_0.mat', i * 6 + 1)
        ch_arrays = []
        for ch in channels:
            # STFT magnitude per channel, cropped to the band [floor, ceil).
            ch_arrays.append(
                calcFFT(data_1h[:, ch], fft_width, overlap)[:, floor:ceil])
        magnitude = np.stack(ch_arrays, axis=0)
        if is_train_index[i]:
            g.magnitudes_normal_train[train_counter_normal] = magnitude
            train_counter_normal += 1
        else:
            g.magnitudes_normal_val[val_counter_normal] = magnitude
            val_counter_normal += 1

    # Read in seizure (preictal) hours.
    is_train_index = get_train_val_split(k_seizure_train, k_seizure_val)
    no_seizure = k_seizure_val + k_seizure_train
    for i in xrange(no_seizure):
        print "seizure i", i
        sys.stdout.flush()
        data_1h = read_data_1h(path, '_1.mat', i * 6 + 1)
        ch_arrays = []
        for ch in channels:
            ch_arrays.append(
                calcFFT(data_1h[:, ch], fft_width, overlap)[:, floor:ceil])
        magnitude = np.stack(ch_arrays, axis=0)
        if is_train_index[i]:
            g.magnitudes_seizure_train[train_counter_seizure] = magnitude
            train_counter_seizure += 1
        else:
            g.magnitudes_seizure_val[val_counter_seizure] = magnitude
            val_counter_seizure += 1

    print "Done reading in", no_normal, "normal hours and", no_seizure, "seizure hours"
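# Hypothetical sketch of get_train_val_split, whose real implementation is
# not part of this excerpt. Assumption: it returns a shuffled boolean mask of
# length k_train + k_val with exactly k_train True entries, so hours are
# assigned to the train/validation splits at random but with fixed counts.
def get_train_val_split(k_train, k_val):
    mask = np.array([True] * k_train + [False] * k_val)
    np.random.shuffle(mask)
    return mask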
def preprocess_test_data():
    global magnitudes_test
    global test_counter
    print("Loading and preprocessing data...")

    # Count the test files across all enabled non-training datasets.
    no_files = 0
    for dataset in datasets.all:
        if dataset.enabled and not dataset.trainset:
            no_files += int(dataset.no_files * args.debug_sub_ratio)
    print "no_files", no_files

    # Probe one file to determine the spectrogram dimensions.
    test = read_data(data_path + '/test_1/1_', '.mat', 1)
    test_magnitude = calcFFT(test[:, 0], fft_width, overlap)[:, floor:ceil]
    print "test_magnitude.shape", test_magnitude.shape
    stft_steps = test_magnitude.shape[0]

    magnitudes_test = np.zeros(
        (no_files, args.no_channels, stft_steps, ceil - floor),
        dtype=np.float32)
    print magnitudes_test.shape

    test_counter = 0
    for dataset in datasets.all:
        if dataset.enabled and not dataset.trainset:
            print "Read in dataset from %s ..." % (dataset.set_name)
            nf = int(dataset.no_files * args.debug_sub_ratio)
            read_test_data(dataset, 0, nf)

    process = psutil.Process(os.getpid())
    print("Memory usage (GB): " + str(process.memory_info().rss / 1e9))
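# Hypothetical sketch of calcFFT, whose real implementation is not part of
# this excerpt. Assumption: it slides a window of length fft_width with the
# given overlap (in samples) over a 1-D signal and returns the FFT magnitude
# per window, shape (n_windows, fft_width // 2 + 1); the callers above then
# crop the frequency axis to [floor, ceil).
def calcFFT(signal, fft_width, overlap):
    step = fft_width - overlap
    n_windows = (len(signal) - fft_width) // step + 1
    windows = np.stack([signal[i * step:i * step + fft_width]
                        for i in xrange(n_windows)])
    return np.abs(np.fft.rfft(windows, axis=1)).astype(np.float32)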
def read_test_data(dataset, start, stop):
    global magnitudes_test
    global test_counter
    print "read data and preprocess (fft and slicing)"
    channels = patients[dataset.user]
    print "read in channels", channels
    path = data_path + '/' + dataset.set_name + '/' + dataset.base_name
    print path

    # Test snippets are single 10-minute files (no hour grouping).
    for i in xrange(start, stop):
        #print "test i", i
        sys.stdout.flush()
        data = read_data(path, '.mat', i + 1)
        ch_arrays = []
        for ch in channels:
            ch_arrays.append(
                calcFFT(data[:, ch], fft_width, overlap)[:, floor:ceil])
        magnitude = np.stack(ch_arrays, axis=0)
        magnitudes_test[test_counter] = magnitude
        test_counter += 1

    print "Done reading in", stop - start, "test snippets of 10 min."
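# Hypothetical sketches of the two .mat readers used above; the real ones are
# not part of this excerpt. Assumption (Kaggle seizure-prediction format):
# each file '<base><index><suffix>' holds a struct 'dataStruct' whose 'data'
# field is a (samples, channels) array, and one hour consists of six
# consecutive 10-minute segments.
import scipy.io


def read_data(path, suffix, index):
    mat = scipy.io.loadmat(path + str(index) + suffix)
    return mat['dataStruct']['data'][0, 0]


def read_data_1h(path, suffix, start_index):
    segments = [read_data(path, suffix, start_index + j) for j in xrange(6)]
    return np.vstack(segments)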
def preprocess():
    global size
    global xTrain
    global udTrain
    global yTrain
    global aTrain
    global xVal
    global udVal
    global yVal
    global aVal
    global train_counter_seizure
    global val_counter_seizure
    global train_counter_normal
    global val_counter_normal
    global userdata
    global labels
    global analysis_datas
    print("Loading and preprocessing data...")

    # Count normal/seizure hours per split across all enabled datasets.
    no_normal_train = 0
    no_normal_val = 0
    no_seizure_train = 0
    no_seizure_val = 0
    for dataset in datasets.all:
        if dataset.enabled:
            no_normal_val += int(dataset.no_normal * args.debug_sub_ratio
                                 * args.chosen_validation_ratio)
            no_normal_train += int(dataset.no_normal * args.debug_sub_ratio
                                   * (1 - args.chosen_validation_ratio))
            no_seizure_val += int(dataset.no_seizure * args.debug_sub_ratio
                                  * args.chosen_validation_ratio)
            no_seizure_train += int(dataset.no_seizure * args.debug_sub_ratio
                                    * (1 - args.chosen_validation_ratio))
    no_normal = no_normal_val + no_normal_train
    no_seizure = no_seizure_val + no_seizure_train
    print "total"
    print no_normal
    print no_seizure
    print "train"
    print no_normal_train
    print no_seizure_train
    print "validation"
    print no_normal_val
    print no_seizure_val

    # Probe one file to determine the spectrogram dimensions.
    test = read_data_1h(data_path + '/train_1/1_', '_0.mat', 1)
    test_magnitude = calcFFT(test[:, 0], fft_width, overlap)[:, floor:ceil]
    print "test_magnitude.shape", test_magnitude.shape
    stft_steps = test_magnitude.shape[0]
    print no_seizure_train
    print no_seizure - no_seizure_train
    print no_normal_train
    print no_normal - no_normal_train

    # Preallocate the shared spectrogram arrays
    # (hours, channels, time steps, frequency bins).
    g.magnitudes_seizure_train = np.zeros(
        (no_seizure_train, args.no_channels, stft_steps, ceil - floor),
        dtype=np.float32)
    g.magnitudes_seizure_val = np.zeros(
        (no_seizure_val, args.no_channels, stft_steps, ceil - floor),
        dtype=np.float32)
    g.magnitudes_normal_train = np.zeros(
        (no_normal_train, args.no_channels, stft_steps, ceil - floor),
        dtype=np.float32)
    g.magnitudes_normal_val = np.zeros(
        (no_normal_val, args.no_channels, stft_steps, ceil - floor),
        dtype=np.float32)
    # analysis_datas = np.zeros(size, dtype=analysis_data_type)

    train_counter_seizure = 0
    val_counter_seizure = 0
    train_counter_normal = 0
    val_counter_normal = 0

    no_dss = 0
    for dataset in datasets.all:
        if dataset.enabled:
            no_dss += 1

    for dataset in datasets.all:
        if dataset.enabled and dataset.trainset:
            print "Read in dataset from %s ..." % (dataset.set_name)
            print "Processing data ..."
            k_normal_val = int(dataset.no_normal * args.debug_sub_ratio
                               * args.chosen_validation_ratio)
            k_normal_train = int(dataset.no_normal * args.debug_sub_ratio
                                 * (1 - args.chosen_validation_ratio))
            k_seizure_val = int(dataset.no_seizure * args.debug_sub_ratio
                                * args.chosen_validation_ratio)
            k_seizure_train = int(dataset.no_seizure * args.debug_sub_ratio
                                  * (1 - args.chosen_validation_ratio))
            read_train_data(dataset, k_normal_val, k_normal_train,
                            k_seizure_val, k_seizure_train)
            print 'train_counter_seizure', train_counter_seizure, 'val_counter_seizure', val_counter_seizure
            print 'train_counter_normal', train_counter_normal, 'val_counter_normal', val_counter_normal
            process = psutil.Process(os.getpid())
            print("Memory usage (GB): " + str(process.memory_info().rss / 1e9))

    print 'train_counter_seizure', train_counter_seizure, 'val_counter_seizure', val_counter_seizure
    print 'train_counter_normal', train_counter_normal, 'val_counter_normal', val_counter_normal

    print "percentiles:"
    for p in range(0, 101, 10):
        print p, np.percentile(g.magnitudes_normal_train, p), np.percentile(g.magnitudes_normal_val, p)

    # Balance the classes: each normal hour contributes no_seizure samples
    # and each seizure hour no_normal samples, so both classes end up with
    # the same total number of samples.
    multiplier = 1
    no_samples_normal_ph = multiplier * no_seizure
    no_samples_seizure_ph = multiplier * no_normal
    size = no_normal * no_samples_normal_ph + no_seizure * no_samples_seizure_ph
    print "no_normal", no_normal
    print "no_seizure", no_seizure
    print "no_samples_normal_ph", no_samples_normal_ph
    print "no_samples_seizure_ph", no_samples_seizure_ph

    # Placeholder sample vector (random values); the real spectrograms stay
    # in the g.magnitudes_* arrays. size is assumed to be even here.
    magnitudes = np.random.rand(size)
    labels = np.hstack((np.zeros(size / 2), np.ones(size / 2)))
    np.random.shuffle(labels)
    print "size", size
    print "no_normal", no_normal
    print "no_seizure", no_seizure
    print "no_samples_normal_ph", no_samples_normal_ph
    print "no_samples_seizure_ph", no_samples_seizure_ph

    labels = labels.astype(np.int32)
    magnitudes = magnitudes.astype(np.float32)
    print("Histogram:")
    print np.bincount(labels)
    print "magnitudes.shape", magnitudes.shape
    print "labels.shape", labels.shape

    no_val = int(math.floor(args.chosen_validation_ratio * size))
    no_train = size - no_val
    assert no_train + no_val == size
    print 'Ratio validation:', no_val / float(size)
    if abs(no_val / float(size) - args.chosen_validation_ratio) > 0.02:
        print "WARNING: validation ratio (%g) differs from expected value (%g)" % (
            no_val / float(size), args.chosen_validation_ratio)

    xTrain = magnitudes[:no_train]
    udTrain = []
    if include_userdata:
        udTrain = userdata[:no_train]
    yTrain = labels[:no_train]
    xVal = magnitudes[no_train:]
    udVal = []
    if include_userdata:
        udVal = userdata[no_train:]
    yVal = labels[no_train:]
    print "xVal.shape", xVal.shape
    print "yVal.shape", yVal.shape

    # Pack (sample, label) pairs into one array: each row of xVal becomes
    # (value, label).
    xVal = np.vstack((xVal, yVal))
    xVal = np.swapaxes(xVal, 0, 1)
    #aVal = analysis_datas[no_train:]

    # print("Shuffling data...")
    # a = np.arange(xTrain.shape[0])
    # np.random.shuffle(a)
    # xTrain = xTrain[a]
    # if include_userdata:
    #     udTrain = udTrain[a]
    # yTrain = yTrain[a]

    # In order to be able to release the magnitudes array:
    # xVal = np.copy(xVal)
    del magnitudes
    gc.collect()

    print 'xTrain.shape', xTrain.shape
    print 'yTrain.shape', yTrain.shape
    print 'xVal.shape', xVal.shape
    print 'yVal.shape', yVal.shape
    assert xTrain.shape[0] == yTrain.shape[0]
    assert xVal.shape[0] == yVal.shape[0]

    if not args.no_save_preprocessed:
        print("Saving preprocessed data...")
        data = {
            'magnitudes_seizure_val': g.magnitudes_seizure_val,
            'magnitudes_seizure_train': g.magnitudes_seizure_train,
            'magnitudes_normal_val': g.magnitudes_normal_val,
            'magnitudes_normal_train': g.magnitudes_normal_train,
            'xTrain': xTrain,
            #'udTrain': udTrain,
            #'aTrain': aTrain,
            'yTrain': yTrain,
            'xVal': xVal,
            #'udVal': udVal,
            'yVal': yVal,
        }
        hkl.dump(data, 'preprocessedData_16.hkl', compression="lzf")
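# Tiny worked example (not part of the original pipeline) of the
# vstack/swapaxes packing used in preprocess() above: two length-N vectors
# become N rows of (sample, label), so one array can carry both through the
# validation path.
def _demo_val_packing():
    xv = np.array([10., 11., 12.], dtype=np.float32)
    yv = np.array([0, 1, 0], dtype=np.int32)
    packed = np.swapaxes(np.vstack((xv, yv)), 0, 1)
    print packed  # [[10. 0.] [11. 1.] [12. 0.]] -- each row is (sample, label)
    return packed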
# Second variant of preprocess(): identical set-up to the version above, but
# it builds index-based train/validation vectors with gcd balancing and saves
# to 'preprocessedData.hkl' instead of 'preprocessedData_16.hkl'.
def preprocess():
    global size
    global xTrain
    global udTrain
    global yTrain
    global aTrain
    global xVal
    global udVal
    global yVal
    global aVal
    global train_counter_seizure
    global val_counter_seizure
    global train_counter_normal
    global val_counter_normal
    global userdata
    global labels
    global analysis_datas
    print("Loading and preprocessing data...")

    # Count normal/seizure hours per split across all enabled datasets.
    no_normal_train = 0
    no_normal_val = 0
    no_seizure_train = 0
    no_seizure_val = 0
    for dataset in datasets.all:
        if dataset.enabled:
            no_normal_val += int(dataset.no_normal * args.debug_sub_ratio
                                 * args.chosen_validation_ratio)
            no_normal_train += int(dataset.no_normal * args.debug_sub_ratio
                                   * (1 - args.chosen_validation_ratio))
            no_seizure_val += int(dataset.no_seizure * args.debug_sub_ratio
                                  * args.chosen_validation_ratio)
            no_seizure_train += int(dataset.no_seizure * args.debug_sub_ratio
                                    * (1 - args.chosen_validation_ratio))
    no_normal = no_normal_val + no_normal_train
    no_seizure = no_seizure_val + no_seizure_train
    print "total"
    print no_normal
    print no_seizure
    print "train"
    print no_normal_train
    print no_seizure_train
    print "validation"
    print no_normal_val
    print no_seizure_val

    # Probe one file to determine the spectrogram dimensions.
    test = read_data_1h(data_path + '/train_1/1_', '_0.mat', 1)
    test_magnitude = calcFFT(test[:, 0], fft_width, overlap)[:, floor:ceil]
    print "test_magnitude.shape", test_magnitude.shape
    stft_steps = test_magnitude.shape[0]
    print no_seizure_train
    print no_seizure - no_seizure_train
    print no_normal_train
    print no_normal - no_normal_train

    # Preallocate the shared spectrogram arrays
    # (hours, channels, time steps, frequency bins).
    g.magnitudes_seizure_train = np.zeros(
        (no_seizure_train, args.no_channels, stft_steps, ceil - floor),
        dtype=np.float32)
    g.magnitudes_seizure_val = np.zeros(
        (no_seizure_val, args.no_channels, stft_steps, ceil - floor),
        dtype=np.float32)
    g.magnitudes_normal_train = np.zeros(
        (no_normal_train, args.no_channels, stft_steps, ceil - floor),
        dtype=np.float32)
    g.magnitudes_normal_val = np.zeros(
        (no_normal_val, args.no_channels, stft_steps, ceil - floor),
        dtype=np.float32)
    # analysis_datas = np.zeros(size, dtype=analysis_data_type)

    train_counter_seizure = 0
    val_counter_seizure = 0
    train_counter_normal = 0
    val_counter_normal = 0

    no_dss = 0
    for dataset in datasets.all:
        if dataset.enabled:
            no_dss += 1

    for dataset in datasets.all:
        if dataset.enabled and dataset.trainset:
            print "Read in dataset from %s ..." % (dataset.set_name)
            print "Processing data ..."
            k_normal_val = int(dataset.no_normal * args.debug_sub_ratio
                               * args.chosen_validation_ratio)
            k_normal_train = int(dataset.no_normal * args.debug_sub_ratio
                                 * (1 - args.chosen_validation_ratio))
            k_seizure_val = int(dataset.no_seizure * args.debug_sub_ratio
                                * args.chosen_validation_ratio)
            k_seizure_train = int(dataset.no_seizure * args.debug_sub_ratio
                                  * (1 - args.chosen_validation_ratio))
            read_train_data(dataset, k_normal_val, k_normal_train,
                            k_seizure_val, k_seizure_train)
            print 'train_counter_seizure', train_counter_seizure, 'val_counter_seizure', val_counter_seizure
            print 'train_counter_normal', train_counter_normal, 'val_counter_normal', val_counter_normal
            process = psutil.Process(os.getpid())
            print("Memory usage (GB): " + str(process.memory_info().rss / 1e9))

    print 'train_counter_seizure', train_counter_seizure, 'val_counter_seizure', val_counter_seizure
    print 'train_counter_normal', train_counter_normal, 'val_counter_normal', val_counter_normal

    print "percentiles:"
    for p in range(0, 101, 10):
        print p, np.percentile(g.magnitudes_normal_train, p), np.percentile(g.magnitudes_normal_val, p)

    # Construct training vector. The gcd-based balancing gives both classes
    # the same total number of samples (see the _demo_balancing sketch below).
    # Here `magnitudes` holds sample indices, not spectrogram data.
    train_multiplier = 1
    gcd_ = gcd(no_normal, no_seizure)
    samplesph_normal = train_multiplier * no_seizure / gcd_
    samplesph_seizure = train_multiplier * no_normal / gcd_
    size_train = no_normal_train * samplesph_normal + no_seizure_train * samplesph_seizure
    magnitudes = np.hstack((np.arange(size_train / 2), np.arange(size_train / 2)))
    labels = np.hstack((np.zeros(size_train / 2), np.ones(size_train / 2)))
    np.random.shuffle(labels)
    yTrain = labels.astype(np.int32)
    xTrain = magnitudes.astype(np.float32)
    print("Histogram:")
    print np.bincount(yTrain)
    print "yTrain.shape", yTrain.shape
    print "xTrain.shape", xTrain.shape

    # Construct validation vector; the modulo wraps the indices around the
    # available validation hours of each class.
    val_mult = 3
    gcd_ = gcd(no_normal_val, no_seizure_val)
    samplesph_normal = val_mult * no_seizure_val / gcd_
    samplesph_seizure = val_mult * no_normal_val / gcd_
    size_val = no_normal_val * samplesph_normal + no_seizure_val * samplesph_seizure
    magnitudes = np.hstack((np.arange(size_val / 2) % no_normal_val,
                            np.arange(size_val / 2) % no_seizure_val))
    labels = np.hstack((np.zeros(size_val / 2), np.ones(size_val / 2)))
    yVal = labels.astype(np.int32)
    xVal = magnitudes.astype(np.float32)
    print("Histogram:")
    print np.bincount(yVal)
    print "xVal.shape", xVal.shape
    print "yVal.shape", yVal.shape

    # Pack (index, label) pairs into one array: each row of xVal becomes
    # (sample index, label).
    xVal = np.vstack((xVal, yVal))
    xVal = np.swapaxes(xVal, 0, 1)

    size = size_val + size_train
    no_val = int(math.floor(args.chosen_validation_ratio * size))
    no_train = size - no_val
    assert no_train + no_val == size
    print 'Ratio validation:', no_val / float(size)
    if abs(no_val / float(size) - args.chosen_validation_ratio) > 0.02:
        print "WARNING: validation ratio (%g) differs from expected value (%g)" % (
            no_val / float(size), args.chosen_validation_ratio)

    del magnitudes
    gc.collect()

    print 'xTrain.shape', xTrain.shape
    print 'yTrain.shape', yTrain.shape
    print 'xVal.shape', xVal.shape
    print 'yVal.shape', yVal.shape
    assert xTrain.shape[0] == yTrain.shape[0]
    assert xVal.shape[0] == yVal.shape[0]

    if args.save_preprocessed:
        print("Saving preprocessed data...")
        data = {
            'magnitudes_seizure_val': g.magnitudes_seizure_val,
            'magnitudes_seizure_train': g.magnitudes_seizure_train,
            'magnitudes_normal_val': g.magnitudes_normal_val,
            'magnitudes_normal_train': g.magnitudes_normal_train,
            'xTrain': xTrain,
            #'udTrain': udTrain,
            #'aTrain': aTrain,
            'yTrain': yTrain,
            'xVal': xVal,
            #'udVal': udVal,
            'yVal': yVal,
        }
        hkl.dump(data, 'preprocessedData.hkl', compression="lzf")
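# A quick sanity check (sketch, not part of the original code) of the
# gcd-based class balancing used in the second preprocess(): each normal hour
# contributes no_seizure/gcd samples and each seizure hour no_normal/gcd, so
# both classes yield the same total.
def _demo_balancing(no_normal=12, no_seizure=4, multiplier=1):
    g_ = gcd(no_normal, no_seizure)            # gcd(12, 4) = 4
    per_normal = multiplier * no_seizure / g_  # 1 sample per normal hour
    per_seizure = multiplier * no_normal / g_  # 3 samples per seizure hour
    assert no_normal * per_normal == no_seizure * per_seizure  # 12 == 12
    return per_normal, per_seizure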