def execute(training, op_exec):
    """This function will help execute the tree."""
    # Note: the clip/assign_time branches reference self.starting_date,
    # self.final_date and self.increment, which suggests this was written as a
    # method; those attributes must come from the enclosing node object.
    data_op = prep.read_from_file(training)
    print(data_op)
    ts = prep.impute_missing_data(data_op)
    return_value = None
    if op_exec == "denoise":
        # Fixed: this branch originally called prep.impute_missing_data()
        return_value = prep.denoise(ts)
    elif op_exec == "impute_missing_data":
        return_value = prep.impute_missing_data(ts)
    elif op_exec == "impute_outliers":
        return_value = prep.impute_outliers(ts)
    elif op_exec == "longest_continuous_run":
        return_value = prep.longest_continuous_run(ts)
    elif op_exec == "clip":
        # Fixed: this branch originally called prep.denoise() with a duplicated date
        return_value = prep.clip(ts, self.starting_date, self.final_date)
    elif op_exec == "assign_time":
        return_value = prep.assign_time(ts, self.starting_date, self.increment)
    elif op_exec == "difference":
        return_value = prep.difference(ts)
    elif op_exec == "scaling":
        return_value = prep.scaling(ts)
    elif op_exec == "standardize":
        return_value = prep.standardize(ts)
    elif op_exec == "logarithm":
        return_value = prep.logarithm(ts)
    elif op_exec == "cubic_roots":
        return_value = prep.cubic_roots(ts)
    return return_value
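# Hedged usage sketch for execute(): 'prep' is this project's preprocessing
# module, and the CSV path is borrowed from the test script at the end of this
# collection; the op string must match one of the branches above. The
# 'standardize' branch needs none of the self.* attributes, so it runs as-is.
ts_out = execute("../TestData/AtmPres2005NovMin.csv", "standardize")
print(ts_out)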
def check_selections():
    sys.stdout = open(classification_selected_dir + 'classification_selected.txt', 'w')
    files = glob.glob(selected_dir)
    labels = np.loadtxt(labels_path, delimiter=',', skiprows=1, dtype=np.uint8)
    for file in files:
        data = np.loadtxt(file, delimiter=',', skiprows=1, dtype=np.uint64)
        print(os.path.basename(file)[:-4] + ": " + str(len(data[0])))
        tree_methods(data, labels)
        print("------------------------------------------------------------")
        print('\n')
    # also saves the scaler for each selection
    preprocessing.standardize(selected_dir, standard_selected_dir, True)
    files = glob.glob(standard_selected_dir + '*')
    for file in files:
        standard_data = np.loadtxt(file, delimiter=',', skiprows=1, dtype=np.float64)
        print(os.path.basename(file)[:-4] + ": " + str(len(standard_data[0])))
        svm_methods(standard_data, labels)
        print("------------------------------------------------------------")
        print('\n')
    sys.stdout.close()
def load(window_size):
    X_train = fio.load_file(X_train_dataset)
    Y_train = fio.load_file(Y_train_dataset)
    X_test = fio.load_file(X_test_dataset)
    Y_test = fio.load_file(Y_test_dataset)
    train_sample = fio.load_sample_file(train_sample_dataset)
    valid_sample = fio.load_sample_file(valid_sample_dataset)

    # Standardize with statistics computed on the training set only
    stat = pc.get_feat_stat(X_train)
    X_train = pc.standardize(X_train, stat)
    X_test = pc.standardize(X_test, stat)

    X_train = pc.expand(X_train, window_size)
    X_test = pc.expand(X_test, window_size)

    Y_train = pc.classify(Y_train)
    Y_test = pc.classify(Y_test)

    # Enumerate every (row, column) position of each test array
    testing_sample = [
        np.indices((x.shape[0], x.shape[1])).reshape((2, -1)).T for x in X_test
    ]

    return X_train, Y_train, X_test, Y_test, train_sample, valid_sample, testing_sample
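# A hedged sketch of how load() is presumably invoked; window_size=5 is an
# illustrative value, and the *_dataset path globals must be defined upstream.
X_train, Y_train, X_test, Y_test, train_sample, valid_sample, testing_sample = load(window_size=5)
print(X_train.shape, len(testing_sample))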
def S2I_ensemble(ERSP_all, tmp_all, freqs, indices, num_time, n_split, index_exp=0):
    '''
    Standardize data, then generate topoplots for ensemble methods

    Parameters
    ----------
    ERSP_all : numpy 4d array
        Event-related spectral perturbations
    tmp_all : numpy 1d or 2d array
        Periods of time or solution latency
    freqs : numpy 1d array
        Frequency steps
    indices : dict
        Indices of training and testing data
    num_time : int
        Number of frames per trial
    n_split : int
        Number of split clusters
    index_exp : int
        Index of experiment for K-fold cross validation

    Returns
    -------
    None.
    '''
    assert isinstance(ERSP_all, np.ndarray) and ERSP_all.ndim == 4
    assert isinstance(tmp_all, np.ndarray) and (tmp_all.ndim == 1 or tmp_all.ndim == 2)
    assert isinstance(freqs, np.ndarray) and freqs.ndim == 1
    assert isinstance(indices, dict)
    assert isinstance(num_time, int)
    assert isinstance(n_split, int) and n_split > 1
    assert isinstance(index_exp, int) and index_exp >= 0

    # Create folders for this experiment's train/test images
    if not os.path.exists('./images/exp%d' % (index_exp)):
        os.makedirs('./images/exp%d' % (index_exp))
        for i in range(n_split):
            os.makedirs('./images/exp%d/train%d' % (index_exp, i))
        os.makedirs('./images/exp%d/test' % (index_exp))

    # Standardize the data
    ERSP_all, SLs = preprocessing.standardize(ERSP_all, tmp_all, num_time,
                                              train_indices=indices['train'],
                                              threshold=0.0)
    ERSP_dict = {kind: ERSP_all[indices[kind], :] for kind in ['train', 'test']}
    SLs_dict = {kind: SLs[indices[kind]] for kind in ['train', 'test']}
    ERSP_list, SLs_list = preprocessing.stratified_split(ERSP_dict['train'],
                                                         SLs_dict['train'],
                                                         n_split=n_split,
                                                         mode=args.split_mode)

    start_time = time.time()
    print('[%.1f] Signal to image (Ensemble)' % (time.time() - start_time))

    # Generate topoplots for the training data of each split
    for index_split in range(n_split):
        print('--- Split %d ---' % (index_split))

        # Data augmentation
        if args.mode == 'add_noise':
            ERSP_split, SLs_split = data_augmentation.aug(
                ERSP_list[index_split], SLs_list[index_split], 'add_noise', (5, 1))
        elif args.mode == 'SMOTER':
            ERSP_split, SLs_split = data_augmentation.aug(
                ERSP_list[index_split], SLs_list[index_split], 'SMOTER')
        else:
            ERSP_split, SLs_split = ERSP_list[index_split], SLs_list[index_split]

        fileNames = generate_topo(ERSP_split, freqs, num_time,
                                  index_exp=index_exp, index_split=index_split)
        generate_csv(fileNames, SLs_split, index_exp, index_split)

        if index_split == 0:
            fileNames_train = fileNames
            SLs_train = SLs_split
        else:
            fileNames_train = np.concatenate((fileNames_train, fileNames))
            # Fixed: concatenate the (possibly augmented) SLs_split so labels
            # stay aligned with fileNames; the original used SLs_list[index_split]
            SLs_train = np.concatenate((SLs_train, SLs_split))

    # Generate the CSV for all training data combined
    generate_csv(fileNames_train, SLs_train, index_exp, 100)

    # Generate topoplots for testing data
    print('--- Split test ---')
    fileNames = generate_topo(ERSP_dict['test'], freqs, num_time,
                              index_exp=index_exp, index_split=-1)
    generate_csv(fileNames, SLs_dict['test'], index_exp, -1)

    print('[%.1f] Finish S2I' % (time.time() - start_time))
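# Hedged example call for S2I_ensemble(); the 90/10 index split below is
# illustrative only, not the K-fold split this project uses elsewhere, and the
# global `args` (split_mode, mode) must already be parsed.
ERSP_all, tmp_all, freqs = dataloader.load_data()
n_trials = ERSP_all.shape[0]
indices = {'train': np.arange(int(0.9 * n_trials)),
           'test': np.arange(int(0.9 * n_trials), n_trials)}
S2I_ensemble(ERSP_all, tmp_all, freqs, indices, num_time=1, n_split=3)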
def S2I_main(ERSP_all, tmp_all, freqs, indices, mode, num_time,
             index_exp=0, sub_ID=None):
    '''
    Standardize data, then generate topoplots

    Parameters
    ----------
    ERSP_all : numpy 4d array
        Event-related spectral perturbations
    tmp_all : numpy 1d or 2d array
        Periods of time or solution latency
    freqs : numpy 1d array
        Frequency steps
    indices : dict
        Indices of training and testing data
    mode : string
        Signal-to-image mode: 'normal', 'add_noise' or 'SMOTER'
    num_time : int
        Number of frames per trial
    index_exp : int
        Index of experiment for K-fold cross validation
    sub_ID : numpy 1d array, optional
        Subject ID of each trial (used when args.normal_sub is set)

    Returns
    -------
    None.
    '''
    assert isinstance(ERSP_all, np.ndarray) and ERSP_all.ndim == 4
    assert isinstance(tmp_all, np.ndarray) and (tmp_all.ndim == 1 or tmp_all.ndim == 2)
    assert isinstance(freqs, np.ndarray) and freqs.ndim == 1
    assert isinstance(indices, dict)
    assert isinstance(mode, str)
    assert isinstance(num_time, int)
    assert isinstance(index_exp, int) and index_exp >= 0

    # Create folders for this experiment's train/test images
    if not os.path.exists('./images/exp%d' % (index_exp)):
        os.makedirs('./images/exp%d' % (index_exp))
        os.makedirs('./images/exp%d/train0' % (index_exp))
        os.makedirs('./images/exp%d/test' % (index_exp))

    # Standardize the data
    ERSP_all, SLs = preprocessing.standardize(ERSP_all, tmp_all, num_time,
                                              train_indices=indices['train'],
                                              threshold=0.0)

    # Normalize subjects
    if args.normal_sub:
        ERSP_all = preprocessing.normalize_subject(ERSP_all, sub_ID, indices['train'])

    ERSP_dict = {kind: ERSP_all[indices[kind], :] for kind in ['train', 'test']}
    SLs_dict = {kind: SLs[indices[kind]] for kind in ['train', 'test']}

    # Data augmentation
    if mode == 'SMOTER':
        ERSP_dict['train'], SLs_dict['train'] = data_augmentation.aug(
            ERSP_dict['train'], SLs_dict['train'], 'SMOTER')
    elif mode == 'add_noise':
        ERSP_dict['train'], SLs_dict['train'] = data_augmentation.aug(
            ERSP_dict['train'], SLs_dict['train'], 'add_noise', (10, 1))

    # Concatenate training and testing data
    ERSP_concat = np.concatenate((ERSP_dict['train'], ERSP_dict['test']), axis=0)
    SLs_concat = np.concatenate((SLs_dict['train'], SLs_dict['test']), axis=0)

    start_time = time.time()
    print('[%.1f] Signal to image (%s)' % (time.time() - start_time, mode))

    fileNames = generate_topo(ERSP_concat, freqs, num_time,
                              np.arange(ERSP_dict['train'].shape[0]), index_exp)
    split(fileNames, SLs_concat, len(SLs_dict['test']), random=False, index_exp=index_exp)

    print('[%.1f] Finished S2I' % (time.time() - start_time))

    if mode == 'normal':
        if args.data_cate == 2:
            print('Generate conditional entropy of exp%d' % (index_exp))
            with open('./raw_data/CE_sub%d_channel%d.data'
                      % (args.subject_ID, args.num_channels), 'rb') as fp:
                CE_all = pickle.load(fp)
            for model_mode in ['train', 'test']:
                with open('./raw_data/CE_sub%d_channel%d_exp%d_%s.data'
                          % (args.subject_ID, args.num_channels, index_exp, model_mode),
                          'wb') as fp:
                    pickle.dump(CE_all[indices[model_mode], :], fp)
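# Corresponding single-model call (hedged): mode='normal' skips augmentation;
# this reuses the same illustrative indices as the S2I_ensemble sketch above.
S2I_main(ERSP_all, tmp_all, freqs, indices, mode='normal', num_time=1)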
def main(index_exp, index_split):
    faulthandler.enable()
    torch.cuda.empty_cache()
    best_error = 100
    lr_step = [40, 70, 120]
    multiframe = ['convlstm', 'convfc']
    dirName = '%s_data%d_%s_%s_%s' % (args.model_name, args.data_cate,
                                      args.augmentation, args.loss_type, args.file_name)
    fileName = '%s_split%d_exp%d' % (dirName, index_split, index_exp)

    # Create folder for results of this model
    if not os.path.exists('./results/%s' % (dirName)):
        os.makedirs('./results/%s' % (dirName))

    # ------------- Wrap up dataloader -----------------
    if args.input_type == 'signal':
        X, Y_reg, C = raw_dataloader.read_data([1, 2, 3], list(range(11)),
                                               channel_limit=21, rm_baseline=True)
        num_channel = X.shape[1]
        num_feature = X.shape[2]  # Number of time samples

        # Remove trials
        X, Y_reg = preprocessing.remove_trials(X, Y_reg, threshold=60)

        # Split data for cross validation
        if args.num_fold == 1:
            # Random state 15: training error becomes lower, testing error becomes higher
            train_data, test_data, train_target, test_target = train_test_split(
                X, Y_reg, test_size=0.1, random_state=23)
        else:
            kf = KFold(n_splits=args.num_fold, shuffle=True, random_state=23)
            for i, (train_index, test_index) in enumerate(kf.split(X)):
                if i == index_exp:
                    train_data, train_target = X[train_index, :], Y_reg[train_index]
                    test_data, test_target = X[test_index, :], Y_reg[test_index]

        # Split data for ensemble methods
        if not args.ensemble:
            if args.num_split > 1:
                data_list, target_list = preprocessing.stratified_split(
                    train_data, train_target, n_split=args.num_split, mode=args.split_mode)
                train_data, train_target = data_list[index_split], target_list[index_split]
                '''
                kf = KFold(n_splits=args.num_split, shuffle=True, random_state=32)
                for i, (other_index, split_index) in enumerate(kf.split(train_data)):
                    if i == index_split:
                        train_data, train_target = train_data[split_index, :], train_target[split_index]
                '''

        # Normalize the data
        if args.normalize:
            train_data, test_data = preprocessing.normalize(train_data, test_data)

        # Data augmentation
        if args.augmentation == 'overlapping':
            train_data, train_target = data_augmentation.aug(
                train_data, train_target, args.augmentation, (256, 64, 128))
            test_data, test_target = data_augmentation.aug(
                test_data, test_target, args.augmentation, (256, 64, 128))
        elif args.augmentation == 'add_noise':
            train_data, train_target = data_augmentation.aug(
                train_data, train_target, args.augmentation, (30, 1))
        elif args.augmentation == 'add_noise_minority':
            train_data, train_target = data_augmentation.aug(
                train_data, train_target, args.augmentation, (30, 1))
        elif args.augmentation == 'SMOTER':
            train_data, train_target = data_augmentation.aug(
                train_data, train_target, args.augmentation)

        # Scale data
        if args.scale_data:
            train_data = train_data.reshape((train_data.shape[0], -1))
            test_data = test_data.reshape((test_data.shape[0], -1))
            train_data, test_data = preprocessing.scale(train_data, test_data)
            train_data = train_data.reshape((train_data.shape[0], num_channel, -1))
            test_data = test_data.reshape((test_data.shape[0], num_channel, -1))

        if args.model_name in ['eegnet', 'eegnet_trans_signal']:
            # (sample, channel, time) -> (sample, channel_NN, channel_EEG, time)
            [train_data, test_data] = [X.reshape((X.shape[0], 1, num_channel, num_feature))
                                       for X in [train_data, test_data]]

        (train_dataTS, train_targetTS, test_dataTS, test_targetTS) = map(
            torch.from_numpy, (train_data, train_target, test_data, test_target))
        [train_dataset, test_dataset] = map(
            Data.TensorDataset,
            [train_dataTS.float(), test_dataTS.float()],
            [train_targetTS.float(), test_targetTS.float()])
        if not args.str_sampling:
            train_loader = Data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
            test_loader = Data.DataLoader(test_dataset, batch_size=args.batch_size)
        model_param = [train_data.shape]

    elif args.input_type == 'power':
        if args.data_cate == 1:
            ERSP_all, tmp_all, freqs = dataloader.load_data()
        elif args.data_cate == 2:
            data_file = './raw_data/ERSP_from_raw_%d_channel21.data' % (args.index_sub)
            with open(data_file, 'rb') as fp:
                dict_ERSP = pickle.load(fp)
            ERSP_all, tmp_all = dict_ERSP['ERSP'], dict_ERSP['SLs']
        num_channel = ERSP_all.shape[1]
        num_freq = ERSP_all.shape[2]

        # Remove trials
        ERSP_all, tmp_all = preprocessing.remove_trials(ERSP_all, tmp_all, threshold=60)

        # Split data for cross validation
        if args.num_fold == 1:
            train_data, test_data, train_target, test_target = train_test_split(
                ERSP_all, tmp_all[:, 2], test_size=0.1, random_state=23)
        else:
            kf = KFold(n_splits=args.num_fold, shuffle=True, random_state=23)
            for i, (train_index, test_index) in enumerate(kf.split(ERSP_all)):
                if i == index_exp:
                    train_data, test_data = ERSP_all[train_index, :], ERSP_all[test_index, :]
                    if args.data_cate == 2:
                        train_target, test_target = tmp_all[train_index], tmp_all[test_index]
                    else:
                        train_target, test_target = tmp_all[train_index, 2], tmp_all[test_index, 2]
                    if args.add_CE:
                        assert args.data_cate == 2
                        with open('./raw_data/CE_sub%d' % (args.index_sub), 'rb') as fp:
                            CE = pickle.load(fp)
                        CE_train, CE_test = CE[train_index, :], CE[test_index, :]

                        # PCA for CE
                        pca = PCA(n_components=10)
                        pca.fit(CE_train)
                        CE_train, CE_test = pca.transform(CE_train), pca.transform(CE_test)

        # Split data for ensemble methods
        if not args.ensemble:
            if args.num_split > 1:
                data_list, target_list = preprocessing.stratified_split(
                    train_data, train_target, n_split=args.num_split, mode=args.split_mode)
                train_data, train_target = data_list[index_split], target_list[index_split]
                '''
                kf = KFold(n_splits=args.num_split, shuffle=True, random_state=32)
                for i, (other_index, split_index) in enumerate(kf.split(np.arange(len(train_data)))):
                    if i == index_split:
                        train_data, train_target = train_data[split_index, :], train_target[split_index]
                '''

        # Concatenate train and test for standardizing
        data = np.concatenate((train_data, test_data), axis=0)
        target = np.concatenate((train_target, test_target))

        # Standardize data
        num_train = len(train_data)
        data, target = preprocessing.standardize(data, target,
                                                 train_indices=np.arange(num_train),
                                                 threshold=0.0)
        data = data.reshape((data.shape[0], -1))

        # Scale target between 0 and 1
        if args.post_scale:
            print('Scale the target between 0-1')
            target = target / 60

        # Split data
        train_data, test_data = data[:num_train, :], data[num_train:, :]
        train_target, test_target = target[:num_train], target[num_train:]

        # Data augmentation
        if args.augmentation == 'SMOTER':
            train_data, train_target = data_augmentation.aug(
                train_data, train_target, args.augmentation)

        # Center data
        if args.center_flag:
            train_data, test_data = preprocessing.center(train_data, test_data)

        # Scale data
        if args.scale_data:
            train_data, test_data = preprocessing.scale(train_data, test_data)

        # Add conditional entropy
        if args.add_CE:
            train_data = np.concatenate((train_data, CE_train), axis=1)
            # Fixed: the original concatenated CE_train to the test set as well
            test_data = np.concatenate((test_data, CE_test), axis=1)

        if args.model_name == 'eegnet_trans_power':
            # (sample, channel, freq) -> (sample, channel_NN, channel_EEG, freq)
            [train_data, test_data] = [X.reshape((X.shape[0], 1, num_channel, num_freq))
                                       for X in [train_data, test_data]]
        (train_dataTS, train_targetTS, test_dataTS, test_targetTS) = map(
            torch.from_numpy, (train_data, train_target, test_data, test_target))
        [train_dataset, test_dataset] = map(
            Data.TensorDataset,
            [train_dataTS.float(), test_dataTS.float()],
            [train_targetTS.float(), test_targetTS.float()])
        if not args.str_sampling:
            train_loader = Data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
            test_loader = Data.DataLoader(test_dataset, batch_size=args.batch_size)
        model_param = [train_data.shape]

    elif args.input_type == 'image':
        if args.ensemble:
            input_model_name = args.pre_model_name
        else:
            input_model_name = args.model_name
        assert (input_model_name in multiframe) == (args.num_time > 1)

        # Let input size be 224x224 if the model is vgg16 or resnet50
        if input_model_name in ['vgg16', 'resnet50']:
            input_size = 224
        else:
            input_size = 64

        # Load data
        data_transforms = {
            'train': transforms.Compose([
                ndl.Rescale(input_size, args.num_time),
                ndl.ToTensor(args.num_time)]),
            'test': transforms.Compose([
                ndl.Rescale(input_size, args.num_time),
                ndl.ToTensor(args.num_time)])
        }

        print("Initializing Datasets and Dataloaders...")

        # Create training and testing datasets
        # image_datasets = {x: ndl.TopoplotLoader(args.image_folder, x, args.num_time, data_transforms[x],
        #                   scale=args.scale_image, index_exp=index_exp, index_split=index_split) for x in ['train', 'test']}
        [train_dataset, test_dataset] = [
            ndl.TopoplotLoader(args.image_folder, x, args.num_time, data_transforms[x],
                               scale=args.scale_image, index_exp=index_exp,
                               index_split=index_split)
            for x in ['train', 'test']]

        # Create training and testing dataloaders
        # if not args.str_sampling:
        #     train_loader = Data.DataLoader(image_datasets['train'], batch_size=args.batch_size, shuffle=True, num_workers=4)
        #     test_loader = Data.DataLoader(image_datasets['test'], batch_size=args.batch_size, shuffle=False, num_workers=4)
        if not args.str_sampling:
            train_loader = Data.DataLoader(train_dataset, batch_size=args.batch_size,
                                           shuffle=True, num_workers=4)
            test_loader = Data.DataLoader(test_dataset, batch_size=args.batch_size,
                                          shuffle=False, num_workers=4)
        model_param = [input_size]

    elif args.input_type == 'EEGLearn_img':
        # Load data
        with open('./EEGLearn_imgs/data1.data', 'rb') as fp:
            dict_data = pickle.load(fp)
        data, target = dict_data['data'], dict_data['target']
        input_size = data.shape[2]

        # Split data for cross validation
        if args.num_fold == 1:
            # Random state 15: training error becomes lower, testing error becomes higher
            train_data, test_data, train_target, test_target = train_test_split(
                data, target, test_size=0.1, random_state=23)
        else:
            kf = KFold(n_splits=args.num_fold, shuffle=True, random_state=23)
            for i, (train_index, test_index) in enumerate(kf.split(data)):
                if i == index_exp:
                    train_data, train_target = data[train_index, :], target[train_index]
                    test_data, test_target = data[test_index, :], target[test_index]

        (train_dataTS, train_targetTS, test_dataTS, test_targetTS) = map(
            torch.from_numpy, (train_data, train_target, test_data, test_target))
        [train_dataset, test_dataset] = map(
            Data.TensorDataset,
            [train_dataTS.float(), test_dataTS.float()],
            [train_targetTS.float(), test_targetTS.float()])
        if not args.str_sampling:
            train_loader = Data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
            test_loader = Data.DataLoader(test_dataset, batch_size=args.batch_size)

    # ------------ Create model ---------------
    if args.input_type in ['image', 'EEGLearn_img']:
        model_param = [input_size]
    else:
        model_param = [train_data.shape]

    if not args.ensemble:
        model = read_model(args.model_name, model_param)
    else:
        pre_models = []
        for i in range(args.num_split):
            pre_model = read_model(args.pre_model_name, model_param)
            pre_model.load_state_dict(
                torch.load('%s/last_model_exp%d_split%d.pt' % (args.ensemble, index_exp, i)))
            set_parameter_requires_grad(pre_model, True)
            pre_models.append(pre_model)
        model = models.__dict__[args.model_name](pre_models)
    print('Use model %s' % (args.model_name))

    # Run on GPU
    model = model.to(device=device)
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)

    # Define loss function (criterion) and optimizer
    if args.loss_type == 'L2':
        criterion = nn.MSELoss().to(device=device)
    elif args.loss_type == 'L1':
        criterion = nn.L1Loss().to(device=device)
    elif args.loss_type == 'L4':
        criterion = L4Loss
    elif args.loss_type == 'MyLoss':
        criterion = MyLoss
    print('Use %s loss' % (args.loss_type))

    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr_rate, momentum=0.9)
    # optimizer = torch.optim.Adam(model.parameters(), lr=args.lr_rate)

    # Record loss and accuracy of each epoch
    dict_error = {'train_std': list(range(args.num_epoch)),
                  'test_std': list(range(args.num_epoch)),
                  'train_mape': list(range(args.num_epoch)),
                  'test_mape': list(range(args.num_epoch))}

    # Optionally evaluate the trained model
    if args.evaluate:
        if args.resume:
            if os.path.isfile(args.resume):
                model.load_state_dict(torch.load(args.resume))
                _, target, pred, _, _ = validate(test_loader, model, criterion)
                plot_scatter(target, pred, dirName, fileName)
        return 0

    # Optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_error = checkpoint['best_error']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            dict_error['train_std'][:args.start_epoch] = checkpoint['dict_error']['train_std']
            dict_error['test_std'][:args.start_epoch] = checkpoint['dict_error']['test_std']
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # ------------- Train model ------------------
    for epoch in range(args.start_epoch, args.num_epoch):
        # Create dataloader when using the stratified sampler
        if args.str_sampling:
            sampler = SubsetRandomSampler(
                get_indices_RSS(train_target, int(0.5 * len(train_target))))
            train_loader = Data.DataLoader(train_dataset, batch_size=args.batch_size,
                                           sampler=sampler, num_workers=4)

        # Learning rate decay
        if epoch in lr_step:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= 0.1

        # Train for one epoch
        _, dict_error['train_std'][epoch], dict_error['train_mape'][epoch] = \
            train(train_loader, model, criterion, optimizer, epoch)

        # Evaluate on the validation set
        _, _, _, std_error, dict_error['test_mape'][epoch] = validate(test_loader, model, criterion)
        dict_error['test_std'][epoch] = std_error

        # Remember best standard error and save checkpoint
        is_best = std_error < best_error
        best_error = min(std_error, best_error)
        save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_error': best_error,
            'optimizer': optimizer.state_dict(),
            'dict_error': dict_error
        }, is_best)

        # Save best model
        if is_best:
            torch.save(model.state_dict(),
                       './results/%s/best_model_exp%d_split%d.pt' % (dirName, index_exp, index_split))
        if epoch == args.num_epoch - 1:
            torch.save(model.state_dict(),
                       './results/%s/last_model_exp%d_split%d.pt' % (dirName, index_exp, index_split))

    # Plot error curve
    plot_error(dict_error, dirName, fileName)
    # Plot scatter plots
    _, target, pred, _, _ = validate(test_loader, model, criterion)
    plot_scatter(target, pred, dirName, fileName)
    dict_error['target'], dict_error['pred'] = target, pred

    # Plot histogram
    import matplotlib.pyplot as plt
    plt.hist(target, label='True')
    plt.hist(pred, label='Pred')
    plt.legend(loc='upper right')
    plt.savefig('./results/hist.png')

    # Save error over epochs
    with open('./results/%s/%s.data' % (dirName, fileName), 'wb') as fp:
        pickle.dump(dict_error, fp)
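# save_checkpoint() is called in the training loop above but not shown here;
# a minimal sketch under the assumption that it follows the common PyTorch
# pattern (the file names are illustrative, not taken from this repository).
import shutil

import torch


def save_checkpoint(state, is_best, filename='checkpoint.pt'):
    # Persist the full training state (epoch, weights, optimizer, errors)
    torch.save(state, filename)
    # Keep a separate copy of the best checkpoint seen so far
    if is_best:
        shutil.copyfile(filename, 'model_best.pt')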
if not classical:
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(device)

# Load data
if args.input_type == 'signal':
    X, Y, _, S, D = raw_dataloader.read_data([1, 2, 3], range(11),
                                             channel_limit=21, rm_baseline=True)
elif args.input_type == 'ERSP':
    with open('./raw_data/ERSP_from_raw_100_channel21.data', 'rb') as fp:
        dict_ERSP = pickle.load(fp)
    ERSP, Y, S, D = (dict_ERSP['ERSP'], dict_ERSP['SLs'],
                     dict_ERSP['Sub_ID'], dict_ERSP['D'])
    X, Y = preprocessing.standardize(ERSP, Y, threshold=0.0)
elif args.input_type == 'bp_ratio':
    X, Y, _, S, D = raw_dataloader.read_data([1, 2, 3], range(11),
                                             channel_limit=21, rm_baseline=True)
    low, high = [4, 7, 13], [7, 13, 30]
    X = bandpower.get_bandpower(X, low=low, high=high)
    X = add_features.get_bandpower_ratio(X)

# Create folder for results of this model
if not os.path.exists('./results/%s' % (args.dirName)):
    os.makedirs('./results/%s' % (args.dirName))

# LOSO or CV
if args.cv_mode == 'LOSO':
if not os.path.exists('./results/%s' % (args.dirName)):
    os.makedirs('./results/%s' % (args.dirName))

# Load data
if args.input_type == 'signal':
    data, SLs, _, S, D = raw_dataloader.read_data([1, 2, 3], range(11),
                                                  channel_limit=21, rm_baseline=True)
    # data = np.random.rand(data.shape[0], data.shape[1], data.shape[2])
elif args.input_type == 'ERSP':
    with open('./raw_data/ERSP_from_raw_100_channel21.data', 'rb') as fp:
        dict_ERSP = pickle.load(fp)
    data, SLs, S, D = (dict_ERSP['ERSP'], dict_ERSP['SLs'],
                       dict_ERSP['Sub_ID'], dict_ERSP['D'])
    data, SLs = preprocessing.standardize(data, SLs, threshold=0.0)
elif args.input_type == 'bp_ratio':
    data, SLs, _, S, D = raw_dataloader.read_data([1, 2, 3], range(11),
                                                  channel_limit=21, rm_baseline=True)
    low, high = [4, 7, 13], [7, 13, 30]
    data = bandpower.get_bandpower(data, low=low, high=high)
    data = add_features.get_bandpower_ratio(data)
elif args.input_type == 'bandpower':
    data, SLs, _, S, D = raw_dataloader.read_data([1, 2, 3], range(11),
                                                  channel_limit=21, rm_baseline=True)
    low, high = [4, 7, 13], [7, 13, 30]
    data = bandpower.get_bandpower(data, low=low, high=high)
[[150], [100], [0, 1, 10], [1], [1e-1], [1e-1], [5], [10]]

# Set seed for reproducibility
np.random.seed(98)

# Iterate over each subset and build a model;
# the predictions of every single model are combined
for i in range(num_subsets):
    # Extract the train/test subsets
    y_train_subset, X_train_subset, ids_train_subset = train_subsets[i]
    y_test_subset, X_test_subset, ids_test_subset = test_subsets[i]

    # Map the categorical output labels into [0, 1]
    y_train_subset = map_0_1(y_train_subset)

    # Standardize the data
    X_train_subset, X_test_subset = standardize(X_train_subset, X_test_subset)
    print(f"Train shape before feature expansion: {str(X_train_subset.shape):>12} "
          f"Test shape: {str(X_test_subset.shape):>12}")

    # Build the polynomial features and expand the data
    X_train_subset = build_poly(X_train_subset, max_degree[i])
    X_test_subset = build_poly(X_test_subset, max_degree[i])
    print(f"Train shape after feature expansion: {str(X_train_subset.shape):>12} "
          f"Test shape: {str(X_test_subset.shape):>12}")

    # Set n_best_features to X_train_subset.shape[1] if you don't want feature selection
    n_best_features = round(fs_perc[i] * X_train_subset.shape[1])
    D = n_best_features
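# standardize() and build_poly() are imported from elsewhere in this project;
# minimal sketches under the usual conventions (test set scaled with training
# statistics, polynomial expansion by stacking powers of each column). These
# are assumptions, not the repository's exact implementations.
import numpy as np


def standardize(x_train, x_test):
    # Center and scale both sets with the training mean/std
    mean, std = x_train.mean(axis=0), x_train.std(axis=0)
    return (x_train - mean) / std, (x_test - mean) / std


def build_poly(x, degree):
    # Stack [x, x^2, ..., x^degree] column-wise
    return np.concatenate([x ** d for d in range(1, degree + 1)], axis=1)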
def main():
    # Load the data
    # Training dataset
    DATA_TRAIN_PATH = '../data/train.csv'
    y, X, ids = load_csv_data(DATA_TRAIN_PATH)
    # Testing dataset
    DATA_TEST_PATH = '../data/test.csv'
    y_t, X_t, ids_t = load_csv_data(DATA_TEST_PATH)

    # Separate training and testing sets into 4 different categories depending
    # on the PRI_jet_num feature with index -8
    feature = -8
    X_cat = preproc.get_categories(X, feature=feature)
    X_t_cat = preproc.get_categories(X_t, feature=feature)

    # Loop over every v in range(4) to obtain the 4 predictions,
    # then concatenate and create the submission file
    y_pred_all = []

    # Best hyperparameters (degree and corresponding lambda) for each
    # category, found using cross validation
    degrees = [10, 10, 9, 9]
    lambdas = [0.00047508101621, 7.05480231072e-07, 0.000343046928631, 5.72236765935e-05]

    for v in range(4):
        # Extract category (test, train and labels)
        Xv = X[X_cat[v]]
        Xv_t = X_t[X_t_cat[v]]
        y_v = y[X_cat[v]]

        # Concatenate the train and testing set
        all_Xv = np.concatenate((Xv, Xv_t), axis=0)

        # Find features (bad_features) with a unique value
        bad_features = []
        for i in range(len(all_Xv.T)):
            if len(np.unique(all_Xv.T[i])) == 1:
                bad_features.append(i)

        # Delete bad_features and fill missing values
        all_Xv_c = np.delete(all_Xv, bad_features, axis=1)
        all_Xv_filled = preproc.fill_missing_values(all_Xv_c, tresh=1)

        # Separate train and test
        Xv_f = all_Xv_filled[:len(Xv)]
        Xv_t_f = all_Xv_filled[len(Xv):]

        # Standardize the dataset
        tXv, mean_x, std_x = preproc.standardize(Xv_f)
        tXv_t, mean_x, std_x = preproc.standardize(Xv_t_f)

        # Generate the model
        final_degree = degrees[v]
        best_lambda = lambdas[v]

        # Build the polynomial basis, perform ridge regression
        final_X = impl.build_poly(tXv, final_degree)
        final_Xt = impl.build_poly(tXv_t, final_degree)

        # Generate the model (using ridge regression)
        final_w, loss_ = impl.ridge_regression(y_v, final_X, best_lambda)

        # Generate predictions for this category
        y_predv = predict_labels(final_w, final_Xt)
        y_pred_all.append(y_predv)
        p = len(X_cat[v]) / len(X)

    # Concatenate all predictions, and sort them by indices
    Xt_cat_all = [idx for sublist in X_t_cat for idx in sublist]
    y_pred = [yi for sublist in y_pred_all for yi in sublist]
    final_ypred = np.asarray(y_pred)[np.argsort(Xt_cat_all)]

    # Create submission file
    OUTPUT_PATH = '../submissions/results__4categories_fillByCat_run.csv'
    create_csv_submission(ids_t, final_ypred, OUTPUT_PATH)
    print('Congratulations ........ Submission file created ::: ', OUTPUT_PATH)
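# impl.ridge_regression() is not shown in this collection; a sketch of the
# closed-form solution it presumably implements, w = (X^T X + lambda' I)^(-1) X^T y
# with lambda' = 2 N lambda (a common convention in such projects, assumed here).
import numpy as np


def ridge_regression(y, tx, lambda_):
    n, d = tx.shape
    a = tx.T @ tx + 2 * n * lambda_ * np.identity(d)
    b = tx.T @ y
    w = np.linalg.solve(a, b)
    loss = np.mean((y - tx @ w) ** 2) / 2  # MSE/2 on the training set
    return w, loss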
        for s, sample in enumerate(data):
            augData[s, :] = sample + (components * coeffs.reshape(
                (n_components, -1))).sum(axis=0)
    else:
        # Add Gaussian noise with std determined by weighted std of each feature
        for f, feat in enumerate(data.transpose()):
            augData[:, f] = feat + np.random.normal(scale=stdMult * np.std(feat),
                                                    size=feat.size)

    return augData


if __name__ == '__main__':
    ERSP_all, tmp_all, freqs = dataloader.load_data()
    ERSP_all, SLs = preprocessing.standardize(ERSP_all, tmp_all, threshold=0.0)
    num_channels = ERSP_all.shape[1]
    ERSP_all, SLs = preprocessing.remove_trials(ERSP_all, SLs, 60.0)
    num_example = ERSP_all.shape[0]

    # Concatenate theta, alpha, beta band powers
    low, high = [4, 7, 13], [7, 13, 30]
    for i in range(len(low)):
        bp_i = preprocessing.bandpower(ERSP_all, freqs, [low[i]],
                                       [high[i]]).reshape((num_example, -1))
        if i == 0:
            bp_all = bp_i
        else:
            bp_all = np.concatenate((bp_all, bp_i), axis=1)
def run_benchmark(learners_obj,
                  learners_name,  # For generating reports
                  n_runs=30,
                  n_folds=5,
                  noise_level=0.0,
                  preprocessing='none',
                  report_name="benchmark.pdf",
                  verbose=False):
    """Run benchmark tests to evaluate a method and generate a latex-compatible report.

    Parameters:
    -----------
    learners_obj: list or tuple
        Learner objects (with fit/score) to be tested
    learners_name: list or tuple of strings
        Names of the learners, used in the report
    n_runs: int, default 30
        Runs of cross-validation
    n_folds: int, default 5
        Number of folds. Must be at least 2. Typically 5 or 10
    noise_level: number, default 0.0
        Percentage of samples whose labels are flipped
    preprocessing: str, default 'none'
        Preprocessing method. Must be 'none', 'normalize' or 'standardize'
    report_name: string, default "benchmark.pdf"
        Name of the report that will be generated
    verbose: bool, default False
        If true, display progress
    """
    # Check parameters
    # Run benchmark tests
    # ---------------------------------------------------------------------------#
    # Result format                                                               #
    #                                                                             #
    # ROW 0 : ds_name, run, cv_fold, method_index, train_error, test_error        #
    # ROW 1 : ds_name, run, cv_fold, method_index, train_error, test_error        #
    #                                                                             #
    # ROW k : ds_name, run, cv_fold, method_index, train_error, test_error        #
    # ---------------------------------------------------------------------------#
    benchmark_results = []
    ds_count = -1
    for ds in g_benchmark_datasets:
        ds_count += 1
        X, y, n_samples, n_features = _load_data(ds, noise_level)
        for irun in range(n_runs):
            # Cross-validation (updated from the legacy
            # sklearn.cross_validation.KFold(n, n_folds=...) API)
            kf = KFold(n_splits=n_folds, shuffle=True)
            fold_count = -1
            for train_idx, test_idx in kf.split(X):
                fold_count += 1
                X_train = X[train_idx, :]
                y_train = y[train_idx]
                X_test = X[test_idx, :]
                y_test = y[test_idx]

                if preprocessing == 'standardize':
                    X_train = standardize(X_train)
                    X_test = standardize(X_test)
                if preprocessing == 'normalize':
                    X_train = normalize(X_train)
                    X_test = normalize(X_test)

                # Call learners
                method_count = -1
                for learner in learners_obj:
                    method_count += 1
                    lcopy = copy.deepcopy(learner)
                    lcopy.fit(X_train, y_train)
                    train_err = 1.0 - lcopy.score(X_train, y_train)
                    test_err = 1.0 - lcopy.score(X_test, y_test)
                    result = [ds_count, irun, fold_count, method_count,
                              train_err, test_err]
                    benchmark_results.append(result)
                    if verbose:
                        print('dataset: {0} run: {1} cv_fold: {2} '
                              'method: {3} train_error: {4} test_error: {5}'
                              .format(ds, irun, fold_count,
                                      method_count, train_err, test_err))
                    # print(result)

    # Save results
    print('Saving results ... ')
    global g_result_top_dir
    g_result_top_dir = os.path.join(os.getcwd(), "benchmark_results")
    if not os.path.exists(g_result_top_dir):
        os.mkdir(g_result_top_dir)
    if ".pdf" not in report_name.lower():
        report_name += '.pdf'
    dir_name = report_name.replace('.pdf', '')
    if not os.path.exists(os.path.join(g_result_top_dir, dir_name)):
        os.mkdir(os.path.join(g_result_top_dir, dir_name))
    datetime = time.strftime("%H%M%S%d_%m_%Y")
    result_file_name = os.path.join(os.path.join(g_result_top_dir, dir_name),
                                    "benchmark_results_" + datetime)
    np.save(result_file_name, benchmark_results)

    # Save meta information: names of datasets and methods for generating a report
    np.save(result_file_name + '_methods_name', learners_name)
    np.save(result_file_name + '_datasets_name', g_benchmark_datasets)

    # Generate report
    print('Generating reports ... ')
    _generate_report(report_name=report_name,
                     result_file=result_file_name,
                     methods_name=learners_name,
                     n_runs=n_runs,
                     n_folds=n_folds)
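# Hedged invocation example: any scikit-learn-style estimators with fit/score
# can serve as learners; the names list is only used when building the report,
# and the report name below is an illustrative choice.
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

run_benchmark(learners_obj=(DecisionTreeClassifier(), SVC()),
              learners_name=('DecisionTree', 'SVM'),
              n_runs=5,
              n_folds=5,
              preprocessing='standardize',
              report_name='tree_vs_svm.pdf',
              verbose=True)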
def get_data(use_preexisting=True,
             save_preprocessed=True,
             z_outlier=False,
             feature_expansion=False,
             correlation_analysis=False,
             class_equalizer=False,
             M=4,
             z_value=3.0):
    """
    Data supplying function.

    This function has the purpose of loading data and applying preprocessing.
    It includes many features such as downloading the data from the github
    repository, saving the data (for fast reuse), applying different
    preprocessing algorithms, etc...

    Args:
        use_preexisting (bool): if existent, enabling this parameter will allow
            the function to use previously preprocessed and saved data files
        save_preprocessed (bool): enabling this parameter will allow the
            function to save the preprocessed data
        z_outlier (Union[int, bool]): enabling this parameter will allow the
            function to perform z outlier detection
        feature_expansion (bool): enabling this parameter will allow the
            function to perform exponential feature expansion
        correlation_analysis (Union[int, bool]): enabling this parameter will
            allow the function to perform correlation analysis and remove
            highly correlated features
        class_equalizer (Union[int, bool]): enabling this parameter will allow
            the function to perform class balancing
        M (Union[int, list]): feature expansion parameter per group
        z_value (Union[float, list]): outlier detection threshold per group

    Returns:
        list: groups of training samples
        list: corresponding groups of training labels
        list: corresponding indexes of affiliated training rows
        list: groups of test samples
        list: corresponding groups of test labels
        list: corresponding indexes of affiliated test rows
        list: list of indexes of testing (for creating submissions)
    """
    if os.path.isdir(config.DATA_PATH) and os.path.isdir(
            config.PREPROCESSED_PATH) and use_preexisting:
        print("[*] Using previously preprocessed Data")
        groups_tr_X = np.load(config.PREPROCESSED_X_TR_GROUPS_NPY, allow_pickle=True)
        groups_tr_Y = np.load(config.PREPROCESSED_Y_TR_GROUPS_NPY, allow_pickle=True)
        indc_list_tr = np.load(config.PREPROCESSED_GROUP_INDEX_TR_NPY, allow_pickle=True)
        groups_te_X = np.load(config.PREPROCESSED_X_TE_GROUPS_NPY, allow_pickle=True)
        groups_te_Y = np.load(config.PREPROCESSED_Y_TE_GROUPS_NPY, allow_pickle=True)
        indc_list_te = np.load(config.PREPROCESSED_GROUP_INDEX_TE_NPY, allow_pickle=True)
        ids_te = np.load(config.PREPROCESSED_IDS_TE_GROUPS_NPY, allow_pickle=True)
    else:
        if not (os.path.isdir(config.DATA_PATH)
                and os.path.isfile(config.TRAIN_DATA_CSV_PATH)
                and os.path.isfile(config.TEST_DATA_CSV_PATH)):
            Path(config.DATA_PATH).mkdir(exist_ok=True)
            download_url(config.TRAIN_URL, config.TRAIN_DATA_CSV_PATH)
            download_url(config.TEST_URL, config.TEST_DATA_CSV_PATH)
        print("[*] Creating preprocessed Data")

        # Load data from csv files
        Y_tr, X_tr, ids_tr = load_csv_data(config.TRAIN_DATA_CSV_PATH)
        Y_te, X_te, ids_te = load_csv_data(config.TEST_DATA_CSV_PATH)
        groups_tr_Y, groups_tr_X, indc_list_tr = split_groups(Y_tr, X_tr)
        groups_te_Y, groups_te_X, indc_list_te = split_groups(Y_te, X_te)
        nr_groups_tr = len(indc_list_tr)

        # Make scalar options into per-group lists
        z_outlier = make_to_list(z_outlier)
        class_equalizer = make_to_list(class_equalizer)
        correlation_analysis = make_to_list(correlation_analysis)
        M = make_to_list(M)

        for indx in range(nr_groups_tr):
            # Perform z outlier detection
            if z_outlier[indx]:
                groups_tr_X[indx] = z_score_outlier_detection(
                    groups_tr_X[indx], thresh=z_value)
                groups_te_X[indx] = z_score_outlier_detection(
                    groups_te_X[indx], thresh=z_value)

            # Perform correlation analysis
            if correlation_analysis[indx]:
                groups_tr_X[indx], columns_to_keep = corr_filter(
                    groups_tr_X[indx], threshold=0.95)
                groups_te_X[indx] = groups_te_X[indx][:, columns_to_keep]

            # Perform class equalization
            if class_equalizer[indx]:
                groups_tr_X[indx], groups_tr_Y[indx] = class_imbalance_equalizer(
                    groups_tr_X[indx], groups_tr_Y[indx])

            # Perform feature expansion
            if feature_expansion:
                groups_tr_X[indx] = augment_features_polynomial(
                    groups_tr_X[indx], M=M[indx])
                groups_te_X[indx] = augment_features_polynomial(
                    groups_te_X[indx], M=M[indx])

            # Standardize features
            groups_tr_X[indx] = standardize(groups_tr_X[indx])
            groups_te_X[indx] = standardize(groups_te_X[indx])

            # Add bias
            groups_tr_X[indx] = add_bias(groups_tr_X[indx])
            groups_te_X[indx] = add_bias(groups_te_X[indx])
            print(f"\t [+]Group {indx + 1} finished!")

        if save_preprocessed:
            Path(config.PREPROCESSED_PATH).mkdir(exist_ok=True)
            np.save(config.PREPROCESSED_X_TR_GROUPS_NPY, groups_tr_X, allow_pickle=True)
            np.save(config.PREPROCESSED_Y_TR_GROUPS_NPY, groups_tr_Y, allow_pickle=True)
            np.save(config.PREPROCESSED_X_TE_GROUPS_NPY, groups_te_X, allow_pickle=True)
            np.save(config.PREPROCESSED_Y_TE_GROUPS_NPY, groups_te_Y, allow_pickle=True)
            np.save(config.PREPROCESSED_GROUP_INDEX_TR_NPY, indc_list_tr, allow_pickle=True)
            np.save(config.PREPROCESSED_GROUP_INDEX_TE_NPY, indc_list_te, allow_pickle=True)
            np.save(config.PREPROCESSED_IDS_TE_GROUPS_NPY, ids_te, allow_pickle=True)
            print("[+] Saved Preprocessed Data")

    return groups_tr_X, groups_tr_Y, indc_list_tr, groups_te_X, groups_te_Y, indc_list_te, ids_te
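# Hedged usage sketch for get_data(); the flags below are illustrative
# choices, not tuned settings from this repository.
(groups_tr_X, groups_tr_Y, indc_list_tr,
 groups_te_X, groups_te_Y, indc_list_te, ids_te) = get_data(
    use_preexisting=True,
    save_preprocessed=True,
    z_outlier=True,
    feature_expansion=True,
    correlation_analysis=True,
    M=4,
    z_value=3.0)
print(f"{len(groups_tr_X)} training groups loaded")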
print("Before:\n", data, "\n")
assigned_time = prep.assign_time(data, "1/10/2019", 1)
print("With Assigned Time:\n")
print(assigned_time)

# ---------------------
print("--------Preprocessing Test--------")
print(" -difference()- ")
test_data = prep.read_from_file("../TestData/AtmPres2005NovMin.csv")
difference_ts = prep.difference(test_data)
print(difference_ts)

# ---------------------
print("--------Preprocessing Test--------")
print(" -scaling()- ")
scaled = prep.scaling(assigned_time)
print(scaled)

# ----------------
print("--------Preprocessing Test--------")
print(" -standardize()- ")
standardized = prep.standardize(test_data)
print(standardized, "\n")

# ----------------
print("--------Preprocessing Test--------")
print(" -logarithm()- ")
log = prep.logarithm(test_data)
print(log, "\n")

# ----------------
print("--------Preprocessing Test--------")
print(" -cubic_roots()- ")
cubed = prep.cubic_roots(test_data)
print(cubed)