def __init__(self, trainset, testset, net, _id=None):  # TODO: multiple GPU
    self.trainset = trainset
    if self.trainset is not None:
        self.trainloader = torch.utils.data.DataLoader(
            dataset=self.trainset, batch_size=4, shuffle=True, num_workers=2)
    self.testset = testset
    if self.testset is not None:
        self.testloader = torch.utils.data.DataLoader(
            dataset=self.testset, batch_size=4, shuffle=False, num_workers=2)
    self.net = net
    self.criterion = nn.CrossEntropyLoss()
    self.optimizer = optim.SGD(self.net.parameters(), lr=0.001, momentum=0.9)
    assert _id is not None  # TODO: assert error, global_id
    self._id = _id
    self.PATH = "clients/" + str(self._id) + "/"  # TODO: the other PATH for log
    recursive_mkdir(self.PATH)
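# Every snippet in this file calls a `recursive_mkdir` helper that is defined
# elsewhere. A minimal sketch of what it is assumed to do (create the target
# directory along with any missing parents, and do nothing if it already
# exists):
import os

def recursive_mkdir(path):
    """Create `path` and all missing parent directories (no-op if present)."""
    os.makedirs(path, exist_ok=True)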
def calc_and_plot(data, optimizer, norm, lr, epochs, iteration, batch_size):
    data_dir = f'{batch_size}-weights-{optimizer}-{lr}-{norm}'
    calc(data, optimizer, norm, lr, epochs, iteration, batch_size)
    plot_parent_dir = './plots'
    for i in range(4):
        data_path = f'{data_dir}/store-{i}.pkl'
        plot_path = f'{plot_parent_dir}/{data_dir}/layer{i}'
        recursive_mkdir(plot_path)
        plot(plot_path, data_path)
def main():
    iteration = 0
    for data in configs['data']:
        for norm in configs['norm']:
            for optimizer in configs['optimizer']:
                for lr in configs['lr']:
                    iteration += 1
                    run_dir = f'{data}-weights-{optimizer}-{lr}-{norm}'
                    data_dir = f'{run_dir}/store.pkl'
                    plot_dir = f'./plots/{run_dir}'
                    calc(data, optimizer, norm, lr, epochs, iteration)
                    recursive_mkdir(plot_dir)
                    plot(plot_dir, data_dir)
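# `main()` above sweeps a `configs` grid and an `epochs` setting that are
# defined elsewhere. A hypothetical example of the expected shape (the
# concrete values here are illustrative, not taken from the original project):
configs = {
    'data': ['cifar10'],
    'norm': ['batch', 'layer'],
    'optimizer': ['sgd', 'adam'],
    'lr': [0.1, 0.01],
}
epochs = 10  # assumed module-level setting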
def main(args):
    print('available GPUs:', K.tensorflow_backend._get_available_gpus())

    # create checkpoint directory
    ck_path = './checkpoints/' + args.exp_nm
    if not os.path.exists(ck_path):
        recursive_mkdir(ck_path)
    all_res_path = os.path.join(ck_path, 'result_summary.csv')

    split_index_dict = get_split_index_dict()
    for fold in range(num_folds):
        log_dir = os.path.join(ck_path, 'fold_' + str(fold) + '/')
        # skip this fold if its log directory already exists
        if not os.path.exists(log_dir):
            os.mkdir(log_dir)
        else:
            continue

        train_patient_indexes = split_index_dict[str(fold)]['train_patient_indexes']
        val_patient_indexes = split_index_dict[str(fold)]['val_patient_indexes']

        # for each of the 5 folds, train & validate the model;
        # `fold_mean_score` is a dictionary of mean scores
        fold_mean_score = train(
            log_dir=log_dir,
            fold=fold,
            train_patient_indexes=train_patient_indexes,  # TODO
            val_patient_indexes=val_patient_indexes,
            data_file_path=args.data_file_path
        )
        fold_mean_score['fold'] = fold
        res_df = pd.DataFrame.from_dict(fold_mean_score)
        # write the CSV header only on the first append
        write_header = not os.path.exists(all_res_path)
        res_df.to_csv(all_res_path, mode='a', index=False, header=write_header)

    print('Final score from', num_folds, 'fold cross-validation saved to',
          all_res_path)
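# `get_split_index_dict()` is defined elsewhere; from its usage above it is
# assumed to return one entry per fold, keyed by the fold number as a string,
# e.g. (illustrative values only):
#
# {
#     '0': {'train_patient_indexes': [0, 1, 2, ...],
#           'val_patient_indexes': [37, 42, ...]},
#     '1': {...},
#     ...
# }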
def run(data_fn, prop_missing=0., max_num_feature=-1,
        feature_selection='random', k=10, data_dir='_data', out_dir='_out'):
    """Run RIDDLE classification interpretation pipeline.

    Arguments:
        data_fn: string
            data file filename
        prop_missing: float
            proportion of feature observations which should be randomly
            masked; values in [0, 1)
        max_num_feature: int
            maximum number of features to use
        feature_selection: string
            feature selection method; values = {'random', 'frequency', 'chi2'}
        k: int
            number of partitions for k-fold cross-validation
        data_dir: string
            directory where data files are located
        out_dir: string
            outer directory where outputs (e.g., results) should be saved
    """
    from keras.models import load_model
    from riddle import emr, feature_importance
    from riddle.models import MLP

    start = time.time()

    base_out_dir = get_base_out_dir(out_dir, 'riddle', data_fn, prop_missing,
                                    max_num_feature, feature_selection)
    recursive_mkdir(base_out_dir)

    # get common data
    x_unvec, y, idx_feat_dict, idx_class_dict, icd9_descript_dict, perm_indices = (
        get_preprocessed_data(data_dir, data_fn, prop_missing=prop_missing))
    num_feature = len(idx_feat_dict)
    num_class = len(idx_class_dict)

    list_sums_D, list_sums_D2, list_sums_contribs = [], [], []

    for k_idx in range(k):
        full_out_dir = '{}/k_idx={}'.format(base_out_dir, k_idx)
        print('\nPartition k = {}'.format(k_idx))

        x_train_unvec, y_train, _, _, x_test_unvec, y_test = emr.get_k_fold_partition(
            x_unvec, y, k_idx=k_idx, k=k, perm_indices=perm_indices)

        if max_num_feature > 0:  # select features and re-encode
            feat_encoding_dict, idx_feat_dict = select_features(
                x_train_unvec, y_train, idx_feat_dict,
                method=feature_selection, num_feature=num_feature,
                max_num_feature=max_num_feature)
            x_test_unvec = subset_reencode_features(x_test_unvec,
                                                    feat_encoding_dict)
            num_feature = max_num_feature

        # interpret the trained model saved for this partition
        start = time.time()
        temp_mlp = MLP(num_feature=num_feature, num_class=num_class)
        hdf5_path = full_out_dir + '/model.h5'
        sums_D, sums_D2, sums_contribs, pairs = \
            feature_importance.get_diff_sums(
                hdf5_path,
                x_test_unvec,
                process_x_func=temp_mlp.process_x,
                num_feature=num_feature,
                num_class=num_class)

        with open(full_out_dir + '/sums_D.pkl', 'wb') as f:
            pickle.dump(sums_D, f)
        with open(full_out_dir + '/sums_D2.pkl', 'wb') as f:
            pickle.dump(sums_D2, f)
        with open(full_out_dir + '/sums_contribs.pkl', 'wb') as f:
            pickle.dump(sums_contribs, f)

        list_sums_D.append(sums_D)
        list_sums_D2.append(sums_D2)
        list_sums_contribs.append(sums_contribs)

    def compute_total_sums(list_sums):
        # element-wise sum across the k partitions
        total_sums = list_sums[0]
        for i in range(1, len(list_sums)):
            for j in range(len(total_sums)):
                total_sums[j] = np.add(total_sums[j], list_sums[i][j])
        return total_sums

    total_sums_D = compute_total_sums(list_sums_D)
    total_sums_D2 = compute_total_sums(list_sums_D2)
    total_sums_contribs = compute_total_sums(list_sums_contribs)

    num_sample = len(x_unvec)
    run_interpretation_summary(
        x_unvec, y, total_sums_D, total_sums_D2, total_sums_contribs,
        idx_feat_dict=idx_feat_dict, idx_class_dict=idx_class_dict,
        icd9_descript_dict=icd9_descript_dict, pairs=pairs,
        num_sample=num_sample, full_out_dir=base_out_dir)

    print('Computed DeepLIFT scores and analysis in {:.4f} seconds'.format(
        time.time() - start))
    print('-' * 72)
    print()
def run_kfold(data_fn, method='logit', prop_missing=0., max_num_feature=-1,
              feature_selection='random', k=10, max_num_sample=10000,
              num_search=30, data_dir='_data', cache_dir='_cache',
              force_run=False):
    """Run several parameter searches a la k-fold cross-validation.

    Arguments:
        data_fn: string
            data file filename
        method: string
            name of classification method; values = {'logit', 'random_forest',
            'linear_svm', 'poly_svm', 'rbf_svm', 'gbdt', 'riddle'}
        prop_missing: float
            proportion of feature observations which should be randomly
            masked; values in [0, 1)
        max_num_feature: int
            maximum number of features to use
        feature_selection: string
            feature selection method; values = {'random', 'frequency', 'chi2'}
        k: int
            number of partitions for k-fold cross-validation
        max_num_sample: int
            maximum number of samples to use
        num_search: int
            number of searches (parameter configurations) to try for each
            partition
        data_dir: string
            directory where data files are located
        cache_dir: string
            directory where cached files (e.g., saved parameters) are located
        force_run: bool
            whether to run the search even if cached parameters already exist
    """
    if 'debug' in data_fn:
        num_search = 3

    # check if the parameter search was already done; if so, skip it
    param_path = get_param_path(cache_dir, method, data_fn, prop_missing,
                                max_num_feature, feature_selection)
    if not force_run and os.path.isfile(param_path):
        warnings.warn(
            'Already did search for {}, skipping the search'.format(method))
        return

    x_unvec, y, idx_feat_dict, idx_class_dict, _, perm_indices = (
        get_preprocessed_data(data_dir, data_fn, prop_missing=prop_missing))
    num_feature = len(idx_feat_dict)
    num_class = len(idx_class_dict)

    params = {}
    for k_idx in range(k):
        params[k_idx] = run(
            method, x_unvec, y, idx_feat_dict, num_feature=num_feature,
            max_num_feature=max_num_feature, num_class=num_class,
            max_num_sample=max_num_sample,
            feature_selection=feature_selection, k_idx=k_idx, k=k,
            num_search=num_search, perm_indices=perm_indices)

    recursive_mkdir(cache_dir)  # was FLAGS.cache_dir; use the argument instead
    with open(param_path, 'wb') as f:  # save found parameters
        pickle.dump(params, f)

    print('Finished parameter search for method: {}'.format(method))
def run_kfold(data_fn, method='logit', prop_missing=0., max_num_feature=-1,
              feature_selection='random', k=10, which_half='both',
              data_dir='_data', cache_dir='_cache', out_dir='_out'):
    """Run several classification pipelines a la k-fold cross-validation.

    Arguments:
        data_fn: string
            data file filename
        method: string
            name of classification method; values = {'logit', 'random_forest',
            'linear_svm', 'poly_svm', 'rbf_svm', 'gbdt'}
        prop_missing: float
            proportion of feature observations which should be randomly
            masked; values in [0, 1)
        max_num_feature: int
            maximum number of features to use
        feature_selection: string
            feature selection method; values = {'random', 'frequency', 'chi2'}
        k: int
            number of partitions for k-fold cross-validation
        which_half: str
            which half of experiments to do; values = {'first', 'last', 'both'}
        data_dir: string
            directory where data files are located
        cache_dir: string
            directory where cached files (e.g., saved parameters) are located
        out_dir: string
            directory where outputs (e.g., results) should be saved
    """
    start = time.time()

    try:  # load saved parameters
        param_path = get_param_path(cache_dir, method, data_fn, prop_missing,
                                    max_num_feature, feature_selection)
        with open(param_path, 'rb') as f:
            params = pickle.load(f)
    except:
        warnings.warn('Cannot load parameters from: {}\n'.format(param_path) +
                      'Need to do parameter search; run parameter_search.py')
        raise

    # TODO(jisungkim) handle binary and multiclass separately, don't assume
    # multiclass!
    if method == 'logit':
        from sklearn.linear_model import LogisticRegression as ModelClass
        init_args = {'multi_class': 'multinomial', 'solver': 'lbfgs'}
    elif method == 'random_forest':
        from sklearn.ensemble import RandomForestClassifier as ModelClass
        init_args = {}
    elif method == 'linear_svm':
        from sklearn.svm import SVC as ModelClass
        # remark: due to a bug in scikit-learn / libsvm, the sparse 'linear'
        # kernel is much slower than the sparse 'poly' kernel, so we use
        # the 'poly' kernel with degree=1 over the 'linear' kernel
        init_args = {'kernel': 'poly', 'degree': 1, 'coef0': 0., 'gamma': 1.,
                     'probability': True, 'cache_size': 1000}
    elif method == 'poly_svm':
        from sklearn.svm import SVC as ModelClass
        init_args = {'kernel': 'poly', 'probability': True, 'cache_size': 1000}
    elif method == 'rbf_svm':
        from sklearn.svm import SVC as ModelClass
        init_args = {'kernel': 'rbf', 'probability': True, 'cache_size': 1000}
    elif method == 'gbdt':
        from xgboost import XGBClassifier as ModelClass
        init_args = {'objective': 'multi:softprob'}
    else:
        raise ValueError('unknown method: {}'.format(method))

    x_unvec, y, idx_feat_dict, idx_class_dict, _, perm_indices = (
        get_preprocessed_data(data_dir, data_fn, prop_missing=prop_missing))
    num_feature = len(idx_feat_dict)
    num_class = len(idx_class_dict)

    base_out_dir = get_base_out_dir(out_dir, method, data_fn, prop_missing,
                                    max_num_feature, feature_selection)
    recursive_mkdir(base_out_dir)

    # integer division so the bounds are valid range() arguments in Python 3
    if which_half == 'both':
        loop = range(0, k)
    elif which_half == 'first':
        loop = range(0, k // 2)
    elif which_half == 'last':
        loop = range(k // 2, k)
    else:
        raise ValueError('Unknown which_half: {}'.format(which_half))

    for k_idx in loop:
        sub_out_dir = '{}/k_idx={}'.format(base_out_dir, k_idx)
        recursive_mkdir(sub_out_dir)

        run(ModelClass, x_unvec, y, idx_feat_dict, num_feature=num_feature,
            max_num_feature=max_num_feature, num_class=num_class,
            feature_selection=feature_selection, k_idx=k_idx, k=k,
            params=params, perm_indices=perm_indices, init_args=init_args,
            full_out_dir=sub_out_dir)

    print('This k-fold {} multipipeline run script took {:.4f} seconds'.format(
        method, time.time() - start))
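# Hypothetical invocation of the classification `run_kfold` above (the data
# file name and option values are illustrative, not taken from the original
# project); it assumes parameter_search.py has already populated `_cache`:
if __name__ == '__main__':
    run_kfold('debug.txt', method='logit', prop_missing=0.0,
              max_num_feature=-1, feature_selection='random', k=10,
              which_half='both', data_dir='_data', cache_dir='_cache',
              out_dir='_out')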
def run_kfold(data_fn, prop_missing=0., max_num_feature=-1,
              feature_selection='random', k=10, which_half='both',
              data_dir='_data', cache_dir='_cache', out_dir='_out'):
    """Run several RIDDLE classification pipelines a la k-fold cross-validation.

    Arguments:
        data_fn: string
            data file filename
        prop_missing: float
            proportion of feature observations which should be randomly
            masked; values in [0, 1)
        max_num_feature: int
            maximum number of features to use
        feature_selection: string
            feature selection method; values = {'random', 'frequency', 'chi2'}
        k: int
            number of partitions for k-fold cross-validation
        which_half: str
            which half of experiments to do; values = {'first', 'last', 'both'}
        data_dir: string
            directory where data files are located
        cache_dir: string
            directory where cached files (e.g., saved parameters) are located
        out_dir: string
            outer directory where outputs (e.g., results) should be saved
    """
    start = time.time()

    base_out_dir = get_base_out_dir(out_dir, 'riddle', data_fn, prop_missing,
                                    max_num_feature, feature_selection)
    recursive_mkdir(base_out_dir)

    # get common data
    x_unvec, y, idx_feat_dict, idx_class_dict, icd9_descript_dict, perm_indices = (
        get_preprocessed_data(data_dir, data_fn, prop_missing=prop_missing))
    num_feature = len(idx_feat_dict)
    num_class = len(idx_class_dict)

    # print/save mappings of class and feature indices, sorted by index
    class_mapping = sorted(idx_class_dict.items(), key=lambda item: item[0])
    with open(base_out_dir + '/class_mapping.txt', 'w') as f:
        print(class_mapping, file=f)
    with open(base_out_dir + '/feature_mapping.txt', 'w') as f:
        for idx, feat in idx_feat_dict.items():
            f.write('{}\t{}\n'.format(idx, feat))

    try:  # load saved parameters
        param_path = get_param_path(cache_dir, 'riddle', data_fn, prop_missing,
                                    max_num_feature, feature_selection)
        with open(param_path, 'rb') as f:
            params = pickle.load(f)

        # rename keys written by older versions, for legacy compatibility
        new_params = {}
        for k_idx, param in params.items():
            if 'nb_hidden_layers' in param:
                param['num_hidden_layer'] = param.pop('nb_hidden_layers')
            if 'nb_hidden_nodes' in param:
                param['num_hidden_node'] = param.pop('nb_hidden_nodes')
            new_params[k_idx] = param
        params = new_params  # was `params = params`, which discarded the renames
    except:
        warnings.warn('Cannot load parameters from: {}\n'.format(param_path) +
                      'Need to do parameter search; run parameter_search.py')
        raise

    # integer division so the bounds are valid range() arguments in Python 3
    if which_half == 'both':
        loop = range(0, k)
    elif which_half == 'first':
        loop = range(0, k // 2)
    elif which_half == 'last':
        loop = range(k // 2, k)
    else:
        raise ValueError('Unknown which_half: {}'.format(which_half))

    for k_idx in loop:
        sub_out_dir = '{}/k_idx={}'.format(base_out_dir, k_idx)
        recursive_mkdir(sub_out_dir)

        run(x_unvec, y, idx_feat_dict, idx_class_dict, icd9_descript_dict,
            num_feature=num_feature, max_num_feature=max_num_feature,
            num_class=num_class, feature_selection=feature_selection,
            k_idx=k_idx, k=k, params=params, perm_indices=perm_indices,
            full_out_dir=sub_out_dir)

    print('This k-fold riddle multipipeline run script took {:.4f} seconds'
          .format(time.time() - start))