def main(args):
    try:
        method = args[1].lower()
    except IndexError:
        method = 'lrfc'
        eprint('Using default method = \'{}\''.format(method))

    try:
        data_fn = args[2]
    except IndexError:
        data_fn = 'dummy.txt'
        eprint('Using default data_fn = \'{}\''.format(data_fn))

    try:
        which_half = args[3].lower()
    except IndexError:
        which_half = 'both'
        eprint('Using default which_half = \'{}\''.format(which_half))

    try:
        prop_missing = float(args[4])
    except (IndexError, ValueError):
        prop_missing = 0.0
        eprint('Using default prop_missing = {}'.format(prop_missing))

    # the full dataset will not finish in time, so skip non-linear SVMs for it
    skip_nonlinear_svm = 'final-100.txt' in data_fn
    if skip_nonlinear_svm:
        eprint('Skipping SVMs with non-linear kernels')

    run(data_fn, method=method, which_half=which_half,
        prop_missing=prop_missing, skip_nonlinear_svm=skip_nonlinear_svm)
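# `eprint` is used throughout these scripts but never defined in this section.
# A minimal sketch, assuming it simply mirrors print() on stderr so warnings
# about default arguments stay out of piped stdout; the original may differ:
import sys


def eprint(*args, **kwargs):
    """Print to stderr (used for warnings, e.g. about default arguments)."""
    print(*args, file=sys.stderr, **kwargs)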
def get_base_data(data_path, prop_missing):
    icd9_descript_path = '{}/{}'.format(DATA_DIR, 'phewas_codes.txt')

    # load data
    print('Loading data...')
    start = time.time()

    # get common data
    icd9_descript_dict = emr.get_icd9_descript_dict(icd9_descript_path)
    X, y, idx_feat_dict, idx_class_dict = emr.get_data(
        path=data_path, icd9_descript_dict=icd9_descript_dict,
        prop_missing=prop_missing)
    nb_features = len(idx_feat_dict)
    nb_classes = len(idx_class_dict)
    nb_cases = len(X)
    print('Data loaded in {:.5f} s'.format(time.time() - start))
    print()

    # shuffle indices
    perm_indices = np.random.permutation(nb_cases)

    try:  # validate the shuffled indices against a saved reference, if present
        with open(data_path + '_perm_indices.pkl', 'rb') as f:
            exp_perm_indices = pickle.load(f)
        assert np.all(perm_indices == exp_perm_indices)
    except IOError:
        eprint('file not found: ' + data_path + '_perm_indices.pkl')
        eprint('not doing perm_indices check')

    return X, y, perm_indices, nb_features, nb_classes
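# get_base_data() validates its shuffle against '<data_path>_perm_indices.pkl'
# but nothing in this section writes that file. A hypothetical one-off that
# produces the reference file; the fixed seed is an assumption and must match
# whatever seeding the actual runs use for the check above to pass:
import pickle

import numpy as np


def save_reference_perm_indices(data_path, nb_cases, seed=0):
    """Write the reference permutation that get_base_data() checks against."""
    np.random.seed(seed)  # hypothetical seed shared with the training runs
    perm_indices = np.random.permutation(nb_cases)
    with open(data_path + '_perm_indices.pkl', 'wb') as f:
        pickle.dump(perm_indices, f)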
def main(args):
    try:
        method = args[1].lower()
    except IndexError:
        method = 'lrfc'
        eprint('Using default method = \'{}\''.format(method))

    try:
        data_fn = args[2]
    except IndexError:
        data_fn = 'dummy.txt'
        eprint('Using default data_fn = \'{}\''.format(data_fn))

    try:
        prop_missing = float(args[3])
    except (IndexError, ValueError):
        prop_missing = 0.0
        eprint('Using default prop_missing = {}'.format(prop_missing))

    # the full dataset will not finish in time, so skip non-linear SVMs for it
    skip_nonlinear_svm = 'final-100.txt' in data_fn

    run(data_fn, method=method, prop_missing=prop_missing,
        skip_nonlinear_svm=skip_nonlinear_svm)
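# Example invocation, mirroring the positional arguments parsed above (the
# script name is taken from the "Please run `parameter_search.py`" hint in
# the error messages below):
#
#   python parameter_search.py lrfc dummy.txt 0.0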
def run(data_fn, method='lrfc', which_half='both', prop_missing=0.0, k=10,
        skip_nonlinear_svm=False, nb_searches=20):
    data_path = '{}/{}'.format(DATA_DIR, data_fn)

    def get_results_dir(method, k_idx):
        base_folder = 'out/more/{}_{}_{}'.format(method, data_fn, prop_missing)
        folder = '{}/{}_idx_partition'.format(base_folder, k_idx)
        if not os.path.exists('out'):
            os.makedirs('out')
        if not os.path.exists('out/more'):
            # was os.makedirs('out/models'), which never created the
            # directory actually being checked
            os.makedirs('out/more')
        if not os.path.exists(base_folder):
            os.makedirs(base_folder)
        if not os.path.exists(folder):
            os.makedirs(folder)
        return folder

    try:  # load saved parameters
        get_param_fn = lambda x: '{}/{}_{}_{}_param.pkl'.format(
            CACHE_DIR, x, data_fn, prop_missing)
        if method == 'lrfc':
            with open(get_param_fn('logit'), 'rb') as f:
                logit_params = pickle.load(f)
            with open(get_param_fn('rfc'), 'rb') as f:
                rfc_params = pickle.load(f)
        elif method == 'svm':
            with open(get_param_fn('linear-svm'), 'rb') as f:
                linear_svm_params = pickle.load(f)
            if not skip_nonlinear_svm:
                with open(get_param_fn('poly-svm'), 'rb') as f:
                    poly_svm_params = pickle.load(f)
                with open(get_param_fn('rbf-svm'), 'rb') as f:
                    rbf_svm_params = pickle.load(f)
        else:
            raise ValueError('unknown method: {}'.format(method))
    except IOError:
        eprint('Need to do parameter search!')
        eprint('Please run `parameter_search.py` with the relevant '
               'command line arguments')
        raise

    X, y, perm_indices, nb_features, nb_classes = get_base_data(
        data_path, prop_missing)

    losses = {'logit': [], 'rfc': [], 'linear-svm': [], 'poly-svm': [],
              'rbf-svm': []}
    accs = {'logit': [], 'rfc': [], 'linear-svm': [], 'poly-svm': [],
            'rbf-svm': []}
    runtimes = {'logit': [], 'rfc': [], 'linear-svm': [], 'poly-svm': [],
                'rbf-svm': []}

    # integer division so range() receives ints
    if which_half == 'first':
        loop_seq = range(0, k // 2)
    elif which_half == 'last':
        loop_seq = range(k // 2, k)
    elif which_half == 'both':
        loop_seq = range(0, k)
    else:
        raise ValueError(
            '`which_half` must be \'first\', \'last\' or \'both\'')

    for k_idx in loop_seq:
        print('-' * 72)
        print('Partition k = {}'.format(k_idx))

        data_partition_dict = emr.get_k_fold_partition(
            X, y, k_idx=k_idx, k=k, perm_indices=perm_indices)
        X_train = data_partition_dict['X_train']
        y_train = data_partition_dict['y_train']
        X_val = data_partition_dict['X_val']
        y_val = data_partition_dict['y_val']
        X_test = data_partition_dict['X_test']
        y_test = data_partition_dict['y_test']

        selected_feat_indices = select_feats(X_train + X_val, y_train + y_val,
                                             nb_features=nb_features)

        X_train, y_train = preproc_for_sklearn(X_train, y_train, nb_features)
        X_test, y_test = preproc_for_sklearn(X_test, y_test, nb_features)

        old_nb_features = len(X_train[0])
        X_train = X_train[:, selected_feat_indices]
        X_test = X_test[:, selected_feat_indices]
        # do not overwrite nb_features here: later folds still need the full
        # feature count for select_feats() and preproc_for_sklearn()
        new_nb_features = len(X_train[0])
        print('Reduced features from {} to {}'.format(old_nb_features,
                                                      new_nb_features))

        if method == 'lrfc':
            from sklearn.linear_model import LogisticRegression
            from sklearn.ensemble import RandomForestClassifier

            # logistic regression
            start = time.time()
            logit = LogisticRegression(multi_class='multinomial',
                                       solver='lbfgs', **logit_params[k_idx])
            logit.fit(X_train, y_train)
            logit_acc = accuracy_score(y_test, logit.predict(X_test))
            logit_y_test_proba = logit.predict_proba(X_test)
            logit_loss = log_loss(y_test, logit_y_test_proba)
            logit_time = time.time() - start
            print('Logistic regression / loss: {:.3f} / accuracy: {:.3f} / '
                  'time: {:.3f} s'.format(logit_loss, logit_acc, logit_time))

            # random forest classifier
            start = time.time()
            rfc = RandomForestClassifier(**rfc_params[k_idx])
            rfc.fit(X_train, y_train)
            rfc_acc = accuracy_score(y_test, rfc.predict(X_test))
            rfc_y_test_proba = rfc.predict_proba(X_test)
            rfc_loss = log_loss(y_test, rfc_y_test_proba)
            rfc_time = time.time() - start
            print('Random forest / loss: {:.3f} / accuracy: {:.3f} / '
                  'time: {:.3f} s'.format(rfc_loss, rfc_acc, rfc_time))

            save_test_results(
                logit_y_test_proba, y_test,
                '{}/test_results.txt'.format(get_results_dir('logit', k_idx)))
            save_test_results(
                rfc_y_test_proba, y_test,
                '{}/test_results.txt'.format(get_results_dir('rfc', k_idx)))
            # joblib.dump(logit, get_results_dir('logit', k_idx) + '/clf.pkl')
            # joblib.dump(rfc, get_results_dir('rfc', k_idx) + '/clf.pkl')

            losses['logit'].append(logit_loss)
            accs['logit'].append(logit_acc)
            runtimes['logit'].append(logit_time)
            losses['rfc'].append(rfc_loss)
            accs['rfc'].append(rfc_acc)
            runtimes['rfc'].append(rfc_time)
        elif method == 'svm':
            from sklearn.svm import SVC

            # linear SVM
            start = time.time()
            linear_svm = SVC(kernel='linear', probability=True,
                             **linear_svm_params[k_idx])
            linear_svm.fit(X_train, y_train)
            linear_svm_acc = accuracy_score(y_test,
                                            linear_svm.predict(X_test))
            linear_svm_y_test_proba = linear_svm.predict_proba(X_test)
            linear_svm_loss = log_loss(y_test, linear_svm_y_test_proba)
            linear_svm_time = time.time() - start
            print('Linear SVM / accuracy: {:.3f} / loss: {:.3f} / '
                  'time: {:.3f} s'.format(linear_svm_acc, linear_svm_loss,
                                          linear_svm_time))

            save_test_results(
                linear_svm_y_test_proba, y_test,
                '{}/test_results.txt'.format(
                    get_results_dir('linear-svm', k_idx)))
            # joblib.dump(linear_svm,
            #             get_results_dir('linear-svm', k_idx) + '/clf.pkl')

            losses['linear-svm'].append(linear_svm_loss)
            accs['linear-svm'].append(linear_svm_acc)
            runtimes['linear-svm'].append(linear_svm_time)

            if skip_nonlinear_svm:
                continue  # skip the remaining (non-linear) kernels

            # polynomial SVM
            start = time.time()
            poly_svm = SVC(kernel='poly', probability=True,
                           **poly_svm_params[k_idx])
            poly_svm.fit(X_train, y_train)
            poly_svm_acc = accuracy_score(y_test, poly_svm.predict(X_test))
            poly_svm_y_test_proba = poly_svm.predict_proba(X_test)
            poly_svm_loss = log_loss(y_test, poly_svm_y_test_proba)
            poly_svm_time = time.time() - start
            print('Polynomial SVM / accuracy: {:.3f} / loss: {:.3f} / '
                  'time: {:.3f} s'.format(poly_svm_acc, poly_svm_loss,
                                          poly_svm_time))

            # RBF SVM
            start = time.time()
            rbf_svm = SVC(kernel='rbf', probability=True,
                          **rbf_svm_params[k_idx])
            rbf_svm.fit(X_train, y_train)
            rbf_svm_acc = accuracy_score(y_test, rbf_svm.predict(X_test))
            rbf_svm_y_test_proba = rbf_svm.predict_proba(X_test)
            rbf_svm_loss = log_loss(y_test, rbf_svm_y_test_proba)
            rbf_svm_time = time.time() - start
            print('RBF SVM / accuracy: {:.3f} / loss: {:.3f} / '
                  'time: {:.3f} s'.format(rbf_svm_acc, rbf_svm_loss,
                                          rbf_svm_time))

            save_test_results(
                poly_svm_y_test_proba, y_test,
                '{}/test_results.txt'.format(
                    get_results_dir('poly-svm', k_idx)))
            save_test_results(
                rbf_svm_y_test_proba, y_test,
                '{}/test_results.txt'.format(
                    get_results_dir('rbf-svm', k_idx)))
            # joblib.dump(poly_svm,
            #             get_results_dir('poly-svm', k_idx) + '/clf.pkl')
            # joblib.dump(rbf_svm,
            #             get_results_dir('rbf-svm', k_idx) + '/clf.pkl')

            losses['poly-svm'].append(poly_svm_loss)
            accs['poly-svm'].append(poly_svm_acc)
            runtimes['poly-svm'].append(poly_svm_time)
            losses['rbf-svm'].append(rbf_svm_loss)
            accs['rbf-svm'].append(rbf_svm_acc)
            runtimes['rbf-svm'].append(rbf_svm_time)
        else:
            raise ValueError('unknown method: {}'.format(method))

    print()
    print('#' * 72)
    if method == 'lrfc':
        print_metrics(losses['logit'], accs['logit'], runtimes['logit'],
                      'Logistic regression')
        print_metrics(losses['rfc'], accs['rfc'], runtimes['rfc'],
                      'Random forest')
    elif method == 'svm':
        print_metrics(losses['linear-svm'], accs['linear-svm'],
                      runtimes['linear-svm'], 'Linear SVM')
        if not skip_nonlinear_svm:
            print_metrics(losses['poly-svm'], accs['poly-svm'],
                          runtimes['poly-svm'], 'Polynomial SVM')
            print_metrics(losses['rbf-svm'], accs['rbf-svm'],
                          runtimes['rbf-svm'], 'RBF SVM')
    else:
        raise ValueError('unknown method: {}'.format(method))
    print('#' * 72)
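# select_feats(), preproc_for_sklearn() and save_test_results() are called
# above but not defined in this section. Minimal sketches, assuming X is a
# list of per-case feature-index lists and that selection ranks features by
# ANOVA F-score; the originals may encode or select features differently:
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif


def preproc_for_sklearn(X, y, nb_features):
    """Binary-encode sparse feature-index lists into a dense matrix."""
    X_mat = np.zeros((len(X), nb_features), dtype=np.float32)
    for row, feat_indices in enumerate(X):
        X_mat[row, feat_indices] = 1.0
    return X_mat, np.asarray(y)


def select_feats(X, y, nb_features, nb_selected=1000):
    """Return the column indices of the top-scoring features."""
    X_mat, y_arr = preproc_for_sklearn(X, y, nb_features)
    selector = SelectKBest(f_classif, k=min(nb_selected, nb_features))
    selector.fit(X_mat, y_arr)
    return selector.get_support(indices=True)


def save_test_results(y_test_proba, y_test, path):
    """Write one line per test case: true class, then class probabilities."""
    with open(path, 'w') as f:
        for true_class, proba_row in zip(y_test, y_test_proba):
            f.write('{}\t{}\n'.format(
                true_class, ','.join('{:.6f}'.format(p) for p in proba_row)))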
def main(args):
    k = 10  # ten partitions for k-fold cross-validation

    try:
        id_string = args[1]
    except IndexError:
        id_string = 'dummy'
        eprint('Using default id_string = \'{}\''.format(id_string))

    try:
        data_fn = args[2]
    except IndexError:
        data_fn = 'dummy.txt'
        eprint('Using default data_fn = \'{}\''.format(data_fn))

    try:
        interpret_model = args[3].lower()[0] == 't'  # 'true', 't', etc.
    except IndexError:
        interpret_model = True
        eprint('Using default interpret_model = {}'.format(interpret_model))

    try:
        prop_missing = float(args[4])
    except (IndexError, ValueError):
        prop_missing = 0.0
        eprint('Using default prop_missing = {}'.format(prop_missing))

    data_path = '{}/{}'.format(DATA_DIR, data_fn)
    icd9_descript_path = '{}/{}'.format(DATA_DIR, 'phewas_codes.txt')

    model_module = models.deep_mlp
    model_id = model_module.__name__.split('.')[2]
    data_name = ''.join(data_fn.split('.')[:-1])

    if not os.path.exists('out'):
        os.makedirs('out')
    if not os.path.exists('out/more'):
        os.makedirs('out/more')
    out_directory = 'out/more/{}_{}_{}'.format('riddle', data_fn,
                                               prop_missing)
    if not os.path.exists(out_directory):
        os.makedirs(out_directory)

    start = time.time()

    # get common data
    icd9_descript_dict = emr.get_icd9_descript_dict(icd9_descript_path)
    X, y, idx_feat_dict, idx_class_dict = emr.get_data(
        path=data_path, icd9_descript_dict=icd9_descript_dict,
        prop_missing=prop_missing)

    # print/save value-sorted dictionary of classes and features
    class_mapping = sorted(idx_class_dict.items(), key=lambda pair: pair[0])
    print('Class mapping:')
    print(class_mapping)
    print()
    with open(out_directory + '/class_mapping.txt', 'w+') as f:
        print(class_mapping, file=f)
    with open(out_directory + '/feature_mapping.txt', 'w+') as f:
        for idx, feat in idx_feat_dict.items():
            f.write('{}\t{}\n'.format(idx, feat))

    nb_features = len(idx_feat_dict)
    nb_classes = len(idx_class_dict)
    nb_cases = len(X)
    print('Data loaded in {:.5f} seconds'.format(time.time() - start))

    # shuffle indices and save them
    perm_indices = np.random.permutation(nb_cases)
    pickle_object(perm_indices, out_directory + '/perm_indices.pkl')

    try:  # validate the shuffled indices against a saved reference, if present
        with open(data_path + '_perm_indices.pkl', 'rb') as f:
            exp_perm_indices = pickle.load(f)
        assert np.all(perm_indices == exp_perm_indices)
    except IOError:
        eprint('file not found: ' + data_path + '_perm_indices.pkl')
        eprint('not doing perm_indices check')

    # load saved model parameters
    model_params_fn = '{}/{}_{}_{}_param.pkl'.format(CACHE_DIR, 'riddle',
                                                     data_fn, prop_missing)
    try:
        with open(model_params_fn, 'rb') as f:
            model_params = pickle.load(f)
    except IOError:
        eprint('Need to do parameter search!')
        eprint('Please run `parameter_search.py` with the relevant '
               'command line arguments')
        raise

    # run pipeline and get metric results
    ((losses, accs, runtimes),
     (list_contrib_sums_D, list_contrib_sums_D2, list_contrib_sums),
     pairs) = kfold_run_pipeline(
        model_module, model_params, X, y, nb_features=nb_features,
        nb_classes=nb_classes, k=k, perm_indices=perm_indices,
        interpret_model=interpret_model, out_directory=out_directory,
        id_string=id_string)

    if interpret_model:
        total_contrib_sums_D = compute_total_sums(list_contrib_sums_D)
        total_contrib_sums_D2 = compute_total_sums(list_contrib_sums_D2)
        total_contrib_sums = compute_total_sums(list_contrib_sums)
        nb_pairs = len(pairs)

        # get descriptions of feature importance
        feat_importance_summary = \
            feature_importance.summarize_feature_importance(
                total_contrib_sums_D, total_contrib_sums_D2,
                idx_feat_dict=idx_feat_dict, idx_class_dict=idx_class_dict,
                icd9_descript_dict=icd9_descript_dict, pairs=pairs,
                nb_cases=nb_cases)

        # get frequencies of features per class
        feat_class_freq_table = frequency.get_frequency_table(
            X, y, idx_feat_dict=idx_feat_dict, idx_class_dict=idx_class_dict)

        # get orderings
        ordering_summary = ordering.summarize_orderings(
            total_contrib_sums, feat_class_freq_table,
            idx_feat_dict=idx_feat_dict, idx_class_dict=idx_class_dict,
            icd9_descript_dict=icd9_descript_dict, nb_pairs=nb_pairs)
        ordering_summary.save_individual_tables(idx_class_dict, out_directory)
        ordering_summary.save(out_directory)

    # print metrics in a pretty fashion
    print_metrics(losses, accs, runtimes, id_string=id_string)

    print('This k-fold multipipeline run script took {:.4f} seconds'.format(
        time.time() - start))
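# `pickle_object` is used above to save the permutation indices but is not
# defined in this section; presumably a thin wrapper along these lines:
import pickle


def pickle_object(obj, path):
    """Serialize `obj` to `path` using pickle."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)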
def run(data_fn, method='lrfc', prop_missing=0.0, k=10,
        skip_nonlinear_svm=False, nb_searches=20, max_nb_samples=10000):
    if 'dummy' in data_fn or 'debug' in data_fn:
        nb_searches = 3

    data_path = '{}/{}'.format(DATA_DIR, data_fn)

    if not FORCE_RUN:  # if the parameter search was already done, skip it
        did = lambda x: already_done(x, data_fn, prop_missing)  # helper
        if method == 'riddle' and did(['riddle']):
            eprint('Already did parameter search for riddle')
            return
        elif method == 'lrfc' and did(['logit', 'rfc']):
            eprint('Already did parameter search for lrfc')
            return
        elif method == 'svm' and did(['linear-svm', 'poly-svm', 'rbf-svm']):
            eprint('Already did parameter search for svm')
            return

    params = {'riddle': {}, 'logit': {}, 'rfc': {}, 'linear-svm': {},
              'poly-svm': {}, 'rbf-svm': {}}

    X, y, perm_indices, nb_features, nb_classes = get_base_data(
        data_path, prop_missing)

    for k_idx in range(0, k):
        print('-' * 72)
        print('Partition k = {}'.format(k_idx))

        data_partition_dict = emr.get_k_fold_partition(
            X, y, k_idx=k_idx, k=k, perm_indices=perm_indices)
        X_train = data_partition_dict['X_train']
        y_train = data_partition_dict['y_train']
        X_val = data_partition_dict['X_val']
        y_val = data_partition_dict['y_val']

        # cap the number of validation samples
        if max_nb_samples is not None and len(X_val) > max_nb_samples:
            X_val = X_val[0:max_nb_samples]
            y_val = y_val[0:max_nb_samples]

        if method != 'riddle':
            selected_feat_indices = select_feats(
                X_train + X_val, y_train + y_val, nb_features=nb_features)
            X_val, y_val = preproc_for_sklearn(X_val, y_val,
                                               nb_features=nb_features)
            X_val = X_val[:, selected_feat_indices]

        if method == 'riddle':
            start = time.time()
            model_module = models.deep_mlp
            riddle_param_dist = {
                'learning_rate': UniformLogSpace(10, lo=-6, hi=-1)}
            params['riddle'][k_idx] = parameter_tuning.random_search(
                model_module, riddle_param_dist, X_val, y_val,
                nb_features=nb_features, nb_classes=nb_classes, k=3,
                process_X_data_func_args={'nb_features': nb_features},
                process_y_data_func_args={'nb_classes': nb_classes},
                nb_searches=nb_searches)
            print('Best parameters for RIDDLE: {} found in {:.3f} s'.format(
                params['riddle'][k_idx], time.time() - start))
        elif method == 'lrfc':
            # logistic regression
            start = time.time()
            logit_param_dist = {'C': UniformLogSpace()}
            logit_estimator = LogisticRegression(multi_class='multinomial',
                                                 solver='lbfgs')
            params['logit'][k_idx] = parameter_search(
                X_val, y_val, estimator=logit_estimator,
                search=RandomizedSearchCV, dist_or_grid=logit_param_dist,
                n_iter=nb_searches, scoring=loss_scorer)
            print('Best parameters for logistic regression: {} found in '
                  '{:.3f} s'.format(params['logit'][k_idx],
                                    time.time() - start))

            # random forest classifier
            start = time.time()
            rfc_param_dist = {'max_features': ['sqrt', 'log2'],
                              'max_depth': UniformLogSpace(base=2, lo=2,
                                                           hi=9)}
            rfc_estimator = RandomForestClassifier()
            params['rfc'][k_idx] = parameter_search(
                X_val, y_val, estimator=rfc_estimator,
                search=RandomizedSearchCV, dist_or_grid=rfc_param_dist,
                n_iter=nb_searches, scoring=loss_scorer)
            print('Best parameters for random forest: {} found in {:.3f} s'
                  .format(params['rfc'][k_idx], time.time() - start))
        elif method == 'svm':
            # linear SVM
            start = time.time()
            linear_svm_param_dist = {'C': UniformLogSpace()}
            linear_svm_estimator = SVC(kernel='linear', probability=True)
            params['linear-svm'][k_idx] = parameter_search(
                X_val, y_val, estimator=linear_svm_estimator,
                search=RandomizedSearchCV,
                dist_or_grid=linear_svm_param_dist, n_iter=nb_searches,
                scoring=loss_scorer)
            print('Best parameters for linear SVM: {} found in {:.3f} s'
                  .format(params['linear-svm'][k_idx], time.time() - start))

            if skip_nonlinear_svm:
                continue  # skip the remaining (non-linear) kernels

            nonlinear_svm_param_dist = {
                'C': UniformLogSpace(),
                'gamma': UniformLogSpace(base=10, lo=-5, hi=1)}

            # polynomial SVM
            start = time.time()
            poly_svm_estimator = SVC(kernel='poly', probability=True)
            params['poly-svm'][k_idx] = parameter_search(
                X_val, y_val, estimator=poly_svm_estimator,
                search=RandomizedSearchCV,
                dist_or_grid=nonlinear_svm_param_dist, n_iter=nb_searches,
                scoring=loss_scorer)
            print('Best parameters for polynomial SVM: {} found in {:.3f} s'
                  .format(params['poly-svm'][k_idx], time.time() - start))

            # RBF SVM
            start = time.time()
            rbf_svm_estimator = SVC(kernel='rbf', probability=True)
            params['rbf-svm'][k_idx] = parameter_search(
                X_val, y_val, estimator=rbf_svm_estimator,
                search=RandomizedSearchCV,
                dist_or_grid=nonlinear_svm_param_dist, n_iter=nb_searches,
                scoring=loss_scorer)
            print('Best parameters for RBF SVM: {} found in {:.3f} s'.format(
                params['rbf-svm'][k_idx], time.time() - start))
        else:
            raise ValueError('unknown method: {}'.format(method))

    # save the best parameters found for each method
    for method_name, sub_param_dict in params.items():
        if len(sub_param_dict) > 0:
            pickle_object(sub_param_dict,
                          '{}/{}_{}_{}_param.pkl'.format(
                              CACHE_DIR, method_name, data_fn, prop_missing))

    print('Finished parameter search for method: {}'.format(method))
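# `already_done`, `UniformLogSpace`, `parameter_search` and `loss_scorer` are
# referenced above but not defined in this section. Sketches under stated
# assumptions: RandomizedSearchCV only requires an `rvs` method of a
# parameter distribution, and the scorer negates log loss so the search
# minimizes it; the originals may differ:
import os

from sklearn.metrics import make_scorer, log_loss
from sklearn.utils import check_random_state


def already_done(method_names, data_fn, prop_missing):
    """True if a cached parameter pickle exists for every listed method."""
    return all(
        os.path.isfile('{}/{}_{}_{}_param.pkl'.format(
            CACHE_DIR, m, data_fn, prop_missing))
        for m in method_names)


class UniformLogSpace(object):
    """Sample base ** u with u drawn uniformly from [lo, hi)."""

    def __init__(self, base=10, lo=-3, hi=3):
        self.base, self.lo, self.hi = base, lo, hi

    def rvs(self, random_state=None):
        rng = check_random_state(random_state)
        return self.base ** rng.uniform(self.lo, self.hi)


def parameter_search(X, y, estimator, search, dist_or_grid, n_iter, scoring):
    """Run a randomized hyperparameter search; return the best parameters."""
    searcher = search(estimator, dist_or_grid, n_iter=n_iter, scoring=scoring)
    searcher.fit(X, y)
    return searcher.best_params_


# greater_is_better=False flips the sign so the search minimizes log loss
loss_scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True)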