def predict(self, neighbor_idxs, Survival_train, Censored_train,
            Survival_test=None, Censored_test=None,
            K=15, Method='non-cumulative'):
    """
    Predict testing set using 'prototype' (i.e. training) set using KNN.

    Args:
    ------
    neighbor_idxs - indices of nearest neighbors; (N_test, N_train)
    Survival_train - training sample time-to-event; (N,) np array
    Censored_train - training sample censorship status; (N,) np array
    Survival_test, Censored_test - optional testing outcomes. They are only
        used to compute the c-index; if Survival_test is given then
        Censored_test must be given as well.
    K - number of nearest-neighbours to use, int. If fewer than K neighbor
        columns are available, all available columns are used.
    Method - 'cumulative' vs 'non-cumulative' probability

    Returns:
    --------
    T_test - predicted value for each testing sample; (N_test,) np array
    CI - value returned by sUtils.c_index (prediction_type='survival_time'),
         or 0 when Survival_test is None

    Raises:
    -------
    ValueError - if Method is neither 'cumulative' nor 'non-cumulative'
    """
    # Keep only desired K
    neighbor_idxs = neighbor_idxs[:, 0:K]

    # Slicing silently keeps fewer than K columns when fewer neighbors are
    # available, so count what was actually kept instead of trusting K.
    n_neighbors = neighbor_idxs.shape[1]

    # Initialize
    N_test = neighbor_idxs.shape[0]
    T_test = np.zeros([N_test])

    if Method == 'non-cumulative':

        # Convert outcomes to "alive status" at each time point
        alive_train = sUtils.getAliveStatus(Survival_train, Censored_train)

        # Get survival prediction for each patient
        for idx in range(N_test):

            # Fancy indexing returns a copy, so the in-place edit below
            # does not touch alive_train
            status = alive_train[neighbor_idxs[idx, :], :]
            totalKnown = np.sum(status >= 0, axis=0)
            status[status < 0] = 0

            # remove timepoints where there are no known statuses
            status = status[:, totalKnown != 0]
            totalKnown = totalKnown[totalKnown != 0]

            # get "average" predicted survival time
            status = np.sum(status, axis=0) / totalKnown

            # now get overall time prediction
            T_test[idx] = np.sum(status)

    elif Method == 'cumulative':

        for idx in range(N_test):

            # Get at-risk groups for each time point for nearest neighbors
            T = Survival_train[neighbor_idxs[idx, :]]
            O = 1 - Censored_train[neighbor_idxs[idx, :]]
            T, O, at_risk, _ = sUtils.calc_at_risk(T, O)

            # NOTE(review): presumably at_risk counts the neighbors that
            # are no longer at risk at each time point - confirm against
            # sUtils.calc_at_risk. It is subtracted from the number of
            # neighbors actually used, which equals K whenever at least K
            # neighbors are available.
            N_at_risk = n_neighbors - at_risk

            # Calculate cumulative probability of survival
            P = np.cumprod((N_at_risk - O) / N_at_risk)

            # now get overall time prediction
            T_test[idx] = np.sum(P)

    else:
        raise ValueError(
            "Method is either 'cumulative' or 'non-cumulative'.")

    # Get c-index
    #======================================================================
    CI = 0
    if Survival_test is not None:
        assert (Censored_test is not None)
        CI = sUtils.c_index(T_test, Survival_test, Censored_test,
                            prediction_type='survival_time')

    return T_test, CI
def post_nca_bagging(self, X_test, X_train, Survival_train, Censored_train,
                     Survival_test=None, Censored_test=None,
                     min_n_feats=10, n_subspaces=20, K=30,
                     Method="cumulative-time", norm=2):
    """
    Get accuracy using bagged subspaces KNN approach following NCA and
    sorting features by absolute weight.

    Each subspace consists of the first `m` columns, where `m` is drawn
    without replacement from min_n_feats .. (n_features - 1). The
    per-subspace predictions are aggregated with the median.

    Args:
    ------
    X_test, X_train - training and testing set
        IMPORTANT: Must be NCA-transformed first and columns sorted by
        absolute feature weight
    Survival_train, Censored_train - training outcomes, passed to
        self.predict
    Survival_test, Censored_test - optional testing outcomes, only used to
        compute the c-index (both must be given together)
    min_n_feats - minimum no of features to use; lowered to
        n_features - 1 if it is larger than that
    n_subspaces - no of subspaces to use; capped at the number of distinct
        subspaces available
    K, Method - passed to self.predict
    norm - passed to self._get_neighbor_idxs

    Returns:
    --------
    t_test - median prediction over subspaces; (N_test,) np array
    Ci - value returned by sUtils.c_index, or 0 when Survival_test is None

    Raises:
    -------
    ValueError - if n_subspaces is smaller than 1
    """
    if Method == "cumulative-hazard":
        prediction_type = "risk"
    else:
        prediction_type = "survival_time"

    # sanity checks
    if n_subspaces < 1:
        raise ValueError("n_subspaces must be at least 1.")

    n_feats = X_test.shape[1]

    # Candidate sizes stop at n_feats - 1, so min_n_feats must not exceed
    # that or there would be no candidate subspace at all
    min_n_feats = min(min_n_feats, n_feats - 1)

    # Draw the subspace sizes. The number of subspaces is n_subspaces
    # itself (capped by the number of candidates); it is unrelated to
    # min_n_feats.
    maxidxs = np.arange(min_n_feats, n_feats)
    np.random.shuffle(maxidxs)
    maxidxs = maxidxs[0:n_subspaces]

    # initialize
    preds = np.zeros([X_test.shape[0], len(maxidxs)])

    for subspace, fidx_max in enumerate(maxidxs):

        # Get neighbor indices
        neighbor_idxs = self._get_neighbor_idxs(
            X_test[:, 0:fidx_max], X_train[:, 0:fidx_max], norm=norm)

        # Predict testing set
        t_test, _ = self.predict(neighbor_idxs,
                                 Survival_train, Censored_train,
                                 K=K, Method=Method)
        preds[:, subspace] = t_test

    # Aggregate prediction
    t_test = np.median(preds, axis=1)

    # Get Ci if survival data available
    Ci = 0
    if Survival_test is not None:
        assert (Censored_test is not None)
        Ci = sUtils.c_index(t_test, Survival_test, Censored_test,
                            prediction_type=prediction_type)

    return t_test, Ci
def predict(self, neighbor_idxs,
            Survival_train, Censored_train,
            Survival_test = None, Censored_test = None,
            K = 30, Method = "cumulative-time"):
    """
    KNN prediction of the testing set from its nearest 'prototype'
    (i.e. training) samples.

    Args:
    ------
    neighbor_idxs - indices of nearest neighbors; (N_test, N_train)
    Survival_train - training sample time-to-event; (N,) np array
    Censored_train - training sample censorship status; (N,) np array
    Survival_test, Censored_test - optional testing outcomes, only used
        to compute the c-index (both must be given together)
    K - number of nearest-neighbours to use, int
    Method - 'non-cumulative', 'cumulative-time' or 'cumulative-hazard'

    Returns:
    --------
    (predictions, Ci) - predictions is an (N_test,) np array; Ci is the
        value returned by sUtils.c_index, or 0 when Survival_test is None

    Raises:
    -------
    ValueError - if Method is not one of the supported values
    """
    # Only the K closest prototypes take part
    nearest = neighbor_idxs[:, 0:K]
    n_samples = nearest.shape[0]
    predicted = np.zeros([n_samples])

    if Method == 'non-cumulative':

        # "alive status" of every training sample at each time point
        alive_train = sUtils.getAliveStatus(Survival_train, Censored_train)

        for row in range(n_samples):
            votes = alive_train[nearest[row, :], :]

            # negative entries mark an unknown status: count the known
            # ones per time point, then zero out the unknown ones
            n_known = np.sum(votes >= 0, axis = 0)
            votes[votes < 0] = 0

            # skip time points at which no neighbor has a known status
            # (i.e. after last neighbor dies or gets censored)
            has_known = n_known != 0
            votes = votes[:, has_known]
            n_known = n_known[has_known]

            # mean alive status per time point, summed over time
            predicted[row] = np.sum(np.sum(votes, axis = 0) / n_known)

    elif Method in ['cumulative-time', 'cumulative-hazard']:

        want_hazard = Method == "cumulative-hazard"

        for row in range(n_samples):

            times = Survival_train[nearest[row, :]]
            censored = Censored_train[nearest[row, :]]

            # Every neighbor is censored: no curve is estimated
            if censored.min() == 1:
                predicted[row] = 0 if want_hazard else times.max()
                continue

            if want_hazard:
                # NA estimator -> area under the cum. hazard curve
                t, f = self._na_estimator(times, censored)
            else:
                # KM estimator -> mean survival time
                t, f = self._km_estimator(times, censored)

            predicted[row] = np.sum(np.diff(t) * f[0:-1])

    else:
        raise ValueError("Method not implemented.")

    # Without testing outcomes there is nothing to score
    if Survival_test is None:
        return predicted, 0

    assert (Censored_test is not None)
    if Method == "cumulative-hazard":
        score_kind = "risk"
    else:
        score_kind = "survival_time"
    c_idx = sUtils.c_index(predicted, Survival_test, Censored_test,
                           prediction_type = score_kind)

    return predicted, c_idx
def predict_with_bagging(self, X_test, X_train,
                         Survival_train, Censored_train,
                         Survival_test=None, Censored_test=None,
                         n_bags=50, feats_per_bag=None,
                         K=30, Method="cumulative-time", norm=2):
    """
    Predict survival with random subspace bagging.

    A random subset of feature columns is drawn for each bag, neighbors
    are found and predictions made within that subspace, and the per-bag
    predictions are aggregated with the median.

    Args:
    ------
    X_test, X_train - testing and training features (samples x features)
    Survival_train, Censored_train - training outcomes, passed to
        self.predict
    Survival_test, Censored_test - optional testing outcomes, only used to
        compute the c-index (both must be given together)
    n_bags - no of random subspaces to use
    feats_per_bag - no of features per subspace, int. When None or larger
        than the number of features, 75% of the features are used (at
        least one).
    K, Method - passed to self.predict
    norm - passed to self._get_neighbor_idxs

    Returns:
    --------
    t_test - median prediction over bags; (N_test,) np array
    Ci - value returned by sUtils.c_index, or 0 when Survival_test is None
    """
    if Method == "cumulative-hazard":
        prediction_type = "risk"
    else:
        prediction_type = "survival_time"

    #
    # sanity checks and defaults
    #

    n_feats = X_test.shape[1]

    if feats_per_bag is not None:
        # Real type check rather than matching "int" in the type's name;
        # bool is excluded because it was not accepted before either
        assert (isinstance(feats_per_bag, (int, np.integer))
                and not isinstance(feats_per_bag, bool))

    if feats_per_bag is None or feats_per_bag > n_feats:
        # Never fall back to an empty subspace: int(0.75 * 1) == 0
        feats_per_bag = max(1, int(0.75 * n_feats))

    #
    # initialize
    #

    preds = np.zeros([X_test.shape[0], n_bags])

    # np.random.shuffle works in place, so every bag needs its own copy
    # of the shuffled indices; otherwise all bags would alias one array.
    # All subspaces are drawn up front, before any prediction is made.
    idxs = np.arange(X_train.shape[1])
    idx_shuffles = []
    for _ in range(n_bags):
        np.random.shuffle(idxs)
        idx_shuffles.append(idxs[0:feats_per_bag].copy())

    #
    # predict using random subspaces
    #

    for bag, bag_idxs in enumerate(idx_shuffles):

        # Get neighbor indices
        neighbor_idxs = self._get_neighbor_idxs(
            X_test[:, bag_idxs], X_train[:, bag_idxs], norm=norm)

        # Predict testing set
        t_test, _ = self.predict(neighbor_idxs,
                                 Survival_train, Censored_train,
                                 K=K, Method=Method)
        preds[:, bag] = t_test

    # Aggregate prediction
    t_test = np.median(preds, axis=1)

    # Get Ci if survival data available
    Ci = 0
    if Survival_test is not None:
        assert (Censored_test is not None)
        Ci = sUtils.c_index(t_test, Survival_test, Censored_test,
                            prediction_type=prediction_type)

    return t_test, Ci
foldidx_val = [ 'fold_{}_'.format(fold + 1) in j for j in val_files ].index(True) foldidx_test = [ 'fold_{}_'.format(fold + 1) in j for j in test_files ].index(True) preds_val = read_table(pred_path + val_files[foldidx_val], sep=' ') preds_test = read_table(pred_path + test_files[foldidx_test], sep=' ') # Get validation set accuracy ci_val = [] for hyperpars in range(preds_val.shape[1]): ci_val.append( sUtils.c_index(preds_val.values[:, hyperpars], Survival[splitIdxs['valid'][fold]], Censored[splitIdxs['valid'][fold]], prediction_type='risk')) # Get testing set accuracy for optimal hyperparams ci_test.append( sUtils.c_index(preds_test.values[:, np.argmax(ci_val)], Survival[splitIdxs['test'][fold]], Censored[splitIdxs['test'][fold]], prediction_type='risk')) # append summary stats ci_test.extend([np.median(ci_test), np.mean(ci_test), \ np.percentile(ci_test, 25), np.percentile(ci_test, 75), \ np.std(ci_test)]) # append to final results table