def fit_cv(X, y, nepochs=100, batch_size=10, val_pct=0.1, verbose=False,
           lr=3e-4, weight_decay=0.01, model_type='nonlinear', nfolds=5):
    '''Fits one neural model per cross-validation fold, training each model on
    the samples outside its fold.'''
    folds = create_folds(X, nfolds)
    models = []
    for fold_idx, fold in enumerate(folds):
        print('Fitting model {}'.format(fold_idx))
        # Train on everything except the current fold
        mask = np.ones(X.shape[0], dtype=bool)
        mask[fold] = False
        models.append(fit_nn(X[mask], y[mask],
                             nepochs=nepochs, batch_size=batch_size,
                             val_pct=val_pct, verbose=verbose, lr=lr,
                             weight_decay=weight_decay, model_type=model_type))
    return NeuralCvModel(models, folds)
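# A minimal usage sketch for fit_cv on synthetic data. The data here is
# hypothetical, and it assumes numpy is imported as np and that the repo's
# NeuralCvModel exposes a predict method (as the CvModel wrappers used
# elsewhere in this codebase do).
def _demo_fit_cv():
    rng = np.random.RandomState(0)
    X = rng.normal(size=(200, 10))              # hypothetical design matrix
    y = 2.0 * X[:, 0] + rng.normal(size=200)    # hypothetical response
    cv_model = fit_cv(X, y, nepochs=20, batch_size=32)
    preds = cv_model.predict(X)                 # per-fold predictions
    print('MSE: {:.4f}'.format(((y - preds)**2).mean()))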
def load_plx4720(verbose=False, feature_type="mutation"):
    drug_target = "PLX4720"
    if verbose:
        print("Loading data")
    X_drugs, y_drugs, drugs, cells, features = load_ccle(
        feature_type=feature_type, drug_target=drug_target, normalize=True)
    drug_idx = drugs.get_loc(drug_target)
    if verbose:
        print("Drug {}".format(drugs[drug_idx]))
    X_drug, y_drug = X_drugs[drug_idx], y_drugs[drug_idx]

    ######## Specific to PLX4720. Filters out all features with Pearson ########
    ######## correlation less than 0.1 in magnitude, but always keeps   ########
    ######## the features expected from the paper.                      ########
    ccle_expected_features = ["C11orf85", "FXYD4", "SLC28A2", "MAML3_MUT",
                              "RAD51L1_MUT", "GAPDHS", "BRAF_MUT"]
    if verbose:
        print("Filtering by correlation with signal first")
    ccle_selected, corrs = ccle_feature_filter(X_drug, y_drug)
    for plx4720_feat in [f for f in ccle_expected_features if f in features]:
        idx = features.get_loc(plx4720_feat)
        ccle_selected[idx] = True
        if verbose:
            print("Correlation for {}: {:.4f}".format(plx4720_feat, corrs[idx]))
    ccle_features = features[ccle_selected]

    # Split each drug's data into 10 folds
    nfolds = 10
    drug_folds = [create_folds(x_i, nfolds) for x_i in X_drugs]
    folds = drug_folds[drug_idx]

    # Load the cached model, or fit and cache it
    MODEL_PATH = "data/model.pt"
    if os.path.exists(MODEL_PATH):
        elastic_model = torch.load(MODEL_PATH)
    else:
        elastic_model = CvModel(
            fit_cv(X_drug, y_drug, folds, fit_elastic_net_ccle, selected=ccle_selected),
            folds, "Elastic Net", selected=ccle_selected)
        torch.save(elastic_model, MODEL_PATH)

    # Plot the fit to show it's reasonably good
    # plot_ccle_predictions(elastic_model, X_drug, y_drug)

    # Show the features selected by the heuristic when fit this way (these may
    # differ slightly from the paper)
    if verbose:
        print_top_features(elastic_model, ccle_expected_features)

    return X_drug, y_drug, features, ccle_features, elastic_model
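# Usage sketch for the PLX4720 loader. This assumes the CCLE data files
# expected by load_ccle are present on disk; the fitted elastic net is cached
# at data/model.pt.
def _demo_load_plx4720():
    X_drug, y_drug, features, ccle_features, elastic_model = load_plx4720(verbose=True)
    print('{} cell lines, {} features ({} selected)'.format(
        X_drug.shape[0], len(features), len(ccle_features)))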
def load_or_fit_model(descriptor, X, Y):
    '''Loads a cached forest model for the given descriptor, or fits and
    caches one.'''
    nfolds = 10
    # Keep only the samples with a rating for this descriptor
    y = Y[descriptor]
    y = y[y.notnull()]
    x = X.loc[y.index].values
    y = y.values
    model_path = 'data/{}.pt'.format(descriptor)
    if os.path.exists(model_path):
        forest_model = joblib.load(model_path)
    else:
        print('Fitting {}'.format(descriptor))
        folds = create_folds(x, nfolds)
        if descriptor == 'Intensity':
            forest_model = CvModel(fit_cv(x, y, folds, fit_extratrees), folds, 'ExtraTrees')
        else:
            forest_model = CvModel(fit_cv(x, y, folds, fit_forest), folds, 'RandomForest')
        plot_predictions(forest_model, x, y, descriptor)
        joblib.dump(forest_model, model_path)
    return x, y, forest_model
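# Usage sketch for load_or_fit_model with hypothetical inputs: X is a pandas
# DataFrame of molecular features and Y a DataFrame of perception ratings with
# one column per descriptor, sharing X's index. Models are cached under data/.
def _demo_load_or_fit_model(X, Y):
    x, y, forest_model = load_or_fit_model('Intensity', X, Y)
    print('Fit on {} samples with {} features'.format(x.shape[0], x.shape[1]))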
def calibrate_discrete(X, feature, X_test=None, nquantiles=101, nbootstraps=100,
                       nfolds=5, tv_threshold=0.005, p_threshold=0.0, use_cv=False):
    """Calibrates a bootstrap confidence interval conditional model for a given
    discrete feature."""
    classes = np.unique(X[:, feature])
    nclasses = len(classes)

    # Search over a linear grid of quantiles
    quantile_range = np.linspace(0, 100, nquantiles)
    jmask = np.ones(X.shape[1], dtype=bool)
    jmask[feature] = False
    if X_test is None and use_cv:
        # Use k-fold cross-validation to generate conditional probability estimates for X_j
        print("Fitting using {} bootstrap resamples and {} folds".format(nbootstraps, nfolds))
        probs = np.zeros((nquantiles, X.shape[0], nclasses))
        proposals = []
        folds = create_folds(X, nfolds)
        for fold_idx, fold in enumerate(folds):
            print(fold_idx)
            imask = np.ones(X.shape[0], dtype=bool)
            imask[fold] = False
            model = DiscreteBootstrapConditionalModel(X[imask][:, jmask],
                                                      X[imask][:, feature],
                                                      fit_classifier,
                                                      nbootstraps=nbootstraps)
            # Index by class position rather than class label, so that
            # non-contiguous labels are handled correctly
            for cidx, c in enumerate(classes):
                probs[:, fold, cidx] = model.pmf_quantiles(X[fold][:, jmask], c,
                                                           quantile_range, axis=0)
            proposals.append(model)
        sampler = CrossValidationSampler(X[:, jmask], proposals, folds)
        outcomes = np.array([(X[:, feature] == c).mean() for c in classes])
    else:
        if X_test is None:
            print("Using training set as testing set.")
            X_test = X
        # Use a held-out test set
        print("Fitting using {} bootstrap resamples and a {}/{} train/test split".format(
            nbootstraps, X.shape[0], X_test.shape[0]))
        model = DiscreteBootstrapConditionalModel(X[:, jmask], X[:, feature],
                                                  fit_classifier, nbootstraps=nbootstraps)
        probs = np.zeros((nquantiles, X_test.shape[0], nclasses))
        for cidx, c in enumerate(classes):
            probs[:, :, cidx] = model.pmf_quantiles(X_test[:, jmask], c,
                                                    quantile_range, axis=0)
        sampler = HoldoutSampler(X_test[:, jmask], model)
        outcomes = np.array([(X_test[:, feature] == c).mean() for c in classes])

    # Find the lower quantile that forms a sufficient lower bound on the observed probabilities
    for i in range(1, nquantiles // 2):
        lower = quantile_range[nquantiles // 2 - i]
        qlower = probs[nquantiles // 2 - i]
        tv_lower = (qlower.mean(axis=0) - outcomes).clip(0, np.inf).sum()
        tv_pvalue = tv_test(tv_lower, outcomes, probs.shape[1])
        # Allow some error tolerance due to noise/finite data
        if tv_lower <= tv_threshold or tv_pvalue <= p_threshold:
            break

    # Find the upper quantile
    for i in range(1, nquantiles // 2):
        upper = quantile_range[nquantiles // 2 + i]
        qupper = probs[nquantiles // 2 + i]
        tv_upper = (outcomes - qupper.mean(axis=0)).clip(0, np.inf).sum()
        tv_pvalue = tv_test(tv_upper, outcomes, probs.shape[1])
        # Allow some error tolerance due to noise/finite data
        if tv_upper <= tv_threshold or tv_pvalue <= p_threshold:
            break

    # Our TV distance is the worst case of the two bounds
    tv_stat = np.max([tv_lower, tv_upper])
    sampler.quantiles = np.array([lower, upper])
    # TODO: how do we get a p-value estimate here? No clear notion of a null...

    print("Selected intervals: [{},{}]".format(lower, upper))
    # Note: in the CV branch, `model` is the model from the final fold; the
    # per-fold models are kept inside the sampler.
    return {"model": model,
            "probs": probs,
            "tv_stat": tv_stat,
            "upper": upper,
            "lower": lower,
            "qupper": qupper,
            "qlower": qlower,
            "quantiles": quantile_range,
            "sampler": sampler}
def calibrate_continuous(X, feature, X_test=None, nquantiles=101, nbootstraps=100,
                         nfolds=5, ks_threshold=0.005, p_threshold=0., use_cv=False):
    '''Calibrates a bootstrap confidence interval conditional model for a given
    continuous feature.'''
    # Search over a linear grid of quantiles
    quantile_range = np.linspace(0, 100, nquantiles)
    jmask = np.ones(X.shape[1], dtype=bool)
    jmask[feature] = False
    if X_test is None and use_cv:
        # Use k-fold cross-validation to generate conditional density estimates for X_j
        print('Fitting using {} bootstrap resamples and {} folds'.format(nbootstraps, nfolds))
        cdfs = np.zeros((nquantiles, X.shape[0]))
        proposals = []
        folds = create_folds(X, nfolds)
        for fold_idx, fold in enumerate(folds):
            imask = np.ones(X.shape[0], dtype=bool)
            imask[fold] = False
            model = BootstrapConditionalModel(X[imask][:, jmask], X[imask][:, feature],
                                              fit_mdn, nbootstraps=nbootstraps)
            cdfs[:, fold] = model.cdf_quantiles(X[fold][:, jmask], X[fold][:, feature],
                                                quantile_range, axis=0)
            proposals.append(model)
        sampler = CrossValidationSampler(X[:, jmask], proposals, folds)
    else:
        if X_test is None:
            print('Using training set as testing set.')
            X_test = X
        # Use a held-out test set
        print('Fitting using {} bootstrap resamples and a {}/{} train/test split'.format(
            nbootstraps, X.shape[0], X_test.shape[0]))
        model = BootstrapConditionalModel(X[:, jmask], X[:, feature], fit_mdn,
                                          nbootstraps=nbootstraps)
        cdfs = model.cdf_quantiles(X_test[:, jmask], X_test[:, feature],
                                   quantile_range, axis=0)
        sampler = HoldoutSampler(X_test[:, jmask], model)

    # Evaluate the bounds of the CDF along a discrete grid of points
    ks_grid = np.linspace(1e-6, 1 - 1e-6, 1001)

    # Find the lower quantile that forms a sufficient upper bound on the uniform CDF
    for i in range(1, nquantiles // 2):
        lower = quantile_range[nquantiles // 2 - i]
        qlower = cdfs[nquantiles // 2 - i]
        # The U(0,1) CDF is the line from (0,0) to (1,1). So at every point q on
        # the grid of CDF points, a well-calibrated model should have q*N points
        # with CDF value below q. Here we are looking for an upper bound, so we
        # measure the KS distance as the maximum amount the U(0,1) CDF is above
        # the predicted CDF.
        ks_lower = 0
        for ks_point in ks_grid:
            ks_lower = max(ks_lower, ks_point - (qlower <= ks_point).mean())
        ks_pvalue = ks_test(ks_lower, cdfs.shape[1])
        # Allow some error tolerance due to noise/finite data
        if ks_lower <= ks_threshold or ks_pvalue <= p_threshold:
            break

    # Find the upper quantile
    for i in range(1, nquantiles // 2):
        upper = quantile_range[nquantiles // 2 + i]
        qupper = cdfs[nquantiles // 2 + i]
        # Here we are looking for a lower bound, so we measure the KS distance
        # as the maximum amount the U(0,1) CDF is below the predicted CDF.
        ks_upper = 0
        for ks_point in ks_grid:
            ks_upper = max(ks_upper, (qupper <= ks_point).mean() - ks_point)
        ks_pvalue = ks_test(ks_upper, cdfs.shape[1])
        # Allow some error tolerance due to noise/finite data
        if ks_upper <= ks_threshold or ks_pvalue <= p_threshold:
            break

    # Set the sampler to the chosen region
    sampler.quantiles = np.array([lower, upper])

    # Our KS distance is the worst case of the two bounds
    ks_stat = np.max([ks_lower, ks_upper])

    # The p-value of the KS test that the bounded distribution differs from the
    # uniform distribution
    ks_pvalue = ks_test(ks_stat, cdfs.shape[1])

    print('Selected intervals: [{},{}]'.format(lower, upper))
    return {'model': model,
            'cdfs': cdfs,
            'ks_stat': ks_stat,
            'ks_pvalue': ks_pvalue,
            'upper': upper,
            'lower': lower,
            'qupper': qupper,
            'qlower': qlower,
            'quantiles': quantile_range,
            'sampler': sampler}
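# A small worked example of the one-sided KS bounds used above, in pure numpy
# with no repo dependencies. For calibrated CDF values u_1..u_N, the distance
# to U(0,1) is max_q (q - mean(u_i <= q)) from below and
# max_q (mean(u_i <= q) - q) from above; both should be near zero for
# well-calibrated values.
def _demo_ks_bound():
    rng = np.random.RandomState(0)
    u = rng.uniform(size=1000)  # perfectly calibrated CDF values
    ks_grid = np.linspace(1e-6, 1 - 1e-6, 1001)
    ks_lower = max(q - (u <= q).mean() for q in ks_grid)
    ks_upper = max((u <= q).mean() - q for q in ks_grid)
    print('one-sided KS distances: {:.4f}, {:.4f}'.format(ks_lower, ks_upper))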
def run(trial, feature, reset=False):
    '''Runs the CVRT for one feature of one trial across all candidate models,
    reusing the null samples from the first model for the rest.'''
    N = 500             # total number of samples
    P = 500             # number of features
    S = 40              # number of signal features
    nperms = 5000
    fdr_threshold = 0.1
    nfolds = 5
    X, y, truth = load_or_create_dataset(trial, N, P, S)
    np.random.seed(trial * P + feature)
    infos = [ModelInfo(trial, "Partial Least Squares", fit_pls, "pls"),
             ModelInfo(trial, "Lasso", fit_lasso_cv, "lasso"),
             ModelInfo(trial, "Elastic Net", fit_elastic_net_cv, "enet"),
             ModelInfo(trial, "Bayesian Ridge", fit_bridge, "bridge"),
             ModelInfo(trial, "Polynomial Kernel Ridge", fit_kridge, "kridge"),
             ModelInfo(trial, "RBF Support Vector", fit_svr, "svr"),
             ModelInfo(trial, "Random Forest", fit_forest, "rf"),
             # ModelInfo(trial, "Extra Trees", fit_extratrees, "xtrees")
             ]
    folds = get_model(infos[0], X, y, create_folds(X, nfolds), reset).folds
    models = [get_model(info, X, y, folds, reset) for info in infos]

    # Load the conditional model for this feature
    conditional = get_conditional(trial, feature)

    # Run the normal CVRT for the first model, but save the null samples to
    # avoid recomputing them for the rest of the models.
    info, model = infos[0], models[0]
    tstat = lambda X_target: ((y - model.predict(X_target))**2).mean()
    print("Running CVRT for {}".format(info.name))
    results = hrt(feature, tstat, X,
                  nperms=nperms,
                  conditional=conditional,
                  lower=conditional.quantiles[0],
                  upper=conditional.quantiles[1],
                  save_nulls=True)
    p_value = results["p_value"]
    print("p={}".format(p_value))
    np.save("data/{}/{}_{}.npy".format(trial, info.prefix, feature), p_value)

    # Get the relevant values from the full CVRT on the first model
    t_true = results["t_stat"]
    X_nulls = results["samples_null"]
    quantile_nulls = results["quantiles_null"]

    # Run the CVRTs for the remaining models, reusing the saved null samples
    X_null = np.copy(X)
    for info, model in zip(infos[1:], models[1:]):
        print("Running cached CVRT for {}".format(info.name))
        t_weights = np.full(nperms, np.nan)
        t_null = np.full(nperms, np.nan)
        # Bind the current model via a default argument to avoid any
        # late-binding surprises with the closure
        tstat = lambda X_target, model=model: ((y - model.predict(X_target))**2).mean()
        t_true = tstat(X)
        for perm in range(nperms):
            if (perm % 500) == 0:
                print("Trial {}".format(perm))
            # Get the test statistic under the null
            X_null[:, feature] = X_nulls[perm]
            t_null[perm] = tstat(X_null)
            if t_null[perm] <= t_true:
                # Over-estimate the likelihood
                t_weights[perm] = quantile_nulls[perm, 1]
            else:
                # Under-estimate the likelihood
                t_weights[perm] = quantile_nulls[perm, 0]
        p_value = t_weights[t_null <= t_true].sum() / t_weights.sum()
        print("p={}".format(p_value))
        np.save("data/{}/{}_{}.npy".format(trial, info.prefix, feature), p_value)
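# run() saves one p-value per (trial, model, feature), and the fdr_threshold
# constant suggests they are aggregated downstream with an FDR-controlling
# procedure. A sketch of standard Benjamini-Hochberg selection over a vector
# of p-values (this is the textbook procedure, not code taken from this repo):
def _demo_bh_selection(p_values, fdr=0.1):
    p = np.asarray(p_values)
    order = np.argsort(p)
    # BH thresholds: reject the k smallest p-values, where k is the largest
    # index with p_(k) <= (k/m) * fdr
    thresholds = fdr * np.arange(1, len(p) + 1) / len(p)
    below = p[order] <= thresholds
    k = np.max(np.where(below)[0]) + 1 if below.any() else 0
    selected = np.zeros(len(p), dtype=bool)
    selected[order[:k]] = True
    return selected  # boolean mask of discoveries at the given FDR level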