# standard-library and third-party imports used below
import copy
import multiprocessing
import time

import numpy as np
import sklearn.preprocessing
import sklearn.cross_validation  # scikit-learn < 0.20 API
from sklearn.metrics import auc, roc_curve

# util, models, and ranking_metrics are project-local modules; helpers such as
# set_up_folds, train_linreg_model, feature_select, get_train_test,
# construct_filename, fill_in_truth_and_predictions, extract_fpr_tpr_for_fold,
# and extract_NDCG_for_fold are assumed to be defined elsewhere in this package.
import models
import ranking_metrics
import util


def linreg_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_options):
    """
    linreg using scikit-learn, using more standard regression models with
    penalization, requiring nested cross-validation
    """
    if learn_options["weighted"] is not None and (
        learn_options["penalty"] != "L2" or learn_options["method"] != "linreg"
    ):
        raise NotImplementedError("weighted prediction not implemented for any methods but L2 at the moment")

    cv, n_folds = set_up_folds(learn_options, y_all.iloc[train])

    if learn_options["penalty"] == "L1":
        l1_ratio = [1.0]
    elif learn_options["penalty"] == "L2":
        l1_ratio = [0.0]
    elif learn_options["penalty"] == "EN":  # elastic net
        l1_ratio = np.linspace(0.0, 1.0, 20)

    # score for every (alpha, l1_ratio) pair, summed over inner folds and averaged below
    performance = np.zeros((len(learn_options["alpha"]), len(l1_ratio)))
    degenerate_pred = np.zeros((len(learn_options["alpha"])))
    for train_inner, test_inner in cv:
        for i, alpha in enumerate(learn_options["alpha"]):
            for j, l1r in enumerate(l1_ratio):
                clf = train_linreg_model(alpha, l1r, learn_options, train_inner, X[train], y[train], y_all.iloc[train])
                if learn_options["feature_select"]:
                    clf, tmp_pred = feature_select(clf, learn_options, test_inner, train_inner, X[train], y[train])
                else:
                    tmp_pred = clf.predict(X[train][test_inner])

                if learn_options["training_metric"] == "AUC":
                    fpr, tpr, _ = roc_curve(y_all[learn_options["ground_truth_label"]][train][test_inner], tmp_pred)
                    assert ~np.any(np.isnan(fpr)), "found nan fpr"
                    assert ~np.any(np.isnan(tpr)), "found nan tpr"
                    tmp_auc = auc(fpr, tpr)
                    performance[i, j] += tmp_auc
                elif learn_options["training_metric"] == "spearmanr":
                    spearman = util.spearmanr_nonan(
                        y_all[learn_options["ground_truth_label"]][train][test_inner], tmp_pred.flatten()
                    )[0]
                    performance[i, j] += spearman
                elif learn_options["training_metric"] == "score":
                    # inner-fold indices are relative to the training rows, so
                    # index X[train], as in the other metric branches
                    performance[i, j] += clf.score(
                        X[train][test_inner], y_all[learn_options["ground_truth_label"]][train][test_inner]
                    )
                elif learn_options["training_metric"] == "NDCG":
                    assert (
                        "thresh" not in learn_options["ground_truth_label"]
                    ), "for NDCG must not use thresholded ranks, but pure ranks"
                    # sorted = tmp_pred[np.argsort(y_all[ground_truth_label].values[test_inner])[::-1]].flatten()
                    # sortedgt = np.sort(y_all[ground_truth_label].values[test_inner])[::-1].flatten()
                    # tmp_perf = ranking_metrics.ndcg_at_k_ties(sorted, learn_options["NDGC_k"], sortedgt)
                    tmp_truth = y_all[learn_options["ground_truth_label"]].values[train][test_inner].flatten()
                    tmp_perf = ranking_metrics.ndcg_at_k_ties(tmp_truth, tmp_pred.flatten(), learn_options["NDGC_k"])
                    performance[i, j] += tmp_perf

                    degenerate_pred_tmp = len(np.unique(tmp_pred)) < len(tmp_pred) / 2.0
                    degenerate_pred[i] += degenerate_pred_tmp

                    # sanity checking metric wrt ties, etc.
                    # (tmp_truth is only defined in this NDCG branch)
                    # rmse = np.sqrt(np.mean((tmp_pred - tmp_truth)**2))
                    tmp_pred_r, tmp_truth_r = ranking_metrics.rank_data(tmp_pred, tmp_truth)
                    # rmse_r = np.sqrt(np.mean((tmp_pred_r-tmp_truth_r)**2))

    performance /= n_folds

    max_score_ind = np.where(performance == np.nanmax(performance))
    assert max_score_ind != len(performance), "enlarge alpha range as hitting max boundary"
    # assert degenerate_pred[max_score_ind[0][0]]==0, "found degenerate predictions at max score"

    # in the unlikely event of tied scores, take the first one.
    if len(max_score_ind[0]) > 1:
        max_score_ind = [max_score_ind[0][0], max_score_ind[1][0]]

    best_alpha, best_l1r = learn_options["alpha"][max_score_ind[0]], l1_ratio[max_score_ind[1]]

    print "\t\tbest alpha is %f from range=%s" % (best_alpha, learn_options["alpha"][[0, -1]])
    if learn_options["penalty"] == "EN":
        print "\t\tbest l1_ratio is %f from range=%s" % (best_l1r, l1_ratio[[0, -1]])

    max_perf = np.nanmax(performance)
    if max_perf < 0.0:
        raise Exception("performance is negative")

    print "\t\tbest performance is %f" % max_perf

    # retrain on the full outer-training set with the selected hyperparameters
    clf = train_linreg_model(best_alpha, best_l1r, learn_options, train, X, y, y_all)
    if learn_options["feature_select"]:
        raise Exception("untested in a long time, should double check")
        clf, y_pred = feature_select(clf, learn_options, test, train, X, y)
    else:
        y_pred = clf.predict(X[test])

    if learn_options["penalty"] != "L2":
        y_pred = y_pred[:, None]

    return y_pred, clf
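
# The hyperparameter search above reduces to: accumulate a score for every
# (alpha, l1_ratio) pair over the inner folds, average, then take the argmax of
# the grid, breaking ties toward the first maximum. A minimal, self-contained
# sketch of just that selection step; the grids and scores here are synthetic
# stand-ins, and this helper is illustrative only (never called by the pipeline).
def _demo_hyperparam_selection():
    alphas = np.logspace(-3, 1, 5)        # stand-in for learn_options["alpha"]
    l1_ratios = np.linspace(0.0, 1.0, 3)  # stand-in for the elastic-net l1_ratio grid
    performance = np.random.RandomState(0).rand(len(alphas), len(l1_ratios))

    max_score_ind = np.where(performance == np.nanmax(performance))
    if len(max_score_ind[0]) > 1:  # in the unlikely event of ties, take the first one
        max_score_ind = [max_score_ind[0][0], max_score_ind[1][0]]
    best_alpha = alphas[max_score_ind[0]]
    best_l1r = l1_ratios[max_score_ind[1]]
    return best_alpha, best_l1r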
def cross_validate(y_all, feature_sets, learn_options=None, TEST=False, train_genes=None, CV=True):
    """
    feature_sets is a dictionary of "set name" to pandas.DataFrame; one set
    might be, e.g., single-nucleotide, position-independent features of order X.
    Method: "GPy" or "linreg"
    Metric: NDCG (learning-to-rank metric, Normalized Discounted Cumulative Gain); AUC
    Output: cv_score_median, gene_rocs
    """
    allowed_methods = [
        "GPy",
        "linreg",
        "AdaBoostRegressor",
        "DecisionTreeRegressor",
        "RandomForestRegressor",
        "ARDRegression",
        "GPy_fs",
        "mean",
        "random",
        "DNN",
        "lasso_ensemble",
        "doench",
        "logregL1",
        "sgrna_from_doench",
    ]
    assert learn_options["method"] in allowed_methods, "invalid method: %s" % learn_options["method"]
    assert (
        learn_options["method"] == "linreg" and learn_options["penalty"] == "L2" or learn_options["weighted"] is None
    ), "weighted only works with linreg L2 right now"

    # construct filename from options
    filename = construct_filename(learn_options, TEST)

    print "Cross-validating genes..."
    t2 = time.time()

    y = np.array(y_all[learn_options["target_name"]].values[:, None], dtype=np.float64)

    # concatenate feature sets into one nparray, and get the dimension of each
    inputs, dim, dimsum, feature_names = util.concatenate_feature_sets(feature_sets)

    if not CV:
        assert (
            learn_options["cv"] == "gene"
        ), "Can only use gene-CV when CV is False (I need to use all of the genes, and stratified complicates that)"

    # set-up for cross-validation; the outer loop is the one Doench et al. use genes for
    if learn_options["cv"] == "stratified":
        assert not learn_options[
            "extra pairs"
        ], "can't use extra pairs with stratified CV; need to figure out how to properly account for genes affected by two drugs"
        label_encoder = sklearn.preprocessing.LabelEncoder()
        label_encoder.fit(y_all["Target gene"].values)
        gene_classes = label_encoder.transform(y_all["Target gene"].values)
        if learn_options["train_genes"] is not None and learn_options["test_genes"] is not None:
            n_folds = len(learn_options["test_genes"])
        else:
            n_folds = len(learn_options["all_genes"])
        cv = sklearn.cross_validation.StratifiedKFold(gene_classes, n_folds=n_folds, shuffle=True, indices=True)
        fold_labels = ["fold%d" % i for i in range(1, n_folds + 1)]
        if learn_options["num_genes_remove_train"] is not None:
            raise NotImplementedError()
    elif learn_options["cv"] == "gene":
        cv = []

        if not CV:
            train_test_tmp = get_train_test("dummy", y_all)  # get train/test split using a dummy gene
            train_tmp, test_tmp = train_test_tmp
            # not a typo: we use the training set to test on as well, just for this
            # case; the test set is not used for internal cross-val, etc. anyway
            train_test_tmp = (train_tmp, train_tmp)
            cv.append(train_test_tmp)
            fold_labels = learn_options["all_genes"]
        elif learn_options["train_genes"] is not None and learn_options["test_genes"] is not None:
            assert (
                learn_options["train_genes"] is not None and learn_options["test_genes"] is not None
            ), "use both or neither"
            for i, gene in enumerate(learn_options["test_genes"]):
                cv.append(get_train_test(gene, y_all, learn_options["train_genes"]))
            fold_labels = learn_options["test_genes"]
            # if train and test genes are separate, there should be only one fold
            train_test_disjoint = set.isdisjoint(
                set(learn_options["train_genes"].tolist()), set(learn_options["test_genes"].tolist())
            )
        else:
            for i, gene in enumerate(learn_options["all_genes"]):
                train_test_tmp = get_train_test(gene, y_all)
                cv.append(train_test_tmp)
            fold_labels = learn_options["all_genes"]

        if learn_options["num_genes_remove_train"] is not None:
            for i, (train, test) in enumerate(cv):
                unique_genes = np.random.permutation(np.unique(y_all["Target gene"][train]))
                genes_to_keep = unique_genes[0 : len(unique_genes) - learn_options["num_genes_remove_train"]]
                guides_to_keep = []
                filtered_train = []
                for j, gene in enumerate(y_all["Target gene"]):
                    if j in train and gene in genes_to_keep:
                        filtered_train.append(j)
                cv_i_orig = copy.deepcopy(cv[i])
                cv[i] = (filtered_train, test)
                if learn_options["num_genes_remove_train"] == 0:
                    assert np.all(cv_i_orig[0] == cv[i][0])
                    assert np.all(cv_i_orig[1] == cv[i][1])
                print "# train/train after/before is %s, %s" % (len(cv[i][0]), len(cv_i_orig[0]))
                print "# test/test after/before is %s, %s" % (len(cv[i][1]), len(cv_i_orig[1]))
    else:
        raise Exception("invalid cv options given: %s" % learn_options["cv"])

    cv = [c for c in cv]  # make a list from the generator, so we can subset it for the TEST case
    if TEST:
        ind_to_use = [0]  # [0, 1]
        cv = [cv[i] for i in ind_to_use]
        fold_labels = [fold_labels[i] for i in ind_to_use]

    truth = dict([(t, dict([(m, np.array([])) for m in ["raw", "ranks", "thrs"]])) for t in fold_labels])
    predictions = dict([(t, np.array([])) for t in fold_labels])

    m = {}
    metrics = []

    # do the cross-validation
    num_proc = learn_options["num_proc"]
    if num_proc > 1:
        num_proc = np.min([num_proc, len(cv)])
        print "using multiprocessing with %d procs--one for each fold" % num_proc
        jobs = []
        pool = multiprocessing.Pool(processes=num_proc)
        for i, fold in enumerate(cv):
            train, test = fold
            print "working on fold %d of %d, with %d train and %d test" % (i + 1, len(cv), len(train), len(test))
            if learn_options["method"] == "GPy":
                on_fold_fn = models.GP.gp_on_fold
            elif learn_options["method"] == "linreg":
                on_fold_fn = models.regression.linreg_on_fold
            elif learn_options["method"] == "logregL1":
                on_fold_fn = models.regression.logreg_on_fold
            elif learn_options["method"] == "AdaBoostRegressor":
                on_fold_fn = models.ensembles.adaboost_on_fold
            elif learn_options["method"] == "DecisionTreeRegressor":
                on_fold_fn = models.ensembles.decisiontree_on_fold
            elif learn_options["method"] == "RandomForestRegressor":
                on_fold_fn = models.ensembles.randomforest_on_fold
            elif learn_options["method"] == "ARDRegression":
                on_fold_fn = models.regression.ARDRegression_on_fold
            elif learn_options["method"] == "random":
                on_fold_fn = models.baselines.random_on_fold
            elif learn_options["method"] == "mean":
                on_fold_fn = models.baselines.mean_on_fold
            elif learn_options["method"] == "DNN":
                on_fold_fn = models.DNN.DNN_on_fold
            elif learn_options["method"] == "lasso_ensemble":
                on_fold_fn = models.ensembles.LASSOs_ensemble_on_fold
            elif learn_options["method"] == "doench":
                on_fold_fn = models.baselines.doench_on_fold
            elif learn_options["method"] == "sgrna_from_doench":
                on_fold_fn = models.baselines.sgrna_from_doench_on_fold
            else:
                raise Exception("did not find method=%s" % learn_options["method"])
            # every per-fold function shares the same signature
            job = pool.apply_async(on_fold_fn, args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
            jobs.append(job)
        pool.close()
        pool.join()

        for i, fold in enumerate(cv):
            y_pred, m[i] = jobs[i].get()
            train, test = fold

            if learn_options["training_metric"] == "AUC":
                extract_fpr_tpr_for_fold(
                    metrics, fold_labels[i], i, predictions, truth,
                    y_all[learn_options["ground_truth_label"]].values, test, y_pred,
                )
            elif learn_options["training_metric"] == "NDCG":
                extract_NDCG_for_fold(
                    metrics, fold_labels[i], i, predictions, truth,
                    y_all[learn_options["ground_truth_label"]].values, test, y_pred, learn_options,
                )
            elif learn_options["training_metric"] == "spearmanr":
                extract_spearman_for_fold(
                    metrics, fold_labels[i], i, predictions, truth,
                    y_all[learn_options["ground_truth_label"]].values, test, y_pred, learn_options,
                )
            else:
                raise Exception("invalid 'training_metric' in learn_options: %s" % learn_options["training_metric"])

            truth, predictions = fill_in_truth_and_predictions(
                truth, predictions, fold_labels[i], y_all, y_pred, learn_options, test
            )

        pool.terminate()
    else:
        # non-parallel version
        for i, fold in enumerate(cv):
            train, test = fold
            if learn_options["method"] == "GPy":
                on_fold_fn = models.GP.gp_on_fold
            elif learn_options["method"] == "linreg":
                on_fold_fn = models.regression.linreg_on_fold
            elif learn_options["method"] == "logregL1":
                on_fold_fn = models.regression.logreg_on_fold
            elif learn_options["method"] == "AdaBoostRegressor":
                on_fold_fn = models.ensembles.adaboost_on_fold
            elif learn_options["method"] == "DecisionTreeRegressor":
                on_fold_fn = models.ensembles.decisiontree_on_fold
            elif learn_options["method"] == "RandomForestRegressor":
                on_fold_fn = models.ensembles.randomforest_on_fold
            elif learn_options["method"] == "ARDRegression":
                on_fold_fn = models.regression.ARDRegression_on_fold
            elif learn_options["method"] == "GPy_fs":
                on_fold_fn = models.GP.gp_with_fs_on_fold
            elif learn_options["method"] == "random":
                on_fold_fn = models.baselines.random_on_fold
            elif learn_options["method"] == "mean":
                on_fold_fn = models.baselines.mean_on_fold
            elif learn_options["method"] == "DNN":
                on_fold_fn = models.DNN.DNN_on_fold
            elif learn_options["method"] == "lasso_ensemble":
                on_fold_fn = models.ensembles.LASSOs_ensemble_on_fold
            elif learn_options["method"] == "doench":
                on_fold_fn = models.baselines.doench_on_fold
            elif learn_options["method"] == "sgrna_from_doench":
                on_fold_fn = models.baselines.sgrna_from_doench_on_fold
            else:
                raise Exception("invalid method found: %s" % learn_options["method"])
            y_pred, m[i] = on_fold_fn(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)

            if learn_options["training_metric"] == "AUC":
                # fills in truth and predictions
                extract_fpr_tpr_for_fold(
                    metrics, fold_labels[i], i, predictions, truth,
                    y_all[learn_options["ground_truth_label"]].values, test, y_pred,
                )
            elif learn_options["training_metric"] == "NDCG":
                extract_NDCG_for_fold(
                    metrics, fold_labels[i], i, predictions, truth,
                    y_all[learn_options["ground_truth_label"]].values, test, y_pred, learn_options,
                )
            elif learn_options["training_metric"] == "spearmanr":
                extract_spearman_for_fold(
                    metrics, fold_labels[i], i, predictions, truth,
                    y_all[learn_options["ground_truth_label"]].values, test, y_pred, learn_options,
                )

            truth, predictions = fill_in_truth_and_predictions(
                truth, predictions, fold_labels[i], y_all, y_pred, learn_options, test
            )

            print "\t\tRMSE: ", np.sqrt(((y_pred - y[test]) ** 2).mean())
            print "\t\tSpearman correlation: ", util.spearmanr_nonan(y[test], y_pred)[0]
            print "\t\tfinished fold/gene %i of %i" % (i + 1, len(fold_labels))

    cv_median_metric = [np.median(metrics)]
    gene_pred = [(truth, predictions)]
    print "\t\tmedian %s across gene folds: %.3f" % (learn_options["training_metric"], cv_median_metric[-1])

    t3 = time.time()
    print "\t\tElapsed time for cv is %.2f seconds" % (t3 - t2)

    return metrics, gene_pred, fold_labels, m, dimsum, filename, feature_names
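
# A hedged sketch of the learn_options dict that cross_validate and the
# *_on_fold functions read. Only keys consumed above are listed, and every
# value here (including the column names) is an illustrative assumption, not a
# default taken from the real pipeline. Stratified CV additionally requires an
# "extra pairs" key, and the NDCG metric an "NDGC_k" key.
def _example_learn_options(all_genes):
    return {
        "method": "linreg",               # one of allowed_methods above
        "penalty": "L2",                  # "L1", "L2", or "EN"
        "weighted": None,                 # only supported with linreg + L2
        "target_name": "score",           # assumed column of y_all
        "ground_truth_label": "score",    # assumed column of y_all
        "training_metric": "spearmanr",   # "AUC", "NDCG", or "spearmanr"
        "cv": "gene",                     # "gene" or "stratified"
        "train_genes": None,
        "test_genes": None,
        "all_genes": all_genes,
        "num_genes_remove_train": None,
        "num_proc": 1,                    # >1 uses one multiprocessing worker per fold
        "alpha": np.logspace(-2, 2, 10),  # grid searched in linreg_on_fold
        "feature_select": False,
    }
# usage (hypothetical y_all/feature_sets/genes):
#   cross_validate(y_all, feature_sets, learn_options=_example_learn_options(genes))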
def extract_spearman_for_fold(metrics, fold, i, predictions, truth, y_ground_truth, test, y_pred, learn_options):
    spearman = util.spearmanr_nonan(y_ground_truth[test].flatten(), y_pred.flatten())[0]
    assert not np.isnan(spearman), "found nan spearman"
    metrics.append(spearman)
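
# Minimal sketch of extract_spearman_for_fold on synthetic inputs: the
# fold/predictions/truth bookkeeping arguments are ignored by the function
# itself, which only appends the fold's Spearman rho to `metrics`. Illustrative
# only; assumes util.spearmanr_nonan behaves like scipy.stats.spearmanr but
# tolerates NaNs.
def _demo_extract_spearman():
    metrics = []
    y_ground_truth = np.arange(10, dtype=np.float64)
    y_pred = y_ground_truth * 2.0  # identical ranking, so rho == 1.0
    test = np.arange(10)
    extract_spearman_for_fold(metrics, "gene0", 0, {}, {}, y_ground_truth, test, y_pred, learn_options=None)
    return metrics  # [1.0]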