def predictSubchallenge2(estimator, opt, output, model_output=None, features_output=None):
    # Load data
    print "loading data"
    d = load_datasets(
        opt["subchallenge"],
        opt["final"],
        opt["filterQA"],
        opt["use_mut"],
        opt["use_CNV"],
        opt["use_exp"],
        opt["use_methyl"],
        opt["use_drug_info"],
        opt["use_cell_info"],
    )
    saveObserved(d)

    if opt["use_drug_info"]:
        # Get split datasets
        # add cell_and_IDs here
        data = [
            (d["comb_train_input_None"], d["comb_train_output_None"], d["leaderboard_None"]),
            (d["comb_train_input_One"], d["comb_train_output_One"], d["leaderboard_One"]),
            (d["comb_train_input_Both"], d["comb_train_output_Both"], d["leaderboard_Both"]),
        ]  # in this case, data is a list that contains 3 sets of (X, y, LB)
    else:
        data = [(d["comb_train_input"], d["comb_train_output"], d["leaderboard"])]

    # Write run configuration to the report file
    if path.exists(output):
        remove(output)
    f = open(output, "a")  # append mode
    f.write("Estimator:\t" + estimator[0] + "\n\n")
    f.write("Estimator parameters:\n" + str(estimator[1].get_params()) + "\n\n")
    f.write("Options:\n")
    for key in opt:
        f.write(key + "\t" + str(opt[key]) + "\n")
    f.write("\n")

    iters_predictions = []
    feature_lists = []
    for i in xrange(len(data)):
        if len(data) > 1:
            f.write("Model " + str(i + 1) + "\n")

        # Preprocess data
        print "preprocessing data"
        X_train, LB, feature_names = preprocess(
            data[i][0],
            data[i][2],
            d,
            opt["use_mut"],
            opt["use_CNV"],
            opt["use_exp"],
            opt["exp_threshold"],
            opt["use_methyl"],
            opt["use_cell_info"],
            opt["scale"],
        )
        y_train = data[i][1].values

        # Feature selection
        print "feature selection"
        if opt["num_features"] > len(feature_names):
            selected = featureSelection(X_train, y_train, opt["selection_method"], estimator[1], "all")
        else:
            selected = featureSelection(X_train, y_train, opt["selection_method"], estimator[1], opt["num_features"])
        X_train = X_train[:, selected]
        LB = LB[:, selected]
        feature_lists.append([feature for (feature, mask) in zip(feature_names, selected) if mask])

        # Predict leaderboard/test output
        print "predicting"
        estimator[1].fit(X_train, y_train)
        iters_predictions.append(estimator[1].predict(LB))

    if features_output is not None:
        saveFeatures(feature_lists, features_output)

    # Save predictions and confidence
    if opt["use_drug_info"]:
        predictions = pd.concat(
            [
                pd.Series(data=iters_predictions[0], name="PREDICTION", index=d["leaderboard_None"].index),
                pd.Series(data=iters_predictions[1], name="PREDICTION", index=d["leaderboard_One"].index),
                pd.Series(data=iters_predictions[2], name="PREDICTION", index=d["leaderboard_Both"].index),
            ],
            axis=0,
        )
    else:
        predictions = pd.Series(data=iters_predictions[0], name="PREDICTION", index=d["leaderboard"].index)
    savePredictionsSubch2(predictions, d)
    saveConfidenceSubch2(d)

    # Save model
    if model_output is not None:
        joblib.dump(estimator, model_output)

    f.close()
def evaluateAndPredict(estimator, opt, output, evaluate=True, predict=False, model_output=None, features_output=None):
    # Load data
    print "loading data"
    d = load_datasets(
        opt["subchallenge"],
        opt["final"],
        opt["filterQA"],
        opt["use_mut"],
        opt["use_CNV"],
        opt["use_exp"],
        opt["use_methyl"],
        opt["use_drug_info"],
        opt["use_cell_info"],
    )
    saveObserved(d)

    if opt["use_drug_info"]:
        # Get split datasets
        data = [
            (d["comb_train_input_None"], d["comb_train_output_None"], d["leaderboard_None"]),
            (d["comb_train_input_One"], d["comb_train_output_One"], d["leaderboard_One"]),
            (d["comb_train_input_Both"], d["comb_train_output_Both"], d["leaderboard_Both"]),
        ]  # in this case, data is a list that contains 3 sets of (X, y, LB)
    else:
        data = [(d["comb_train_input"], d["comb_train_output"], d["leaderboard"])]

    # Write run configuration to the report file
    if path.exists(output):
        remove(output)
    f = open(output, "a")  # append mode
    f.write("Estimator:\t" + estimator[0] + "\n\n")
    f.write("Options:\n")
    for key in opt:
        f.write(key + "\t" + str(opt[key]) + "\n")
    f.write("\n")

    iters_cv_pred = []
    iters_predictions = []
    iters_confidence = []
    feature_lists = []
    for i in xrange(len(data)):
        if len(data) > 1:
            f.write("Model " + str(i + 1) + "\n")

        # Preprocess data
        print "preprocessing data"
        X_train, LB, feature_names = preprocess(
            data[i][0],
            data[i][2],
            d,
            opt["use_mut"],
            opt["use_CNV"],
            opt["use_exp"],
            opt["exp_threshold"],
            opt["use_methyl"],
            opt["use_cell_info"],
            opt["scale"],
        )
        y_train = data[i][1].values

        # Feature selection (if using SelectKBest)
        if opt["selection_method"] == "kbest":
            print "feature selection"
            if opt["num_features"] > len(feature_names):
                selected = featureSelection(X_train, y_train, opt["selection_method"], estimator[1], "all")
            else:
                selected = featureSelection(
                    X_train, y_train, opt["selection_method"], estimator[1], opt["num_features"]
                )
            X_train = X_train[:, selected]
            LB = LB[:, selected]

        # Optimize parameters (only does the actual optimization if the parameters dict is not empty)
        # and write the best parameters to file
        opt_est = optParameters(estimator[1], X_train, y_train, estimator[2], data[i][0])
        f.write("Best parameters:\n" + str(opt_est[1]) + "\n\n")

        # Feature selection (if not using SelectKBest)
        if opt["selection_method"] != "kbest":
            selected = featureSelection(X_train, y_train, opt["selection_method"], opt_est[0], opt["num_features"])
            X_train = X_train[:, selected]
            LB = LB[:, selected]
        feature_lists.append([feature for (feature, mask) in zip(feature_names, selected) if mask])

        if evaluate:
            print "evaluating model"
            # Model evaluation
            predictions, confidence = evalModel(opt_est[0], X_train, y_train, 5, opt["cv_iterations"])
            iters_cv_pred.append(predictions)
            iters_confidence.append(confidence)
        if predict:
            print "predicting"
            # Predict leaderboard/test output
            opt_est[0].fit(X_train, y_train)
            iters_predictions.append(opt_est[0].predict(LB))

    if features_output is not None:
        saveFeatures(feature_lists, features_output)

    # Join dataset subsets and calculate scores/save to file
    if evaluate:
        print "calculating scores"
        if opt["use_drug_info"]:
            cv_predictions = pd.concat(
                [
                    pd.Series(data=iters_cv_pred[0], name="PREDICTION", index=d["comb_train_input_None"].index),
                    pd.Series(data=iters_cv_pred[1], name="PREDICTION", index=d["comb_train_input_One"].index),
                    pd.Series(data=iters_cv_pred[2], name="PREDICTION", index=d["comb_train_input_Both"].index),
                ],
                axis=0,
            )
            confidence = pd.concat(
                [
                    pd.Series(data=iters_confidence[0], name="CONFIDENCE", index=d["comb_train_input_None"].index),
                    pd.Series(data=iters_confidence[1], name="CONFIDENCE", index=d["comb_train_input_One"].index),
                    pd.Series(data=iters_confidence[2], name="CONFIDENCE", index=d["comb_train_input_Both"].index),
                ],
                axis=0,
            )
        else:
            if estimator[0] == "PLS":
                # PLS returns array-like predictions; cast each value to a plain float
                iters_cv_pred[0] = [float(iters_cv_pred[0][i]) for i in xrange(len(iters_cv_pred[0]))]
                iters_confidence[0] = [float(iters_confidence[0][i]) for i in xrange(len(iters_confidence[0]))]
            cv_predictions = pd.Series(data=iters_cv_pred[0], name="PREDICTION", index=d["comb_train_input"].index)
            confidence = pd.Series(data=iters_confidence[0], name="CONFIDENCE", index=d["comb_train_input"].index)
        savePredictions(cv_predictions, d, CV=True)
        saveConfidence(confidence, d)

        # Calculate scores and write to file
        r2 = r2_score(d["comb_train_output"].values, cv_predictions.values)
        challenge_performance = drugCombiScore(d["comb_train_output"].values, cv_predictions.values, d)
        challenge_global = globalScore(d["comb_train_output"].values, cv_predictions.values, d)
        print challenge_performance
        print challenge_global
        f.write("Model Evaluation\n")
        f.write("R2:\t" + str(r2) + "\n")
        f.write("Performance score:\t" + str(challenge_performance[0]) + "\n")
        f.write("Standard error:\t" + str(challenge_performance[1]) + "\n")
        f.write("Global score:\t" + str(challenge_global[0]) + "\n")
        f.write("Primary metric:\t" + str(challenge_global[1]) + "\n")
        f.write("Tie-breaking metric:\t" + str(challenge_global[2]) + "\n")

    if predict:
        if opt["use_drug_info"]:
            predictions = pd.concat(
                [
                    pd.Series(data=iters_predictions[0], name="PREDICTION", index=d["leaderboard_None"].index),
                    pd.Series(data=iters_predictions[1], name="PREDICTION", index=d["leaderboard_One"].index),
                    pd.Series(data=iters_predictions[2], name="PREDICTION", index=d["leaderboard_Both"].index),
                ],
                axis=0,
            )
        else:
            predictions = pd.Series(data=iters_predictions[0], name="PREDICTION", index=d["leaderboard"].index)
        savePredictions(predictions, d, CV=False)

    # Save model
    if model_output is not None:
        joblib.dump(estimator, model_output)

    f.close()