Example No. 1
File: ML.py Project: dlrsb/dream
def predictSubchallenge2(estimator, opt, output, model_output=None, features_output=None):
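    """Fit `estimator` on the full training data and save leaderboard
    predictions for subchallenge 2.

    Note: `estimator` is assumed to be a (name, sklearn-style model) pair,
    as implied by the estimator[0]/estimator[1] accesses below.
    """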
    # Load data
    print "loading data"
    d = load_datasets(
        opt["subchallenge"],
        opt["final"],
        opt["filterQA"],
        opt["use_mut"],
        opt["use_CNV"],
        opt["use_exp"],
        opt["use_methyl"],
        opt["use_drug_info"],
        opt["use_cell_info"],
    )
    saveObserved(d)

    if opt["use_drug_info"]:
        # Get split datasets
        # add cell_and_IDs here
        data = [
            (d["comb_train_input_None"], d["comb_train_output_None"], d["leaderboard_None"]),
            (d["comb_train_input_One"], d["comb_train_output_One"], d["leaderboard_One"]),
            (d["comb_train_input_Both"], d["comb_train_output_Both"], d["leaderboard_Both"]),
        ]
        # data is now a list of three (X, y, leaderboard) triples, one per drug-info split (None/One/Both)
    else:
        data = [(d["comb_train_input"], d["comb_train_output"], d["leaderboard"])]

    f = open(output, "w")  # overwrite any previous report
    f.write("Estimator:\t" + estimator[0] + "\n\n")
    f.write("Estimator parameters:\n" + str(estimator[1].get_params()) + "\n\n")
    f.write("Options:\n")
    for key in opt:
        f.write(key + "\t" + str(opt[key]) + "\n")
    f.write("\n")

    iters_predictions = []
    feature_lists = []
    for i in range(len(data)):
        if len(data) > 1:
            f.write("Model " + str(i + 1) + "\n")

        # Preprocess data
        print "preprocessing data"
        X_train, LB, feature_names = preprocess(
            data[i][0],
            data[i][2],
            d,
            opt["use_mut"],
            opt["use_CNV"],
            opt["use_exp"],
            opt["exp_threshold"],
            opt["use_methyl"],
            opt["use_cell_info"],
            opt["scale"],
        )
        y_train = data[i][1].values

        # Feature selection; ask for "all" when the requested count exceeds
        # the number of available features
        print("feature selection")
        if opt["num_features"] > len(feature_names):
            selected = featureSelection(X_train, y_train, opt["selection_method"], estimator[1], "all")
        else:
            selected = featureSelection(X_train, y_train, opt["selection_method"], estimator[1], opt["num_features"])
        X_train = X_train[:, selected]
        LB = LB[:, selected]
        feature_lists.append([feature for (feature, mask) in zip(feature_names, selected) if mask])

        # Predict leaderboard/test output:
        print "predicting"
        estimator[1].fit(X_train, y_train)
        iters_predictions.append(estimator[1].predict(LB))

    if features_output is not None:
        saveFeatures(feature_lists, features_output)

    # Save predictions and confidence
    if opt["use_drug_info"]:
        predictions = pd.concat(
            [
                pd.Series(data=iters_predictions[0], name="PREDICTION", index=d["leaderboard_None"].index),
                pd.Series(data=iters_predictions[1], name="PREDICTION", index=d["leaderboard_One"].index),
                pd.Series(data=iters_predictions[2], name="PREDICTION", index=d["leaderboard_Both"].index),
            ],
            axis=0,
        )
    else:
        predictions = pd.Series(data=iters_predictions[0], name="PREDICTION", index=d["leaderboard"].index)
    savePredictionsSubch2(predictions, d)
    saveConfidenceSubch2(d)

    # Save model
    if model_output is not None:
        joblib.dump(estimator, model_output)

    f.close()
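For reference, a minimal invocation sketch. The opt keys are exactly the ones the function reads above; the RandomForestRegressor stand-in, the concrete option values, and the output paths are illustrative assumptions, not the project's actual configuration:

from sklearn.ensemble import RandomForestRegressor

# (name, model) pair, matching the estimator[0]/estimator[1] accesses above
estimator = ("RF", RandomForestRegressor(n_estimators=500))

# illustrative values for every option the function reads
opt = {
    "subchallenge": 2, "final": False, "filterQA": True,
    "use_mut": True, "use_CNV": True, "use_exp": True, "exp_threshold": 0.0,
    "use_methyl": False, "use_drug_info": True, "use_cell_info": True,
    "scale": True, "selection_method": "kbest", "num_features": 1000,
}

predictSubchallenge2(estimator, opt, "subch2_report.txt",
                     model_output="subch2_model.pkl",
                     features_output="subch2_features.txt")

evaluateAndPredict below reads the same options plus "cv_iterations".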
Example No. 2
File: ML.py Project: dlrsb/dream
def evaluateAndPredict(estimator, opt, output, evaluate=True, predict=False, model_output=None, features_output=None):
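    """Cross-validate (`evaluate`) and/or produce leaderboard predictions
    (`predict`) for the given options.

    Note: here `estimator` is assumed to be a (name, model, parameter-grid)
    triple; estimator[2] is forwarded to optParameters below.
    """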
    # Load data
    print "loading data"
    d = load_datasets(
        opt["subchallenge"],
        opt["final"],
        opt["filterQA"],
        opt["use_mut"],
        opt["use_CNV"],
        opt["use_exp"],
        opt["use_methyl"],
        opt["use_drug_info"],
        opt["use_cell_info"],
    )
    saveObserved(d)

    if opt["use_drug_info"]:
        # Get split datasets
        data = [
            (d["comb_train_input_None"], d["comb_train_output_None"], d["leaderboard_None"]),
            (d["comb_train_input_One"], d["comb_train_output_One"], d["leaderboard_One"]),
            (d["comb_train_input_Both"], d["comb_train_output_Both"], d["leaderboard_Both"]),
        ]
        # data is now a list of three (X, y, leaderboard) triples, one per drug-info split (None/One/Both)
    else:
        data = [(d["comb_train_input"], d["comb_train_output"], d["leaderboard"])]

    f = open(output, "w")  # overwrite any previous report
    f.write("Estimator:\t" + estimator[0] + "\n\n")
    f.write("Options:\n")
    for key in opt:
        f.write(key + "\t" + str(opt[key]) + "\n")
    f.write("\n")

    iters_cv_pred = []
    iters_predictions = []
    iters_confidence = []
    feature_lists = []
    for i in range(len(data)):
        if len(data) > 1:
            f.write("Model " + str(i + 1) + "\n")

        # Preprocess data
        print "preprocessing data"
        X_train, LB, feature_names = preprocess(
            data[i][0],
            data[i][2],
            d,
            opt["use_mut"],
            opt["use_CNV"],
            opt["use_exp"],
            opt["exp_threshold"],
            opt["use_methyl"],
            opt["use_cell_info"],
            opt["scale"],
        )
        y_train = data[i][1].values

        # Feature selection (runs before parameter optimization when using SelectKBest)
        if opt["selection_method"] == "kbest":
            print("feature selection")
            if opt["num_features"] > len(feature_names):
                selected = featureSelection(X_train, y_train, opt["selection_method"], estimator[1], "all")
            else:
                selected = featureSelection(
                    X_train, y_train, opt["selection_method"], estimator[1], opt["num_features"]
                )
            X_train = X_train[:, selected]
            LB = LB[:, selected]

        # Optimize hyperparameters (optParameters only runs the actual search
        # when the parameter grid estimator[2] is non-empty) and record the
        # best ones; opt_est is a (best_estimator, best_params) pair.
        opt_est = optParameters(estimator[1], X_train, y_train, estimator[2], data[i][0])
        f.write("Best parameters:\n" + str(opt_est[1]) + "\n\n")

        # Feature selection (non-kbest methods run on the optimized estimator)
        if opt["selection_method"] != "kbest":
            selected = featureSelection(X_train, y_train, opt["selection_method"], opt_est[0], opt["num_features"])
            X_train = X_train[:, selected]
            LB = LB[:, selected]

        feature_lists.append([feature for (feature, mask) in zip(feature_names, selected) if mask])

        if evaluate:
            print "evaluating model"
            # Model evaluation
            predictions, confidence = evalModel(opt_est[0], X_train, y_train, 5, opt["cv_iterations"])
            iters_cv_pred.append(predictions)
            iters_confidence.append(confidence)

        if predict:
            print "predicting"
            # Predict leaderboard/test output:
            opt_est[0].fit(X_train, y_train)
            iters_predictions.append(opt_est[0].predict(LB))

    if features_output is not None:
        saveFeatures(feature_lists, features_output)

    # Join dataset subsets and calculate scores/save to file
    if evaluate:
        print "calculating scores"
        if opt["use_drug_info"]:
            cv_predictions = pd.concat(
                [
                    pd.Series(data=iters_cv_pred[0], name="PREDICTION", index=d["comb_train_input_None"].index),
                    pd.Series(data=iters_cv_pred[1], name="PREDICTION", index=d["comb_train_input_One"].index),
                    pd.Series(data=iters_cv_pred[2], name="PREDICTION", index=d["comb_train_input_Both"].index),
                ],
                axis=0,
            )
            confidence = pd.concat(
                [
                    pd.Series(data=iters_confidence[0], name="CONFIDENCE", index=d["comb_train_input_None"].index),
                    pd.Series(data=iters_confidence[1], name="CONFIDENCE", index=d["comb_train_input_One"].index),
                    pd.Series(data=iters_confidence[2], name="CONFIDENCE", index=d["comb_train_input_Both"].index),
                ],
                axis=0,
            )
        else:
            if estimator[0] == "PLS":
                # PLS predictions come back as single-element arrays; coerce to plain floats
                iters_cv_pred[0] = [float(v) for v in iters_cv_pred[0]]
                iters_confidence[0] = [float(v) for v in iters_confidence[0]]
            cv_predictions = pd.Series(data=iters_cv_pred[0], name="PREDICTION", index=d["comb_train_input"].index)
            confidence = pd.Series(data=iters_confidence[0], name="CONFIDENCE", index=d["comb_train_input"].index)
        savePredictions(cv_predictions, d, CV=True)
        saveConfidence(confidence, d)

        # Calculate scores and write to file
        r2 = r2_score(d["comb_train_output"].values, cv_predictions.values)
        challenge_performance = drugCombiScore(d["comb_train_output"].values, cv_predictions.values, d)
        challenge_global = globalScore(d["comb_train_output"].values, cv_predictions.values, d)
        print(challenge_performance)
        print(challenge_global)
        f.write("Model Evaluation\n")
        f.write("R2:\t" + str(r2) + "\n")
        f.write("Performance score:\t" + str(challenge_performance[0]) + "\n")
        f.write("Standard error:\t" + str(challenge_performance[1]) + "\n")
        f.write("Global score:\t" + str(challenge_global[0]) + "\n")
        f.write("Primary metric:\t" + str(challenge_global[1]) + "\n")
        f.write("Tie-breaking metric:\t" + str(challenge_global[2]) + "\n")

    if predict:
        if opt["use_drug_info"]:
            predictions = pd.concat(
                [
                    pd.Series(data=iters_predictions[0], name="PREDICTION", index=d["leaderboard_None"].index),
                    pd.Series(data=iters_predictions[1], name="PREDICTION", index=d["leaderboard_One"].index),
                    pd.Series(data=iters_predictions[2], name="PREDICTION", index=d["leaderboard_Both"].index),
                ],
                axis=0,
            )
        else:
            predictions = pd.Series(data=iters_predictions[0], name="PREDICTION", index=d["leaderboard"].index)
        savePredictions(predictions, d, CV=False)

    # Save model
    if model_output is not None:
        joblib.dump(estimator, model_output)

    f.close()
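Both functions use the value returned by featureSelection as a column index (X_train[:, selected]) and also zip it against feature_names, so it has to be a boolean mask of length n_features, such as the output of scikit-learn's get_support(). A minimal sketch consistent with that contract, assuming SelectKBest for "kbest" and RFE for the estimator-based case; the project's actual implementation elsewhere in ML.py may differ:

from sklearn.feature_selection import RFE, SelectKBest, f_regression

def featureSelection(X, y, method, est, k):
    # Return a boolean mask over the columns of X, usable both as a
    # column index (X[:, mask]) and zipped against feature names.
    if method == "kbest":
        return SelectKBest(f_regression, k=k).fit(X, y).get_support()
    # assumption: non-"kbest" methods wrap the supplied estimator, e.g. RFE
    n = X.shape[1] if k == "all" else k
    return RFE(est, n_features_to_select=n).fit(X, y).get_support()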