Пример #1
0
def doMolGMM(molNdx):
    """Cross-validate and test an MLP regressor on one molecule's descriptors.

    Despite the name, this routine fits ``MLPRegressor`` models (not Gaussian
    mixtures). For every entry of the module-level ``datasetPortion`` it:

    1. loads a held-out test set (20% of the portion) and a training pool
       (80% of the portion, excluding the test paths) via
       ``cu.loadDescriptors``;
    2. runs 3-fold cross-validation on the training pool and records the mean
       and standard deviation of the AUC and of the enrichment factors at 1%
       and 5%;
    3. refits on the whole training pool, scores the test set, plots ROC
       curves into ``results/`` and records the test metrics and wall time;
    4. rewrites ``results/results_ann_<molName>.txt`` with everything gathered
       so far.

    Records returned by ``cu`` are used here as indexable items whose element
    ``[0]`` is a DataFrame of conformer descriptors and whose element ``[2]``
    is the truth label; the last two DataFrame columns are treated as
    non-feature columns (one of them is named ``"active"``).

    Args:
        molNdx: Index into the module-level ``molfiles`` sequence; each entry
            is used as ``(descriptor_path, molecule_name)``.

    Returns:
        None. Results are printed and written to disk.
    """
    componentResults = []
    portionResults = []

    molName = molfiles[molNdx][1]
    for portion in datasetPortion:
        t0 = time.time()
        descTypes = ["usr", "esh", "es5"]
        descType = descTypes[1]
        if portion <= 1:
            print("Loading " + str(portion * 100) + "% of " + molName)
        else:
            print("Loading " + str(portion) + " actives from " + molName)

        # NOTE(review): "SEPERATE" here vs "SEPARATE" in the second call --
        # confirm which spelling cu.loadDescriptors actually expects.
        (test_ds, test_paths) = cu.loadDescriptors(molfiles[molNdx][0],
                                                   portion * 0.2,
                                                   dtype=descType,
                                                   active_decoy_ratio=-1,
                                                   selection_policy="RANDOM",
                                                   return_type="SEPERATE")
        # The last two columns are not descriptor features.
        numcols = test_ds[0][0].shape[1] - 2

        folds = 3

        (n_fold_ds,
         n_fold_paths) = cu.loadDescriptors(molfiles[molNdx][0],
                                            portion * 0.8,
                                            dtype=descType,
                                            active_decoy_ratio=-1,
                                            selection_policy="RANDOM",
                                            return_type="SEPARATE",
                                            exclusion_list=test_paths)

        (folds_list, excl_list) = cu.split(n_fold_ds, folds, policy="RANDOM")

        foldResults = []

        for fold in range(0, folds):

            val_ds = folds_list[fold]

            # Training data = descriptor frames of every fold but `fold`.
            train_ds = None

            for i in range(0, folds):
                if i != fold:
                    if train_ds is None:
                        train_ds = [r[0] for r in folds_list[i]]
                    else:
                        # NOTE(review): this appends a nested list rather than
                        # extending; presumably cu.joinDataframes flattens it
                        # -- verify against its implementation.
                        train_ds.append([r[0] for r in folds_list[i]])

            train_ds = cu.joinDataframes(train_ds)

            numcols = train_ds.shape[1] - 2

            ann = MLPRegressor(max_iter=1000, early_stopping=True)

            # Regress on the activity flag scaled to 0/100.
            ann.fit(train_ds.iloc[:, 0:numcols],
                    ((train_ds["active"])).astype(int) * 100)

            results = pd.DataFrame()

            # A record's score is the best prediction over its rows.
            results["score"] = [
                max(ann.predict(x[0].iloc[:, 0:numcols])) for x in val_ds
            ]
            results["truth"] = [x[2] for x in val_ds]
            auc = eval.plotSimROC(np.array(results["truth"]),
                                  np.array([results["score"]]), "", None)
            mean_ef = eval.getMeanEFs(np.array(results["truth"]),
                                      np.array([results["score"]]))
            foldResults.append((auc, mean_ef))

        print("X-Validation results: ")
        print(foldResults)

        if len(foldResults) > 0:
            fold_aucs = [x[0] for x in foldResults]
            fold_ef_1pc = [x[1][0.01] for x in foldResults]
            fold_ef_5pc = [x[1][0.05] for x in foldResults]

            mean_auc_sim = np.mean(fold_aucs)
            # Spread across folds; taking the std of the mean (a scalar), as
            # was done previously, always yields 0.
            std_auc_sim = np.std(fold_aucs)
            mean_mean_ef_1pc = np.mean(fold_ef_1pc)
            std_mean_ef_1pc = np.std(fold_ef_1pc)
            mean_mean_ef_5pc = np.mean(fold_ef_5pc)
            std_mean_ef_5pc = np.std(fold_ef_5pc)

            print("mean AUC=" + str(mean_auc_sim) + ", std=" +
                  str(std_auc_sim) + ", mean EF(1%)=" + str(mean_mean_ef_1pc) +
                  ", std=" + str(std_mean_ef_1pc) + ", mean EF(5%)=" +
                  str(mean_mean_ef_5pc) + ", std=" + str(std_mean_ef_5pc))

            componentResults.append(
                (molName, portion, mean_auc_sim, std_auc_sim, mean_mean_ef_1pc,
                 std_mean_ef_1pc, mean_mean_ef_5pc, std_mean_ef_5pc))
        else:
            print("X-Validation returned no results. Skipping training...")
            componentResults.append((molName, portion, 0, 0, 0, 0, 0, 0))

        # Final model for this portion: fit on the whole training pool and
        # evaluate on the held-out test set.
        train_ds = cu.lumpRecords(n_fold_ds)
        ann = MLPRegressor(max_iter=1000, early_stopping=True)
        ann.fit(train_ds.iloc[:, 0:numcols],
                ((train_ds["active"])).astype(int) * 100)

        results = pd.DataFrame()

        results["score"] = [
            max(ann.predict(x[0].iloc[:, 0:numcols])) for x in test_ds
        ]
        results["truth"] = [x[2] for x in test_ds]

        auc_sim = eval.plotSimROC(
            results["truth"], [results["score"]],
            molName + "[ANN, " + str(portion * 100) + "%]",
            "results/" + molName + "_ANN_sim_" + str(portion * 100) + ".pdf")
        auc_rank = eval.plotRankROC(
            results["truth"], [results["score"]],
            molName + "[ANN-" + str(portion * 100) + "%]",
            "results/" + molName + "_ANN_rank_" + str(portion * 100) + ".pdf")

        mean_ef = eval.getMeanEFs(np.array(results["truth"]),
                                  np.array([results["score"]]))

        # Report the held-out AUC, not the last cross-validation fold's.
        print("AUC(Sim)=" + str(auc_sim))
        print("EF: ", mean_ef)
        t1 = time.time()

        portionResults.append(
            (molName, portion, auc_sim, auc_rank, mean_ef, (t1 - t0)))

        print("Time taken = " + str(t1 - t0))

        print(componentResults)
        print(portionResults)

        # Rewritten after every portion so partial results survive a crash.
        with open("results/results_ann_" + molName + ".txt", 'w') as f1:
            print(componentResults, file=f1)
            print(portionResults, file=f1)
Пример #2
0
                                      eval_method="sim")
            foldResults.append(auc)

        print("X-Validation results: ")
        print(foldResults)

        if len(foldResults) > 0:
            print("mean F1=" + str(np.mean(foldResults)) + ", std=" +
                  str(np.std(foldResults)))

            componentResults.append((auc, mean_ef))
        else:
            print("X-Validation returned no results. Skipping training...")
            componentResults.append((0, 0))

        train_ds = cu.lumpRecords(n_fold_ds)
        svm1c = OneClassSVM()

        train_a = train_ds[train_ds["active"] == True]

        #ann.fit(train_ds.iloc[:, 0:numcols], train_ds.iloc[:, numcols])
        svm1c.fit(train_a.iloc[:, 0:numcols], None)
        #    G_a = GaussianMixture(n_components=best_components, covariance_type="full").fit(train_ds.iloc[:, 0:numcols],
        #                                                                               train_ds.iloc[:, numcols])

        results = pd.DataFrame()

        results["score"] = [
            max(svm1c.score_samples(x[0].iloc[:, 0:numcols])) for x in test_ds
        ]
        #results["a_score"] = [G_a.score(x[0].iloc[:, 0:numcols]) for x in test_ds]
def doMolIsoF(molNdx):
    """Tune, test and persist an IsolationForest model for one molecule.

    Molecules whose name is in the module-level ``done`` collection are
    skipped. For every entry of the module-level ``datasetPortion`` the
    routine:

    1. loads a held-out test set (20% of the portion) and a training pool
       (80% of the portion, excluding the test paths) via
       ``cu.loadDescriptors``;
    2. for each ``n_estimators`` candidate in the module-level ``params``,
       runs 5-fold cross-validation, fitting on the active rows only, and
       records mean/std of the AUC and of the enrichment factors at 1% and 5%;
    3. picks the candidate with the highest mean EF(1%), refits on the active
       rows of the whole training pool, scores the test set, plots ROC curves
       and records test metrics and fit/score time.

    After the loop the cross-validation and test results are written to
    ``results/results_isoF_<molName>.txt`` and a model fitted on the test and
    training records of the last completed portion is pickled to
    ``<molName>_IsoForest.pkl``.

    Failure handling: a fold that raises contributes zero scores; a portion
    whose data cannot be loaded is recorded with zero scores and skipped; a
    failure while iterating the candidates is recorded with zero scores and
    the portion continues with whatever candidates were evaluated.

    Records returned by ``cu`` are used here as indexable items whose element
    ``[0]`` is a DataFrame of conformer descriptors and whose element ``[2]``
    is the truth label; the last two DataFrame columns are treated as
    non-feature columns (one of them is named ``"active"``).

    Args:
        molNdx: Index into the module-level ``molfiles`` sequence; each entry
            is used as ``(descriptor_path, molecule_name)``.

    Returns:
        None. Results are printed and written to disk.
    """
    xvalResults = []
    portionResults = []

    molName = molfiles[molNdx][1]
    if molName in done:
        return

    # (test_ds, n_fold_ds, numcols, best_estimators) of the most recent
    # portion that ran to completion; feeds the model saved after the loop.
    final_inputs = None

    for portion in datasetPortion:
        # Reset per portion so a failed load cannot re-report stale rows.
        componentResults = []
        param = None

        try:
            descTypes = ["usr", "esh", "es5"]
            descType = descTypes[2]
            if portion <= 1:
                print("Loading " + str(portion * 100) + "% of " + molName)
            else:
                print("Loading " + str(portion) + " actives from " + molName)

            # NOTE(review): "SEPERATE" here vs "SEPARATE" in the second call
            # -- confirm which spelling cu.loadDescriptors actually expects.
            (test_ds,
             test_paths) = cu.loadDescriptors(molfiles[molNdx][0],
                                              portion * 0.2,
                                              dtype=descType,
                                              active_decoy_ratio=-1,
                                              selection_policy="RANDOM",
                                              return_type="SEPERATE")
            # The last two columns are not descriptor features.
            numcols = test_ds[0][0].shape[1] - 2

            folds = 5

            (n_fold_ds,
             n_fold_paths) = cu.loadDescriptors(molfiles[molNdx][0],
                                                portion * 0.8,
                                                dtype=descType,
                                                active_decoy_ratio=-1,
                                                selection_policy="RANDOM",
                                                return_type="SEPARATE",
                                                exclusion_list=test_paths)

            (folds_list, excl_list) = cu.split(n_fold_ds,
                                               folds,
                                               policy="RANDOM")
        except Exception as exc:
            # Without data nothing below can run for this portion.
            print("Loading data failed for " + molName + " at portion " +
                  str(portion) + ": " + repr(exc))
            componentResults.append(
                (molName, portion, param, 0, 0, 0, 0, 0, 0))
            xvalResults.extend(componentResults)
            continue

        try:
            for param in params:
                foldResults = []

                for fold in range(0, folds):
                    try:
                        val_ds = folds_list[fold]

                        # Training data = frames of every fold but `fold`.
                        train_ds = None

                        for i in range(0, folds):
                            if i != fold:
                                if train_ds is None:
                                    train_ds = [r[0] for r in folds_list[i]]
                                else:
                                    # NOTE(review): appends a nested list
                                    # rather than extending; presumably
                                    # cu.joinDataframes flattens it -- verify.
                                    train_ds.append(
                                        [r[0] for r in folds_list[i]])

                        train_ds = cu.joinDataframes(train_ds)

                        numcols = train_ds.shape[1] - 2

                        clf = IsolationForest(n_estimators=param, n_jobs=-1)

                        # One-class setup: fit on the active rows only.
                        train_a = train_ds[train_ds["active"] == True]

                        clf.fit(train_a.iloc[:, 0:numcols], None)

                        results = pd.DataFrame()

                        # A record's score is the best value over its rows.
                        results["score"] = [
                            max(
                                clf.decision_function(
                                    x[0].iloc[:, 0:numcols]).ravel())
                            for x in val_ds
                        ]

                        results["truth"] = [x[2] for x in val_ds]

                        auc = eval.plotSimROC(np.array(results["truth"]),
                                              np.array([results["score"]]), "",
                                              None)
                        mean_ef = eval.getMeanEFs(np.array(results["truth"]),
                                                  np.array([results["score"]]),
                                                  eval_method="sim")
                        foldResults.append((auc, mean_ef))
                    except Exception as exc:
                        # Best effort: a failed fold counts as zero scores.
                        print("Fold " + str(fold) + " failed: " + repr(exc))
                        foldResults.append((0, {0.01: 0, 0.05: 0}))

                print("X-Validation results: ")
                print(foldResults)

                if len(foldResults) > 0:
                    fold_aucs = [x[0] for x in foldResults]
                    fold_ef_1pc = [x[1][0.01] for x in foldResults]
                    fold_ef_5pc = [x[1][0.05] for x in foldResults]

                    mean_auc_sim = np.mean(fold_aucs)
                    # Spread across folds; the std of the mean (a scalar),
                    # as computed previously, is always 0.
                    std_auc_sim = np.std(fold_aucs)
                    mean_mean_ef_1pc = np.mean(fold_ef_1pc)
                    std_mean_ef_1pc = np.std(fold_ef_1pc)
                    mean_mean_ef_5pc = np.mean(fold_ef_5pc)
                    std_mean_ef_5pc = np.std(fold_ef_5pc)

                    print("mean AUC=" + str(mean_auc_sim) + ", std=" +
                          str(std_auc_sim) + ", mean EF(1%)=" +
                          str(mean_mean_ef_1pc) + ", std=" +
                          str(std_mean_ef_1pc) + ", mean EF(5%)=" +
                          str(mean_mean_ef_5pc) + ", std=" +
                          str(std_mean_ef_5pc))

                    componentResults.append(
                        (molName, portion, param, mean_auc_sim, std_auc_sim,
                         mean_mean_ef_1pc, std_mean_ef_1pc, mean_mean_ef_5pc,
                         std_mean_ef_5pc))
                else:
                    print(
                        "X-Validation returned no results. Skipping training..."
                    )
                    componentResults.append(
                        (molName, portion, param, 0, 0, 0, 0, 0, 0))

        except Exception as exc:
            # Record the failing candidate and carry on with what was scored.
            print("Cross-validation failed for " + molName + " at portion " +
                  str(portion) + ": " + repr(exc))
            componentResults.append(
                (molName, portion, param, 0, 0, 0, 0, 0, 0))

        xvalResults.extend(componentResults)

        if not componentResults:
            # No candidates in `params`: nothing to select or train.
            print("No cross-validation results for portion " + str(portion) +
                  "; skipping test evaluation.")
            continue

        # Find best score. Index 5 of each row is the mean EF(1%).
        # NOTE(review): the previous variable name suggested AUC-based
        # selection -- confirm EF(1%) is the intended criterion.
        mean_ef_1pc_scores = [x[5] for x in componentResults]

        best_estimators = params[np.argmax(mean_ef_1pc_scores)]
        print("Best-score estimators no.: " + str(best_estimators))

        train_ds = cu.lumpRecords(n_fold_ds)

        t0 = time.time()
        clf = IsolationForest(n_estimators=best_estimators, n_jobs=-1)

        train_a = train_ds[train_ds["active"] == True]

        clf.fit(train_a.iloc[:, 0:numcols], None)

        results = pd.DataFrame()

        results["score"] = [
            max(clf.decision_function((x[0].iloc[:, 0:numcols])))
            for x in test_ds
        ]
        results["truth"] = [x[2] for x in test_ds]

        # Each ROC plot is produced once; an identical second plotSimROC
        # call only regenerated the same file.
        auc = eval.plotSimROC(
            results["truth"], [results["score"]],
            molName + "[IsoForest, " + str(portion * 100) + "%]",
            molName + "_IsoForest_sim_" + str(portion * 100) + ".pdf")
        auc_rank = eval.plotRankROC(
            results["truth"], [results["score"]],
            molName + "[IsoForest, " + str(portion * 100) + "%]",
            molName + "_IsoForest_rank_" + str(portion * 100) + ".pdf")

        mean_ef = eval.getMeanEFs(np.array(results["truth"]),
                                  np.array([results["score"]]),
                                  eval_method="sim")

        print("AUC(Sim)=" + str(auc))
        print("EF: ", mean_ef)

        t1 = time.time()
        print("Time taken = " + str(t1 - t0))

        portionResults.append((molName, portion, best_estimators, auc,
                               auc_rank, mean_ef, t1 - t0))

        final_inputs = (test_ds, n_fold_ds, numcols, best_estimators)

    print(xvalResults)
    print(portionResults)

    with open("results/results_isoF_" + molName + ".txt", 'w') as f1:
        print(xvalResults, file=f1)
        print(portionResults, file=f1)

    if final_inputs is None:
        # No portion completed, so there is nothing to fit or save.
        print("No portion completed for " + molName + "; model not saved")
        return

    (test_ds, n_fold_ds, numcols, best_estimators) = final_inputs

    # NOTE(review): same nested-list append as in the fold loop -- verify
    # that cu.joinDataframes accepts it.
    full_train_dss = [x[0] for x in test_ds]
    full_train_dss.append([x[0] for x in n_fold_ds])
    full_train_ds = cu.joinDataframes(full_train_dss)
    clf = IsolationForest(n_estimators=best_estimators, n_jobs=-1)

    # NOTE(review): the evaluated models are fitted on active rows only,
    # whereas this saved model is fitted on every row -- confirm intent.
    G_a = clf.fit(full_train_ds.iloc[:, 0:numcols],
                  full_train_ds.iloc[:, numcols])

    import pickle
    with open(molName + "_IsoForest.pkl", "wb") as mdlf:
        pickle.dump(G_a, mdlf)

    print("Saved model for " + molName + " to disk")