def doMolGMM(molNdx):
    """Train and evaluate an MLPRegressor screening model for one target.

    For every dataset portion in the module-level ``datasetPortion``:
      * loads 20% of the data as a held-out test set and 80% for 3-fold
        cross-validation ("esh" descriptors),
      * cross-validates an MLPRegressor (actives regressed towards 100,
        decoys towards 0), reporting mean/std sim-AUC and EF(1%)/EF(5%),
      * retrains on the whole 80% split, scores the held-out test set, and
        plots sim/rank ROC curves under ``results/``,
      * appends all results to ``results/results_ann_<molName>.txt``.

    Relies on module-level globals: molfiles, datasetPortion, cu, eval,
    MLPRegressor, pd, np, time.

    :param molNdx: index into the module-level ``molfiles`` list.
    """
    componentResults = []  # cross-validation summaries, one row per portion
    portionResults = []    # held-out test results, one row per portion
    molName = molfiles[molNdx][1]
    for portion in datasetPortion:
        t0 = time.time()
        descTypes = ["usr", "esh", "es5"]
        descType = descTypes[1]  # "esh" descriptors
        if portion <= 1:
            print("Loading " + str(portion * 100) + "% of " + molfiles[molNdx][1])
        else:
            print("Loading " + str(portion) + " actives from " + molfiles[molNdx][1])
        # 20% held-out test set.
        # NOTE(review): "SEPERATE" here vs "SEPARATE" below — both spellings
        # are passed to cu.loadDescriptors in this file; confirm which one(s)
        # the loader actually recognises.
        (test_ds, test_paths) = cu.loadDescriptors(molfiles[molNdx][0],
                                                   portion * 0.2,
                                                   dtype=descType,
                                                   active_decoy_ratio=-1,
                                                   selection_policy="RANDOM",
                                                   return_type="SEPERATE")
        numcols = test_ds[0][0].shape[1] - 2  # last two columns are metadata
        folds = 3
        # Remaining 80% for n-fold cross-validation, excluding test molecules.
        (n_fold_ds, n_fold_paths) = cu.loadDescriptors(molfiles[molNdx][0],
                                                       portion * 0.8,
                                                       dtype=descType,
                                                       active_decoy_ratio=-1,
                                                       selection_policy="RANDOM",
                                                       return_type="SEPARATE",
                                                       exclusion_list=test_paths)
        (folds_list, excl_list) = cu.split(n_fold_ds, folds, policy="RANDOM")
        foldResults = []
        for fold in range(0, folds):
            val_ds = folds_list[fold]
            train_ds = None
            for i in range(0, folds):
                if i != fold:
                    if train_ds is None:
                        train_ds = [r[0] for r in folds_list[i]]
                    else:
                        # NOTE(review): append() adds the whole list as one
                        # element (extend() was probably intended); kept as-is
                        # on the assumption cu.joinDataframes flattens nested
                        # lists — verify against its implementation.
                        train_ds.append([r[0] for r in folds_list[i]])
            train_ds = cu.joinDataframes(train_ds)
            numcols = train_ds.shape[1] - 2
            ann = MLPRegressor(max_iter=1000, early_stopping=True)
            # Actives are regressed towards 100, decoys towards 0.
            ann.fit(train_ds.iloc[:, 0:numcols],
                    train_ds["active"].astype(int) * 100)
            results = pd.DataFrame()
            # A record's score is the best prediction over its conformers.
            results["score"] = [
                max(ann.predict(x[0].iloc[:, 0:numcols])) for x in val_ds
            ]
            results["truth"] = [x[2] for x in val_ds]
            auc = eval.plotSimROC(np.array(results["truth"]),
                                  np.array([results["score"]]), "", None)
            mean_ef = eval.getMeanEFs(np.array(results["truth"]),
                                      np.array([results["score"]]))
            foldResults.append((auc, mean_ef))
        print("X-Validation results: ")
        print(foldResults)
        if len(foldResults) > 0:
            fold_aucs = [x[0] for x in foldResults]
            mean_auc_sim = np.mean(fold_aucs)
            # BUGFIX: was np.std(np.mean(fold_aucs)) — the std of a single
            # scalar, which is always 0. Take the std over the fold AUCs.
            std_auc_sim = np.std(fold_aucs)
            mean_mean_ef_1pc = np.mean([x[1][0.01] for x in foldResults])
            std_mean_ef_1pc = np.std([x[1][0.01] for x in foldResults])
            mean_mean_ef_5pc = np.mean([x[1][0.05] for x in foldResults])
            std_mean_ef_5pc = np.std([x[1][0.05] for x in foldResults])
            print("mean AUC=" + str(mean_auc_sim) + ", std=" + str(std_auc_sim) +
                  ", mean EF(1%)=" + str(mean_mean_ef_1pc) + ", std=" +
                  str(std_mean_ef_1pc) + ", mean EF(5%)=" +
                  str(mean_mean_ef_5pc) + ", std=" + str(std_mean_ef_5pc))
            componentResults.append(
                (molName, portion, mean_auc_sim, std_auc_sim, mean_mean_ef_1pc,
                 std_mean_ef_1pc, mean_mean_ef_5pc, std_mean_ef_5pc))
        else:
            print("X-Validation returned no results. Skipping training...")
            componentResults.append((molName, portion, 0, 0, 0, 0, 0, 0))
        # Retrain on the whole 80% split and evaluate on the held-out 20%.
        train_ds = cu.lumpRecords(n_fold_ds)
        ann = MLPRegressor(max_iter=1000, early_stopping=True)
        ann.fit(train_ds.iloc[:, 0:numcols],
                train_ds["active"].astype(int) * 100)
        results = pd.DataFrame()
        results["score"] = [
            max(ann.predict(x[0].iloc[:, 0:numcols])) for x in test_ds
        ]
        results["truth"] = [x[2] for x in test_ds]
        auc_sim = eval.plotSimROC(
            results["truth"], [results["score"]],
            molName + "[ANN, " + str(portion * 100) + "%]",
            "results/" + molName + "_ANN_sim_" + str(portion * 100) + ".pdf")
        auc_rank = eval.plotRankROC(
            results["truth"], [results["score"]],
            molName + "[ANN-" + str(portion * 100) + "%]",
            "results/" + molName + "_ANN_rank_" + str(portion * 100) + ".pdf")
        mean_ef = eval.getMeanEFs(np.array(results["truth"]),
                                  np.array([results["score"]]))
        # BUGFIX: was printing `auc` (the last cross-validation fold's AUC)
        # instead of the held-out test AUC computed just above.
        print("AUC(Sim)=" + str(auc_sim))
        print("EF: ", mean_ef)
        t1 = time.time()
        portionResults.append(
            (molName, portion, auc_sim, auc_rank, mean_ef, (t1 - t0)))
        print("Time taken = " + str(t1 - t0))
        print(componentResults)
        print(portionResults)
        # Context manager guarantees the results file is closed on error.
        with open("results/results_ann_" + molName + ".txt", 'w') as f1:
            print(componentResults, file=f1)
            print(portionResults, file=f1)
eval_method="sim") foldResults.append(auc) print("X-Validation results: ") print(foldResults) if len(foldResults) > 0: print("mean F1=" + str(np.mean(foldResults)) + ", std=" + str(np.std(foldResults))) componentResults.append((auc, mean_ef)) else: print("X-Validation returned no results. Skipping training...") componentResults.append((0, 0)) train_ds = cu.lumpRecords(n_fold_ds) svm1c = OneClassSVM() train_a = train_ds[train_ds["active"] == True] #ann.fit(train_ds.iloc[:, 0:numcols], train_ds.iloc[:, numcols]) svm1c.fit(train_a.iloc[:, 0:numcols], None) # G_a = GaussianMixture(n_components=best_components, covariance_type="full").fit(train_ds.iloc[:, 0:numcols], # train_ds.iloc[:, numcols]) results = pd.DataFrame() results["score"] = [ max(svm1c.score_samples(x[0].iloc[:, 0:numcols])) for x in test_ds ] #results["a_score"] = [G_a.score(x[0].iloc[:, 0:numcols]) for x in test_ds]
def doMolIsoF(molNdx):
    """Train and evaluate a one-class IsolationForest screening model.

    For every dataset portion in the module-level ``datasetPortion``:
      * loads 20% test / 80% 5-fold cross-validation splits of "es5"
        descriptors,
      * grid-searches ``n_estimators`` over the module-level ``params`` via
        cross-validation, fitting on active records only,
      * retrains with the best parameter, scores the held-out test set, and
        writes results to ``results/results_isoF_<molName>.txt``,
      * finally refits on all loaded records and pickles the model to
        ``<molName>_IsoForest.pkl``.

    Relies on module-level globals: molfiles, datasetPortion, done, params,
    cu, eval, IsolationForest, pd, np, time.

    :param molNdx: index into the module-level ``molfiles`` list.
    """
    componentResults = []
    xvalResults = []     # accumulated cross-validation rows across portions
    portionResults = []  # held-out test rows, one per portion
    molName = molfiles[molNdx][1]
    if molName in done:  # target already processed — nothing to do
        return
    for portion in datasetPortion:
        try:
            descTypes = ["usr", "esh", "es5"]
            descType = descTypes[2]  # "es5" descriptors
            if portion <= 1:
                print("Loading " + str(portion * 100) + "% of " +
                      molfiles[molNdx][1])
            else:
                print("Loading " + str(portion) + " actives from " +
                      molfiles[molNdx][1])
            # 20% held-out test set.
            # NOTE(review): "SEPERATE" here vs "SEPARATE" below — confirm
            # which spelling(s) cu.loadDescriptors recognises.
            (test_ds, test_paths) = cu.loadDescriptors(
                molfiles[molNdx][0],
                portion * 0.2,
                dtype=descType,
                active_decoy_ratio=-1,
                selection_policy="RANDOM",
                return_type="SEPERATE")
            numcols = test_ds[0][0].shape[1] - 2  # last two cols are metadata
            folds = 5
            # Remaining 80% for cross-validation, excluding test molecules.
            (n_fold_ds, n_fold_paths) = cu.loadDescriptors(
                molfiles[molNdx][0],
                portion * 0.8,
                dtype=descType,
                active_decoy_ratio=-1,
                selection_policy="RANDOM",
                return_type="SEPARATE",
                exclusion_list=test_paths)
            (folds_list, excl_list) = cu.split(n_fold_ds, folds,
                                               policy="RANDOM")
            componentResults = []
            # Grid-search over the candidate n_estimators values.
            for param in params:
                foldResults = []
                for fold in range(0, folds):
                    try:
                        val_ds = folds_list[fold]
                        train_ds = None
                        for i in range(0, folds):
                            if i != fold:
                                if train_ds is None:
                                    train_ds = [r[0] for r in folds_list[i]]
                                else:
                                    # NOTE(review): append() adds the list as
                                    # one element (extend() likely intended);
                                    # kept as-is assuming cu.joinDataframes
                                    # flattens nested lists — verify.
                                    train_ds.append(
                                        [r[0] for r in folds_list[i]])
                        train_ds = cu.joinDataframes(train_ds)
                        numcols = train_ds.shape[1] - 2
                        clf = IsolationForest(n_estimators=param, n_jobs=-1)
                        # One-class setting: fit on active records only.
                        train_a = train_ds[train_ds["active"] == True]
                        clf.fit(train_a.iloc[:, 0:numcols], None)
                        results = pd.DataFrame()
                        # A record's score is the best (least anomalous)
                        # decision value over its conformers.
                        results["score"] = [
                            max(clf.decision_function(
                                x[0].iloc[:, 0:numcols]).ravel())
                            for x in val_ds
                        ]
                        results["truth"] = [x[2] for x in val_ds]
                        auc = eval.plotSimROC(np.array(results["truth"]),
                                              np.array([results["score"]]),
                                              "", None)
                        mean_ef = eval.getMeanEFs(
                            np.array(results["truth"]),
                            np.array([results["score"]]),
                            eval_method="sim")
                        foldResults.append((auc, mean_ef))
                    # BUGFIX: was a bare `except:` which also swallowed
                    # KeyboardInterrupt/SystemExit; now reports the failure
                    # while keeping the best-effort zero-result fallback.
                    except Exception as exc:
                        print("Fold failed: " + repr(exc))
                        foldResults.append((0, {0.01: 0, 0.05: 0}))
                print("X-Validation results: ")
                print(foldResults)
                if len(foldResults) > 0:
                    fold_aucs = [x[0] for x in foldResults]
                    mean_auc_sim = np.mean(fold_aucs)
                    # BUGFIX: was np.std(np.mean(fold_aucs)) — the std of a
                    # scalar, which is always 0.
                    std_auc_sim = np.std(fold_aucs)
                    mean_mean_ef_1pc = np.mean(
                        [x[1][0.01] for x in foldResults])
                    std_mean_ef_1pc = np.std(
                        [x[1][0.01] for x in foldResults])
                    mean_mean_ef_5pc = np.mean(
                        [x[1][0.05] for x in foldResults])
                    std_mean_ef_5pc = np.std(
                        [x[1][0.05] for x in foldResults])
                    print("mean AUC=" + str(mean_auc_sim) + ", std=" +
                          str(std_auc_sim) + ", mean EF(1%)=" +
                          str(mean_mean_ef_1pc) + ", std=" +
                          str(std_mean_ef_1pc) + ", mean EF(5%)=" +
                          str(mean_mean_ef_5pc) + ", std=" +
                          str(std_mean_ef_5pc))
                    componentResults.append(
                        (molName, portion, param, mean_auc_sim, std_auc_sim,
                         mean_mean_ef_1pc, std_mean_ef_1pc, mean_mean_ef_5pc,
                         std_mean_ef_5pc))
                else:
                    print(
                        "X-Validation returned no results. Skipping training..."
                    )
                    componentResults.append(
                        (molName, portion, param, 0, 0, 0, 0, 0, 0))
        # BUGFIX: was a bare `except:`; now targeted and reported.
        # NOTE(review): `param` is undefined here if the failure happened
        # before the grid-search loop started — pre-existing hazard.
        except Exception as exc:
            print("Portion failed: " + repr(exc))
            componentResults.append(
                (molName, portion, param, 0, 0, 0, 0, 0, 0))
        xvalResults.extend(componentResults)
        # Pick the best parameter from the cross-validation summaries.
        # NOTE(review): index 5 is mean EF(1%), not an AUC, despite the
        # variable name — confirm the intended selection metric.
        aucs_rank = [x[5] for x in componentResults]
        best_estimators = params[np.argmax(aucs_rank)]
        print("Best-score estimators no.: " + str(best_estimators))
        # Retrain on the whole 80% split and evaluate on the held-out 20%.
        train_ds = cu.lumpRecords(n_fold_ds)
        t0 = time.time()
        clf = IsolationForest(n_estimators=best_estimators, n_jobs=-1)
        train_a = train_ds[train_ds["active"] == True]
        clf.fit(train_a.iloc[:, 0:numcols], None)
        results = pd.DataFrame()
        results["score"] = [
            max(clf.decision_function((x[0].iloc[:, 0:numcols])))
            for x in test_ds
        ]
        results["truth"] = [x[2] for x in test_ds]
        # NOTE(review): unlike doMolGMM, these PDFs are written to the
        # current directory, not results/ — confirm that is intended.
        auc = eval.plotSimROC(
            results["truth"], [results["score"]],
            molName + "[IsoForest, " + str(portion * 100) + "%]",
            molName + "_IsoForest_sim_" + str(portion * 100) + ".pdf")
        auc_rank = eval.plotRankROC(
            results["truth"], [results["score"]],
            molName + "[IsoForest, " + str(portion * 100) + "%]",
            molName + "_IsoForest_rank_" + str(portion * 100) + ".pdf")
        # BUGFIX: removed a second, byte-identical eval.plotSimROC call that
        # redundantly recomputed the same AUC and re-wrote the same PDF.
        mean_ef = eval.getMeanEFs(np.array(results["truth"]),
                                  np.array([results["score"]]),
                                  eval_method="sim")
        print("AUC(Sim)=" + str(auc))
        print("EF: ", mean_ef)
        t1 = time.time()
        print("Time taken = " + str(t1 - t0))
        portionResults.append((molName, portion, best_estimators, auc,
                               auc_rank, mean_ef, t1 - t0))
        print(xvalResults)
        print(portionResults)
        # Context manager guarantees the results file is closed on error.
        with open("results/results_isoF_" + molName + ".txt", 'w') as f1:
            print(xvalResults, file=f1)
            print(portionResults, file=f1)
        # Refit on everything that was loaded and persist the model.
        full_train_dss = [x[0] for x in test_ds]
        # NOTE(review): append() adds a list as one element (extend() likely
        # intended); kept as-is assuming cu.joinDataframes flattens — verify.
        full_train_dss.append([x[0] for x in n_fold_ds])
        full_train_ds = cu.joinDataframes(full_train_dss)
        clf = IsolationForest(n_estimators=best_estimators, n_jobs=-1)
        # NOTE(review): this final fit uses ALL records plus a label column,
        # whereas every other fit above uses actives only with y=None —
        # confirm this asymmetry is intentional.
        G_a = clf.fit(full_train_ds.iloc[:, 0:numcols],
                      full_train_ds.iloc[:, numcols])
        import pickle
        with open(molName + "_IsoForest.pkl", "wb") as mdlf:
            pickle.dump(G_a, mdlf)
        print("Saved model for " + molName + " to disk")