def doMolIsoF(molNdx):
    """Cross-validate, test and pickle an Isolation Forest model for one target.

    For every entry of the module-level ``datasetPortion`` the function:

    1. loads a hold-out test set (``portion * 0.2``) and a cross-validation
       set (``portion * 0.8``, excluding the test paths) of "es5" descriptors
       through ``cu.loadDescriptors``;
    2. runs 5-fold cross-validation for every ``n_estimators`` candidate in
       the module-level ``params``, fitting on the first row of each training
       record (called "LEC" in the original comments), restricted to rows
       whose ``"active"`` column is true;
    3. refits with the selected candidate on the whole cross-validation set,
       scores the hold-out set and produces ROC plots / enrichment factors
       through the ``eval`` module.

    Afterwards the collected results are written to
    ``results/results_isoF_<molName>.txt`` and a model fitted on the actives
    of the last completed portion is pickled to
    ``results/<molName>_IsoForest.pkl``.

    NOTE(review): the indentation of the original source was lost; writing
    the results file and the pickled model once, after the portion loop, is
    the reconstruction chosen here.

    Args:
        molNdx: Index into the module-level ``molfiles`` sequence. Element
            ``[0]`` of the entry is passed to ``cu.loadDescriptors`` and
            element ``[1]`` is used as the target name.

    Returns:
        None. Returns immediately when the target name is already contained
        in the module-level ``done`` collection.
    """
    xvalResults = []
    portionResults = []
    molName = molfiles[molNdx][1]
    if molName in done:
        return

    descType = "es5"
    folds = 5
    # Inputs of the last portion that completed; used for the pickled model.
    final_inputs = None

    for portion in datasetPortion:
        componentResults = []
        # Defined up front so the failure row below never hits an unbound name.
        param = None
        try:
            if portion <= 1:
                print("Loading " + str(portion * 100) + "% of " + molfiles[molNdx][1])
            else:
                print("Loading " + str(portion) + " actives from " + molfiles[molNdx][1])
            # NOTE(review): "SEPERATE" is spelled differently from the
            # "SEPARATE" used for the second load; confirm which value
            # cu.loadDescriptors expects before normalising.
            (test_ds, test_paths) = cu.loadDescriptors(
                molfiles[molNdx][0], portion * 0.2, dtype=descType,
                active_decoy_ratio=-1, selection_policy="RANDOM",
                return_type="SEPERATE")
            numcols = test_ds[0][0].shape[1] - 2
            (n_fold_ds, n_fold_paths) = cu.loadDescriptors(
                molfiles[molNdx][0], portion * 0.8, dtype=descType,
                active_decoy_ratio=-1, selection_policy="RANDOM",
                return_type="SEPARATE", exclusion_list=test_paths)
            (folds_list, excl_list) = cu.split(n_fold_ds, folds, policy="RANDOM")

            for param in params:
                foldResults = []
                for fold in range(folds):
                    try:
                        val_ds = folds_list[fold]
                        # Flat list holding the first row of every record in
                        # the training folds (the original nested the later
                        # folds as sub-lists).
                        train_frames = [
                            r[0].iloc[0:1, :]
                            for i in range(folds)
                            if i != fold
                            for r in folds_list[i]
                        ]
                        train_ds = cu.joinDataframes(train_frames)
                        numcols = train_ds.shape[1] - 2
                        clf = IsolationForest(n_estimators=param, n_jobs=-1)
                        # Element-wise comparison on purpose: builds a row mask.
                        train_a = train_ds[train_ds["active"] == True]  # noqa: E712
                        clf.fit(train_a.iloc[:, 0:numcols])
                        results = pd.DataFrame()
                        # A record is scored by its best-scoring row.
                        results["score"] = [
                            max(clf.decision_function(
                                x[0].iloc[:, 0:numcols]).ravel())
                            for x in val_ds
                        ]
                        results["truth"] = [x[2] for x in val_ds]
                        truth = np.array(results["truth"])
                        scores = np.array([results["score"]])
                        auc = eval.plotSimROC(truth, scores, "", None)
                        mean_ef = eval.getMeanEFs(truth, scores, eval_method="sim")
                        foldResults.append((auc, mean_ef))
                    except Exception as exc:
                        # Best effort: a failed fold counts as a zero score,
                        # but the reason is no longer hidden.
                        print("Fold " + str(fold) + " failed: " + repr(exc))
                        foldResults.append((0, {0.01: 0, 0.05: 0}))
                print("X-Validation results: ")
                print(foldResults)
                if foldResults:
                    fold_aucs = [x[0] for x in foldResults]
                    fold_ef_1pc = [x[1][0.01] for x in foldResults]
                    fold_ef_5pc = [x[1][0.05] for x in foldResults]
                    mean_auc_sim = np.mean(fold_aucs)
                    # Spread over the folds; the original took the std of the
                    # mean, which is always 0.
                    std_auc_sim = np.std(fold_aucs)
                    mean_mean_ef_1pc = np.mean(fold_ef_1pc)
                    std_mean_ef_1pc = np.std(fold_ef_1pc)
                    mean_mean_ef_5pc = np.mean(fold_ef_5pc)
                    std_mean_ef_5pc = np.std(fold_ef_5pc)
                    print("mean AUC=" + str(mean_auc_sim) + ", std=" +
                          str(std_auc_sim) + ", mean EF(1%)=" +
                          str(mean_mean_ef_1pc) + ", std=" +
                          str(std_mean_ef_1pc) + ", mean EF(5%)=" +
                          str(mean_mean_ef_5pc) + ", std=" +
                          str(std_mean_ef_5pc))
                    componentResults.append(
                        (molName, portion, param, mean_auc_sim, std_auc_sim,
                         mean_mean_ef_1pc, std_mean_ef_1pc, mean_mean_ef_5pc,
                         std_mean_ef_5pc))
                else:
                    print("X-Validation returned no results. Skipping training...")
                    componentResults.append(
                        (molName, portion, param, 0, 0, 0, 0, 0, 0))
        except Exception as exc:
            # Loading/splitting (or the statistics) failed: record the failure
            # and move on instead of continuing with stale or undefined data.
            print("Cross-validation failed for " + molName + ", portion " +
                  str(portion) + ": " + repr(exc))
            componentResults.append((molName, portion, param, 0, 0, 0, 0, 0, 0))
            xvalResults.extend(componentResults)
            continue

        xvalResults.extend(componentResults)
        if not componentResults:
            # No candidates in `params`; nothing to select or train.
            continue

        # Index 5 of each row is the mean EF(1%); that is the selection
        # criterion the original code used (under the name `aucs_rank`).
        selection_scores = [row[5] for row in componentResults]
        best_estimators = params[np.argmax(selection_scores)]
        print("Best-score estimators no.: " + str(best_estimators))

        train_ds = cu.joinDataframes([r[0].iloc[0:1, :] for r in n_fold_ds])
        t0 = time.time()
        clf = IsolationForest(n_estimators=best_estimators, n_jobs=-1)
        train_a = train_ds[train_ds["active"] == True]  # noqa: E712
        clf.fit(train_a.iloc[:, 0:numcols])
        results = pd.DataFrame()
        results["score"] = [
            max(clf.decision_function(x[0].iloc[:, 0:numcols]))
            for x in test_ds
        ]
        results["truth"] = [x[2] for x in test_ds]
        plot_title = molName + "[IsoForest, " + str(portion * 100) + "%]"
        auc = eval.plotSimROC(
            results["truth"], [results["score"]], plot_title,
            "results/" + molName + "_IsoForest_sim_" + str(portion * 100) + ".pdf")
        auc_rank = eval.plotRankROC(
            results["truth"], [results["score"]], plot_title,
            "results/" + molName + "_IsoForest_rank_" + str(portion * 100) + ".pdf")
        # NOTE(review): second similarity plot, written to the working
        # directory instead of results/. Kept so the produced files do not
        # change; looks like a leftover.
        auc = eval.plotSimROC(
            results["truth"], [results["score"]], plot_title,
            molName + "_IsoForest_sim_" + str(portion * 100) + ".pdf")
        mean_ef = eval.getMeanEFs(np.array(results["truth"]),
                                  np.array([results["score"]]),
                                  eval_method="sim")
        print("AUC(Sim)=" + str(auc))
        print("EF: ", mean_ef)
        t1 = time.time()
        print("Time taken = " + str(t1 - t0))
        portionResults.append((molName, portion, best_estimators, auc,
                               auc_rank, mean_ef, t1 - t0))
        final_inputs = (test_ds, n_fold_ds, numcols, best_estimators)

    print(xvalResults)
    print(portionResults)
    with open("results/results_isoF_" + molName + ".txt", 'w') as f1:
        print(xvalResults, file=f1)
        print(portionResults, file=f1)

    if final_inputs is None:
        print("No portion completed for " + molName + "; no model saved.")
        return

    (test_ds, n_fold_ds, numcols, best_estimators) = final_inputs
    full_train_dss = [x[0].iloc[0:1, :] for x in test_ds]
    full_train_dss.extend(x[0].iloc[0:1, :] for x in n_fold_ds)
    full_train_ds = cu.joinDataframes(full_train_dss)
    # IsolationForest.fit ignores `y`, so passing the label column (as the
    # original did) fitted the saved model on actives AND decoys. Restrict to
    # actives so the pickled model matches the models evaluated above.
    full_train_a = full_train_ds[full_train_ds["active"] == True]  # noqa: E712
    clf = IsolationForest(n_estimators=best_estimators, n_jobs=-1)
    G_a = clf.fit(full_train_a.iloc[:, 0:numcols])
    import pickle
    with open("results/" + molName + "_IsoForest.pkl", "wb") as mdlf:
        pickle.dump(G_a, mdlf)
    print("Saved model for " + molName + " to disk")
# Script-level sweep over every target from index 2 onwards and every
# configured dataset portion. For each combination a hold-out test set
# (portion * 0.2) and a cross-validation set (portion * 0.8, excluding the
# test paths) of "esh" descriptors are loaded.
for molNdx in range(2, len(molfiles)):
    for portion in datasetPortion:
        t0 = time.time()
        descTypes = ["usr", "esh", "es5"]
        descType = descTypes[1]

        if portion <= 1:
            print("Loading " + str(portion * 100) + "% of " + molfiles[molNdx][1])
        else:
            print("Loading " + str(portion) + " actives from " + molfiles[molNdx][1])

        # NOTE(review): "SEPERATE" differs in spelling from the "SEPARATE"
        # used for the second load; confirm which one cu.loadDescriptors expects.
        test_ds, test_paths = cu.loadDescriptors(
            molfiles[molNdx][0],
            portion * 0.2,
            dtype=descType,
            active_decoy_ratio=-1,
            selection_policy="RANDOM",
            return_type="SEPERATE",
        )
        # Number of columns of the first record's frame, minus two.
        numcols = test_ds[0][0].shape[1] - 2
        folds = 3
        componentResults = []

        n_fold_ds, n_fold_paths = cu.loadDescriptors(
            molfiles[molNdx][0],
            portion * 0.8,
            dtype=descType,
            active_decoy_ratio=-1,
            selection_policy="RANDOM",
            return_type="SEPARATE",
            exclusion_list=test_paths,
        )
molfiles[molNdx][1]) else: print("Loading " + str(portion) + " actives from " + molfiles[molNdx][1]) auc_esh = 0 auc_rank_esh = 0 mean_ef_esh = 0 try: print("Processing Electroshape 5-d - " + str(portion)) t0 = time.time() (sim_es5_ds, sim_paths_es5) = cu.loadDescriptors(molfiles[molNdx][0], portion, dtype="es5", active_decoy_ratio=-1, selection_policy="SEQUENTIAL", return_type="SEPARATE") simobj_es5 = scls.USRMoleculeSimParallel(sim_es5_ds, sim_paths_es5) usr_results_es5 = np.array(simobj_es5.runScreening(50)).transpose() #plotSimROC(sim_es5_ds, usr_results_es5, "es5_plot_"+molfiles[molNdx][1]+".pdf") # (auc_es5, mean_ef_es5) = eval.plotSimROC([l[2] for l in sim_ds], usr_results_es5, # molName + "ElectroShape 5-d results", # "es5_plot_"+molName + ".pdf") auc_es5 = eval.plotSimROC([l[2] for l in sim_ds], usr_results_es5, molName + " ElectroShape 5-d Sim ROC", "results/es5_sim_" + molName + ".pdf") auc_rank_es5 = eval.plotRankROC( [l[2] for l in sim_ds], usr_results_es5,
def doMolGMM(molNdx):
    """Cross-validate, test and pickle a Gaussian mixture model for one target.

    For every entry of the module-level ``datasetPortion`` the function:

    1. loads a hold-out test set (``portion * 0.2``) and a cross-validation
       set (``portion * 0.8``, excluding the test paths) of "esh" descriptors
       through ``cu.loadDescriptors``;
    2. runs 5-fold cross-validation for each number of mixture components in
       ``[1, 10, 50, 100, 1000]``, fitting on the first row of every training
       record (called "LEC" in the original comments), restricted to rows
       whose ``"active"`` column is true;
    3. loads a fresh training set with ``active_decoy_ratio=0``, fits a
       mixture with the selected number of components, scores the hold-out
       set and produces ROC plots / enrichment factors through ``eval``.

    Afterwards the collected results are written to
    ``results/results_gmm_<molName>_LEC.txt`` and a mixture fitted on the
    actives of the last processed portion is pickled to
    ``results/<molName>_GMM_LEC.pkl``.

    NOTE(review): the indentation of the original source was lost; writing
    the results file and the pickled model once, after the portion loop, is
    the reconstruction chosen here.

    Args:
        molNdx: Index into the module-level ``molfiles`` sequence. Element
            ``[0]`` of the entry is passed to ``cu.loadDescriptors`` and
            element ``[1]`` is used as the target name.

    Returns:
        None.
    """
    portionResults = []
    xvalResults = []
    molName = molfiles[molNdx][1]
    descType = "esh"
    componentsValues = [1, 10, 50, 100, 1000]
    folds = 5
    # Inputs of the last processed portion; used for the pickled model.
    final_inputs = None

    for portion in datasetPortion:
        if portion <= 1:
            print("Loading " + str(portion * 100) + "% of " + molfiles[molNdx][1])
        else:
            print("Loading " + str(portion) + " actives from " + molfiles[molNdx][1])
        # NOTE(review): "SEPERATE" is spelled differently from the "SEPARATE"
        # used below; confirm which value cu.loadDescriptors expects.
        (test_ds, test_paths) = cu.loadDescriptors(
            molfiles[molNdx][0], portion * 0.2, dtype=descType,
            active_decoy_ratio=-1, selection_policy="RANDOM",
            return_type="SEPERATE")
        numcols = test_ds[0][0].shape[1] - 2
        (n_fold_ds, n_fold_paths) = cu.loadDescriptors(
            molfiles[molNdx][0], portion * 0.8, dtype=descType,
            active_decoy_ratio=-1, selection_policy="RANDOM",
            return_type="SEPARATE", exclusion_list=test_paths)
        (folds_list, excl_list) = cu.split(n_fold_ds, folds, policy="RANDOM")

        componentResults = []
        for components in componentsValues:
            foldResults = []
            for fold in range(folds):
                val_ds = folds_list[fold]
                # Flat list holding the first row of every record in the
                # training folds (the original nested the later folds as
                # sub-lists).
                train_frames = [
                    r[0].iloc[0:1, :]
                    for i in range(folds)
                    if i != fold
                    for r in folds_list[i]
                ]
                train_ds = cu.joinDataframes(train_frames)
                numcols = train_ds.shape[1] - 2
                # Element-wise comparison on purpose: builds a row mask.
                train_a = train_ds[train_ds["active"] == True]  # noqa: E712
                if len(train_a) > components:
                    # GaussianMixture.fit ignores `y`, so only X is passed.
                    G_a = GaussianMixture(
                        n_components=components,
                        covariance_type="full").fit(train_a.iloc[:, 0:numcols])
                    results = pd.DataFrame()
                    print(numcols)
                    results["a_score"] = [
                        G_a.score(x[0].iloc[:, 0:numcols]) for x in val_ds
                    ]
                    results["truth"] = [x[2] for x in val_ds]
                    truth = np.array(results["truth"])
                    scores = np.array([results["a_score"]])
                    auc = eval.plotSimROC(truth, scores, "", None)
                    mean_ef = eval.getMeanEFs(truth, scores)
                    foldResults.append((auc, mean_ef))
                else:
                    print("Training samples(" + str(len(train_a)) +
                          ") < GMM components(" + str(components) +
                          ") -> cannot train.")
                    # Stop cross-validating this component count.
                    break
            print("X-Validation results, num components = " + str(components) + ": ")
            print(foldResults)
            if foldResults:
                fold_aucs = [x[0] for x in foldResults]
                fold_ef_1pc = [x[1][0.01] for x in foldResults]
                fold_ef_5pc = [x[1][0.05] for x in foldResults]
                mean_auc_sim = np.mean(fold_aucs)
                # Spread over the folds; the original took the std of the
                # mean, which is always 0.
                std_auc_sim = np.std(fold_aucs)
                mean_mean_ef_1pc = np.mean(fold_ef_1pc)
                std_mean_ef_1pc = np.std(fold_ef_1pc)
                mean_mean_ef_5pc = np.mean(fold_ef_5pc)
                std_mean_ef_5pc = np.std(fold_ef_5pc)
                print("mean AUC=" + str(mean_auc_sim) + ", std=" +
                      str(std_auc_sim) + ", mean EF(1%)=" +
                      str(mean_mean_ef_1pc) + ", std=" +
                      str(std_mean_ef_1pc) + ", mean EF(5%)=" +
                      str(mean_mean_ef_5pc) + ", std=" + str(std_mean_ef_5pc))
                componentResults.append(
                    (molName, portion, components, mean_auc_sim, std_auc_sim,
                     mean_mean_ef_1pc, std_mean_ef_1pc, mean_mean_ef_5pc,
                     std_mean_ef_5pc))
            else:
                print("X-Validation returned no results for " +
                      str(components) + " components. Skipping training...")
                componentResults.append(
                    (molName, portion, components, 0, 0, 0, 0, 0, 0))
        xvalResults.extend(componentResults)

        # Index 5 of each row is the mean EF(1%); that is the selection
        # criterion the original code used (under the name `aucs_rank`).
        selection_scores = [row[5] for row in componentResults]
        best_components = componentsValues[np.argmax(selection_scores)]
        print("Best-score compnents no.: " + str(best_components))

        (train_ds, train_paths) = cu.loadDescriptors(
            molfiles[molNdx][0], portion * 0.8, dtype=descType,
            active_decoy_ratio=0, selection_policy="RANDOM",
            return_type="SEPARATE", exclusion_list=test_paths)
        # First row of each record, joined into one frame. The original
        # indexed the frames with `[0:1, :]` (not valid DataFrame indexing)
        # and left a plain list that was then used with `.iloc`.
        train_ds = cu.joinDataframes([x[0].iloc[0:1, :] for x in train_ds])

        if len(train_ds) > best_components:
            t0 = time.time()
            G_a = GaussianMixture(
                n_components=best_components,
                covariance_type="full").fit(train_ds.iloc[:, 0:numcols])
            results = pd.DataFrame()
            results["a_score"] = [
                G_a.score(x[0].iloc[:, 0:numcols]) for x in test_ds
            ]
            results["truth"] = [x[2] for x in test_ds]
            # Labels and file names use the selected component count; the
            # original used the stale loop variable (always the last
            # candidate).
            file_tag = str(best_components) + "_" + str(portion * 100) + "_LEC.pdf"
            auc = eval.plotSimROC(
                results["truth"], [results["a_score"]],
                molName + "[GMM-" + str(best_components) +
                " components(Similarity), " + str(portion * 100) + "%]",
                "results/" + molName + "_GMM_sim_" + file_tag)
            # Separate file for the rank plot; the original overwrote the
            # similarity plot by reusing its file name.
            auc_rank = eval.plotRankROC(
                results["truth"], [results["a_score"]],
                molName + "[GMM-" + str(best_components) +
                " components(Rank), " + str(portion * 100) + "%]",
                "results/" + molName + "_GMM_rank_" + file_tag)
            mean_ef = eval.getMeanEFs(np.array(results["truth"]),
                                      np.array([results["a_score"]]))
            elapsed = time.time() - t0
        else:
            # Too few samples to fit: report zeros for every metric so the
            # result row below is always well-defined.
            auc = 0
            auc_rank = 0
            mean_ef = 0
            elapsed = 0
        print("Final results, num components = ", str(best_components) + ": ")
        print("AUC=" + str(auc))
        print("EF: ", mean_ef)
        portionResults.append((molName, portion, best_components, auc,
                               auc_rank, mean_ef, elapsed))
        final_inputs = (test_ds, n_fold_ds, numcols, best_components)

    with open("results/results_gmm_" + molName + "_LEC.txt", 'w') as f1:
        print(xvalResults, file=f1)
        print(portionResults, file=f1)

    if final_inputs is None:
        print("No portion processed for " + molName + "; no model saved.")
        return

    (test_ds, n_fold_ds, numcols, best_components) = final_inputs
    full_train_dss = [x[0].iloc[0:1, :] for x in test_ds]
    full_train_dss.extend(x[0].iloc[0:1, :] for x in n_fold_ds)
    full_train_ds = cu.joinDataframes(full_train_dss)
    # GaussianMixture.fit ignores `y`, so passing the label column (as the
    # original did) fitted the saved mixture on actives AND decoys. Restrict
    # to actives so the pickled model matches the models evaluated above.
    full_train_a = full_train_ds[full_train_ds["active"] == True]  # noqa: E712
    if len(full_train_a) < best_components:
        # scikit-learn requires at least n_components samples.
        print("Too few actives (" + str(len(full_train_a)) + ") for " +
              str(best_components) + " components; no model saved.")
        return
    G_a = GaussianMixture(
        n_components=best_components,
        covariance_type="full").fit(full_train_a.iloc[:, 0:numcols])
    import pickle
    with open("results/" + molName + "_GMM_LEC.pkl", "wb") as mdlf:
        pickle.dump(G_a, mdlf)
    print("Saved model for " + molName + " to disk")
def doMolGMM(molNdx):
    """Cross-validate and test an MLP regressor for one target.

    Despite its name this variant trains ``MLPRegressor`` models, not
    Gaussian mixtures; the name is kept so existing callers keep working.

    For every entry of the module-level ``datasetPortion`` the function:

    1. loads a hold-out test set (``portion * 0.2``) and a cross-validation
       set (``portion * 0.8``, excluding the test paths) of "esh" descriptors
       through ``cu.loadDescriptors``;
    2. runs 3-fold cross-validation, regressing ``active * 100`` on the
       descriptor columns of all rows of the training records;
    3. refits on ``cu.lumpRecords(n_fold_ds)``, scores the hold-out set and
       produces ROC plots / enrichment factors through ``eval``.

    A record's score is the maximum prediction over its rows. After all
    portions the results are printed and written to
    ``results/results_ann_<molName>.txt``.

    NOTE(review): the indentation of the original source was lost; printing
    and writing the results once, after the portion loop, is the
    reconstruction chosen here.

    Args:
        molNdx: Index into the module-level ``molfiles`` sequence. Element
            ``[0]`` of the entry is passed to ``cu.loadDescriptors`` and
            element ``[1]`` is used as the target name.

    Returns:
        None.
    """
    componentResults = []
    portionResults = []
    molName = molfiles[molNdx][1]
    descType = "esh"
    folds = 3

    for portion in datasetPortion:
        t0 = time.time()
        if portion <= 1:
            print("Loading " + str(portion * 100) + "% of " + molfiles[molNdx][1])
        else:
            print("Loading " + str(portion) + " actives from " + molfiles[molNdx][1])
        # NOTE(review): "SEPERATE" is spelled differently from the "SEPARATE"
        # used below; confirm which value cu.loadDescriptors expects.
        (test_ds, test_paths) = cu.loadDescriptors(
            molfiles[molNdx][0], portion * 0.2, dtype=descType,
            active_decoy_ratio=-1, selection_policy="RANDOM",
            return_type="SEPERATE")
        numcols = test_ds[0][0].shape[1] - 2
        (n_fold_ds, n_fold_paths) = cu.loadDescriptors(
            molfiles[molNdx][0], portion * 0.8, dtype=descType,
            active_decoy_ratio=-1, selection_policy="RANDOM",
            return_type="SEPARATE", exclusion_list=test_paths)
        (folds_list, excl_list) = cu.split(n_fold_ds, folds, policy="RANDOM")

        foldResults = []
        for fold in range(folds):
            val_ds = folds_list[fold]
            # Flat list of the frames of every record in the training folds
            # (the original nested the later folds as sub-lists).
            train_frames = [
                r[0]
                for i in range(folds)
                if i != fold
                for r in folds_list[i]
            ]
            train_ds = cu.joinDataframes(train_frames)
            numcols = train_ds.shape[1] - 2
            ann = MLPRegressor(max_iter=1000, early_stopping=True)
            # Regression target: 100 for actives, 0 otherwise.
            ann.fit(train_ds.iloc[:, 0:numcols],
                    train_ds["active"].astype(int) * 100)
            results = pd.DataFrame()
            results["score"] = [
                max(ann.predict(x[0].iloc[:, 0:numcols])) for x in val_ds
            ]
            results["truth"] = [x[2] for x in val_ds]
            truth = np.array(results["truth"])
            scores = np.array([results["score"]])
            auc = eval.plotSimROC(truth, scores, "", None)
            mean_ef = eval.getMeanEFs(truth, scores)
            foldResults.append((auc, mean_ef))

        print("X-Validation results: ")
        print(foldResults)
        if foldResults:
            fold_aucs = [x[0] for x in foldResults]
            fold_ef_1pc = [x[1][0.01] for x in foldResults]
            fold_ef_5pc = [x[1][0.05] for x in foldResults]
            mean_auc_sim = np.mean(fold_aucs)
            # Spread over the folds; the original took the std of the mean,
            # which is always 0.
            std_auc_sim = np.std(fold_aucs)
            mean_mean_ef_1pc = np.mean(fold_ef_1pc)
            std_mean_ef_1pc = np.std(fold_ef_1pc)
            mean_mean_ef_5pc = np.mean(fold_ef_5pc)
            std_mean_ef_5pc = np.std(fold_ef_5pc)
            print("mean AUC=" + str(mean_auc_sim) + ", std=" +
                  str(std_auc_sim) + ", mean EF(1%)=" +
                  str(mean_mean_ef_1pc) + ", std=" + str(std_mean_ef_1pc) +
                  ", mean EF(5%)=" + str(mean_mean_ef_5pc) + ", std=" +
                  str(std_mean_ef_5pc))
            componentResults.append(
                (molName, portion, mean_auc_sim, std_auc_sim,
                 mean_mean_ef_1pc, std_mean_ef_1pc, mean_mean_ef_5pc,
                 std_mean_ef_5pc))
        else:
            print("X-Validation returned no results. Skipping training...")
            componentResults.append((molName, portion, 0, 0, 0, 0, 0, 0))

        train_ds = cu.lumpRecords(n_fold_ds)
        ann = MLPRegressor(max_iter=1000, early_stopping=True)
        ann.fit(train_ds.iloc[:, 0:numcols],
                train_ds["active"].astype(int) * 100)
        results = pd.DataFrame()
        results["score"] = [
            max(ann.predict(x[0].iloc[:, 0:numcols])) for x in test_ds
        ]
        results["truth"] = [x[2] for x in test_ds]
        auc_sim = eval.plotSimROC(
            results["truth"], [results["score"]],
            molName + "[ANN, " + str(portion * 100) + "%]",
            "results/" + molName + "_ANN_sim_" + str(portion * 100) + ".pdf")
        auc_rank = eval.plotRankROC(
            results["truth"], [results["score"]],
            molName + "[ANN-" + str(portion * 100) + "%]",
            "results/" + molName + "_ANN_rank_" + str(portion * 100) + ".pdf")
        mean_ef = eval.getMeanEFs(np.array(results["truth"]),
                                  np.array([results["score"]]))
        # Report the hold-out AUC; the original printed `auc`, i.e. the value
        # left over from the last cross-validation fold.
        print("AUC(Sim)=" + str(auc_sim))
        print("EF: ", mean_ef)
        t1 = time.time()
        portionResults.append(
            (molName, portion, auc_sim, auc_rank, mean_ef, (t1 - t0)))
        print("Time taken = " + str(t1 - t0))

    print(componentResults)
    print(portionResults)
    with open("results/results_ann_" + molName + ".txt", 'w') as f1:
        print(componentResults, file=f1)
        print(portionResults, file=f1)
# Script-level USR screening: for every target, load the "usr" descriptors,
# run the Spark-based similarity screening and evaluate it with ROC plots and
# enrichment factors.
molNdx = 0
results = []
for molNdx in range(0, len(molfiles)):
    molName = molfiles[molNdx][1]
    try:
        print("Processing "+molfiles[molNdx][0])
        print("Processing USR")
        sc = initSpark()
        try:
            (sim_ds, sim_paths) = cu.loadDescriptors(
                molfiles[molNdx][0], numActives, dtype="usr",
                active_decoy_ratio=-1, selection_policy="SEQUENTIAL",
                return_type="SEPARATE")
            simobj = scls.USRMoleculeSim(sim_ds, sim_paths)
            usr_results = np.array(simobj.runSparkScreening(sc)).transpose()
        finally:
            # Stop the Spark context even when loading or screening fails, so
            # a failed target does not leave a context behind for the next
            # iteration.
            sc.stop()
        auc_usr = eval.plotSimROC([l[2] for l in sim_ds], usr_results,
                                  molName + " USR Sim ROC",
                                  "usr_sim_"+molName + ".pdf")
        auc_rank_usr = eval.plotRankROC([l[2] for l in sim_ds], usr_results,
                                        molName + " USR Rank ROC",
                                        "usr_rank_"+molName + ".pdf")
        mean_ef_usr = eval.getMeanEFs([l[2] for l in sim_ds], usr_results)
    except Exception as exc:
        # Best effort: report the failure (with its cause) and fall back to
        # zero scores. `Exception` rather than a bare except, so Ctrl-C still
        # interrupts the batch run.
        print("Error processing USR for " + molfiles[molNdx][1])
        print(repr(exc))
        auc_usr = 0
        auc_rank_usr = 0