def predict(classmodel, regressmodel, molfile_path): print "Starting predictions for: " + molfile_path suppl = Chem.SDMolSupplier(molfile_path) mols = dict() for mol in suppl: pmol = PropertyMol.PropertyMol(mol) mols[pmol.GetProp("_Name")] = {"RDKit" : pmol} fingerprinter.appendMorganFingerprints(mols, dump=None) actives = pickle.load(open(ACTIVES_DUMP, 'rb')) found_sth = False for mol in mols: prediction = classmodel.predict(mols[mol]['fingerprint']) fingerprints_actives = utilities.getFingerprintList(actives)[0] min_distance = utilities.getMolDistFromSet(mols[mol]['fingerprint'], fingerprints_actives)[0] if min_distance <= APPLICABILITY_DOMAIN_DISTANCE_THRESHOLD and prediction[0]: print mol + " is active" print "Predicted pIC50: " + str(regressmodel.predict(mols[mol]['fingerprint'])[0]) found_sth = True if not found_sth: print "None of the molecules within the specified set was found to be active."
def trainModels(): """ Uses datamanipulation and modeling modules to train and validate models for a specific target. Use datamanipulation.params and modeling.params to set training and validation options. This method roughly proceeds as follows: 1. load ChEMBL data for molecules with IC50 (in nM) less than or equal to IC_50_THRESHOLD and convert them to RDKit molecules - all molecules are saved to tha data/ directory - if the pickle file already exists, no action is taken unless RELOAD_CHEMBL_DATA = True 2. do the same with decoys from DUD 3. merge both datasets 4. compute fingerprints 5. train Naive Bayes Classifier 6. cluster active molecules -> pick a representative compound of each cluster for training 7. use SVM to predict IC50 values of active molecules (SV Regression) 8. test regression model on the remaining molecules from the clusters """ # load actives from ChEMBL actives = {} if not os.path.exists(DATA_FOLDER_PATH): os.mkdir(DATA_FOLDER_PATH) actives_file = [x for x in os.listdir(DATA_FOLDER_PATH) if x.startswith('actives_chembl') and x.endswith('.p')] if not actives_file or RELOAD_DATA and not USE_DOWNLOADED_STRUCTS: actives = chembl.loadChEMBLData(ACCESSION, IC_50_THRESHOLD, DATA_FOLDER_PATH) else: actives = pickle.load(open(DATA_FOLDER_PATH + actives_file[0], 'rb')) if not actives_file or RELOAD_DATA and not USE_DOWNLOADED_STRUCTS: chembl.computeConsensualIC50(actives, DATA_FOLDER_PATH) chembl.appendRDKitMols(actives, DATA_FOLDER_PATH) # load decoys downloaded from DUD decoys = {} if os.path.exists(DECOYS_SDF_FILE_PATH[:-4] + ".p"): decoys = pickle.load(open(DECOYS_SDF_FILE_PATH[:-4] + ".p", 'rb')) else: if os.path.exists(DECOYS_SDF_FILE_PATH): decoys = dud.getDecoys(DECOYS_SDF_FILE_PATH) else: print "Decoys not found in: " + DECOYS_SDF_FILE_PATH print "Make sure you set the right path." exit() # merge both data sets compounds_all = {} compounds_all.update(actives) compounds_all.update(decoys) # compute Morgan fingerprints if os.path.exists(MERGED_DATASET_PATH) and not RELOAD_DATA: print "Loading previously created dataset..." compounds_all = pickle.load(open(MERGED_DATASET_PATH, 'rb')) else: fingerprinter.appendMorganFingerprints(compounds_all) actives = { cmpndid : compounds_all[cmpndid] for cmpndid in compounds_all.keys() if compounds_all[cmpndid]['active']} pickle.dump(actives, open(ACTIVES_DUMP, 'wb')) decoys = { cmpndid : compounds_all[cmpndid] for cmpndid in compounds_all.keys() if not compounds_all[cmpndid]['active']} # train and cross-validate multiple Naive Bayes Classifiers classification_results = dict() if not os.path.exists(CLASS_RESULTS_SAVE_FILE_PATH) or RELOAD_DATA: classification_results = classification.naiveBayesClassifierTraining(compounds_all) print "Saving results..." pickle.dump(classification_results, open(CLASS_RESULTS_SAVE_FILE_PATH, 'wb')) print "Finished analysis." else: print "Loading previous results..." classification_results = pickle.load(open(CLASS_RESULTS_SAVE_FILE_PATH, 'rb')) # have fun with the classification results print "# CLASSIFICATION STATISTICS #" classification.playWithResults(classification_results) # cluster actives according to their similarity and keep only the diverse molecules actives_testset = dict() if CLUSTER: clusters = utilities.clusterMols(actives) actives_kept = dict() for cluster in clusters: actives_kept[cluster[0]] = actives[cluster[0]] remains = cluster[1:] actives_filtered_out = {chmblid : actives[chmblid] for chmblid in remains} actives_testset.update(actives_filtered_out) actives = actives_kept # estimate maximum distances between active molecules to set threshold for the application domain # distance_actives = regression.estimateDistanceThreshold(actives) # median of distances between two actives # min_distance_decoys, max_distance_decoys = regression.compareDistances(actives, decoys) # average min/max distance of closest/farthest decoy from any of the actives # print "median of distances between two actives: " + str(distance_actives) # print "average min/max distance of closest/farthest decoy from any of the actives: " + str(min_distance_decoys) + "/" + str(max_distance_decoys) # Support vector regression regression_results = dict() if not os.path.exists(REGRESS_RESULTS_SAVE_FILE_PATH) or RELOAD_DATA: regression_results = regression.supportVectorRegression(actives) pickle.dump(regression_results, open(REGRESS_RESULTS_SAVE_FILE_PATH, 'wb')) else: regression_results = pickle.load(open(REGRESS_RESULTS_SAVE_FILE_PATH, 'rb')) # do something with the regression results print "# REGRESSION STATISTICS #" regression.playWithResults(regression_results, decoys, actives_testset) return classification_results['final_model'], regression_results['final_model']