def test3(self):
    """ test area under the curve (AUC) of ROC """
    # best case
    auc = Scoring.CalcAUC(self.scoreBestCase, self.index)
    self.assertAlmostEqual(auc, 1.0, self.acc)
    # worst case
    auc = Scoring.CalcAUC(self.scoreWorstCase, self.index)
    self.assertAlmostEqual(auc, 0.0, self.acc)
    # empty list
    self.assertRaises(ValueError, Scoring.CalcAUC, self.scoreEmptyList, self.index)
    # all actives
    auc = Scoring.CalcAUC(self.scoreAllActives, self.index)
    self.assertAlmostEqual(auc, 0.0, self.acc)
    # all decoys
    auc = Scoring.CalcAUC(self.scoreAllDecoys, self.index)
    self.assertAlmostEqual(auc, 0.0, self.acc)
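# Illustrative sketch (not part of the test above): the Scoring functions take a
# list that is already sorted by decreasing score plus the column (or key) that
# holds the 0/1 activity label. The toy data below is made up purely to show the
# expected input shape.
from rdkit.ML.Scoring import Scoring

ranked = [[0.9, 1], [0.8, 1], [0.7, 0], [0.5, 1], [0.2, 0], [0.1, 0]]  # [score, activity]
auc = Scoring.CalcAUC(ranked, 1)               # column 1 holds the activity flag
ef = Scoring.CalcEnrichment(ranked, 1, [0.5])  # enrichment factor at 50% of the list
bedroc = Scoring.CalcBEDROC(ranked, 1, 20)     # BEDROC with alpha = 20
print(auc, ef[0], bedroc)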
def metrics_for_target(pred, actual, mask):
    mask = np.array(mask, dtype=bool)  # np.bool is deprecated/removed in recent NumPy
    masked_preds = pred.squeeze()[mask]
    order = np.flipud(np.argsort(masked_preds))
    masked_ordered_actual = actual[mask][order]
    return Scoring.CalcEnrichment(masked_ordered_actual, 0, [.001, .005, .01, .05]) + [
        Scoring.CalcAUC(masked_ordered_actual, 0),
        Scoring.CalcBEDROC(masked_ordered_actual, 0, 20)
    ]
def evaluate(activity_arr):
    auc = Scoring.CalcAUC(activity_arr, 0)
    print("AUC: ", auc)
    ef = Scoring.CalcEnrichment(activity_arr, 0, [0.01])
    print("EF for 1%: ", ef[0])
    ef = Scoring.CalcEnrichment(activity_arr, 0, [0.05])
    print("EF for 5%: ", ef[0])
    rie = Scoring.CalcRIE(activity_arr, 0, 100)
    print("RIE for 100: ", rie)
    bedroc = Scoring.CalcBEDROC(activity_arr, 0, 100)
    print("BEDROC for 100: ", bedroc)
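# Example call to evaluate() above, with made-up toy data: each entry must be
# indexable (column 0 is the 0/1 activity flag) and the list is assumed to be
# ordered by decreasing model score already.
evaluate([[1], [1], [0], [1], [0], [0], [0], [1], [0], [0]])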
def evaluation(activity_arr: list, output_file: str):
    inputoutput_utils.create_parent_directory(output_file)
    auc = Scoring.CalcAUC(activity_arr, 0)
    ef1 = Scoring.CalcEnrichment(activity_arr, 0, [0.01])
    ef5 = Scoring.CalcEnrichment(activity_arr, 0, [0.05])
    rie = Scoring.CalcRIE(activity_arr, 0, 100)
    bedroc = Scoring.CalcBEDROC(activity_arr, 0, 100)
    output = {
        "AUC": auc,
        "EF1": ef1[0],
        "EF5": ef5[0],
        "RIE": rie,
        "BEDROC": bedroc
    }
    with open(output_file, "w", encoding="utf-8") as stream:
        json.dump(output, stream)
def evaluation(activity_arr: list, output_file: str):
    with open(output_file, "w") as stream:
        auc = Scoring.CalcAUC(activity_arr, 0)
        stream.write("AUC: ")
        stream.write(str(auc))
        ef = Scoring.CalcEnrichment(activity_arr, 0, [0.01])
        stream.write("\nEF for 1%: ")
        stream.write(str(ef[0]))
        ef = Scoring.CalcEnrichment(activity_arr, 0, [0.05])
        stream.write("\nEF for 5%: ")
        stream.write(str(ef[0]))
        rie = Scoring.CalcRIE(activity_arr, 0, 100)
        stream.write("\nRIE for 100: ")
        stream.write(str(rie))
        bedroc = Scoring.CalcBEDROC(activity_arr, 0, 100)
        stream.write("\nBEDROC for 100: ")
        stream.write(str(bedroc))
def calculate(self, score, index):
    return Scoring.CalcAUC(score, index)
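# Note on the `index` argument forwarded above: Scoring.CalcAUC looks up
# entry[index] for every entry, so it works with an integer column (as in the
# snippets above) or with a dict key (as the screening functions below do with
# 'activity'). Illustrative sketch with made-up data:
from rdkit.ML.Scoring import Scoring

ranked_dicts = [
    {'name': 'a', 'similarity': 0.9, 'activity': 1},
    {'name': 'b', 'similarity': 0.4, 'activity': 0},
    {'name': 'c', 'similarity': 0.1, 'activity': 1},
]  # already sorted by decreasing similarity
print(Scoring.CalcAUC(ranked_dicts, 'activity'))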
train_fps += [fps_inact[j] for j in train_indices_inact]
ys_fit = [1] * len(train_indices_act) + [0] * len(train_indices_inact)
# train the model
ml = BernoulliNB()
ml.fit(train_fps, ys_fit)
# chemical similarity
simil = cPickle.load(infile)
# ranking
test_fps = [fps_act[j] for j in test_indices_act[i]]
test_fps += [fps_inact[j] for j in test_indices_inact[i]]
scores = [[pp[1], s[0], s[1]]
          for pp, s in zip(ml.predict_proba(test_fps), simil)]
# write ranks for actives
cf.writeActiveRanks(scores, rankfile, num_actives)
scores.sort(reverse=True)
# evaluation
auc = Scoring.CalcAUC(scores, -1)
ef = Scoring.CalcEnrichment(scores, -1, [0.05])
# write out
outfile.write("%i\t%.10f\t%.10f\n" % (i, auc, ef[0]))
infile.close()
rankfile.close()
outfile.close()
def screening(input_path, input_directory, ged_results_file, output_path=None):
    """Perform a virtual screening.

    :param input_path: input .json file with basic screening params
    :param input_directory: directory with .sdf files
    :param ged_results_file: .json file with GED results and parameters
    :param output_path:
    :return:
    """
    with open(input_path) as input_stream:
        input_data = json.load(input_stream)
    # Load molecules.
    logging.info('Loading molecules ...')
    molecules = {}
    for file_item in input_data['files']:
        path = input_directory + file_item + '.sdf'
        if not os.path.exists(path):
            logging.error('Missing file: %s' % path)
            raise Exception('Missing file.')
        molecules.update(_load_molecules(path))
    # Parse ged results file
    with open(ged_results_file) as ged_stream:
        ged_data = json.load(ged_stream)
    # Create representation of active molecules.
    actives = []
    for active in input_data['data']['train']['ligands']:
        if active['name'] not in molecules:
            continue
        actives.append(molecules[active['name']])
    # Screening.
    logging.info('Screening ...')
    scores = []
    counter = 0
    counter_max = len(input_data['data']['test'])
    counter_step = math.floor(counter_max / 100.0) + 1
    time_begin = time.clock()
    for item in input_data['data']['test']:
        if item['name'] not in molecules:
            continue
        query = molecules[item['name']]
        # Counting similarity and searching for most similar active molecule
        similarity = 0
        similarMol = query
        for active in actives:
            currentSimilarity = _ged_similarity(query, active, ged_data)
            if currentSimilarity > similarity:
                similarity = currentSimilarity
                similarMol = active
        scores.append({
            'name': item['name'],
            'similarity': similarity,
            'activity': item['activity'],
            'most-similar-active': similarMol.GetProp("_Name")
        })
        # if (item['activity'] == 1) create_picture(query, similar-active)
        if counter % counter_step == 0:
            logging.debug('%d/%d', counter, counter_max)
        counter += 1
        logging.debug('counter: ' + str(counter))
    time_end = time.clock()
    # Evaluate screening.
    scores = sorted(scores, key=lambda m: m['similarity'], reverse=True)
    auc = Scoring.CalcAUC(scores, 'activity')
    ef = Scoring.CalcEnrichment(scores, 'activity', [0.005, 0.01, 0.02, 0.05])
    # Print results.
    print('AUC : ', auc)
    print('EF (0.5%, 1.0%, 2.0%, 5.0%) : ', ef)
    total_time = float(ged_data["properties"]["time"]) / 1000
    total_time += (time_end - time_begin)
    print('Execution time : %.2fs' % total_time)
    # Write result to a file.
    if output_path is not None and output_path != '':
        if not os.path.exists(os.path.dirname(output_path)):
            os.makedirs(os.path.dirname(output_path))
        with open(output_path, 'w') as output_stream:
            json.dump(
                {
                    'properties': ged_data["properties"],
                    'data': scores,
                    'metadata': {
                        'auc': auc,
                        'ef': {
                            '0.005': ef[0],
                            '0.01': ef[1],
                            '0.02': ef[2],
                            '0.05': ef[3]
                        },
                        'fileName': os.path.basename(__file__),
                        'executionTime': total_time,
                        'definition': {
                            'selection': input_data['info']['selection'],
                            'molecules': input_data['info']['molecules'],
                            'index': input_data['info']['index'],
                            'dataset': input_data['info']['dataset'],
                            'method': input_data['info']['method'],
                            'config': 'config_file'
                        }
                    }
                },
                output_stream, indent=2)
def screening(input_dir, input_directory, config_file, output_path=None):
    """Perform a virtual screening.

    :param input_dir: path to input data (training and test in .json)
    :param input_directory: path to sdf files
    :param config_file: configuration file of mcs
    :param output_path: directory to save the results
    :return:
    """
    with open(input_dir) as input_stream:
        input_data = json.load(input_stream)
    # Load molecules.
    logging.info('Loading molecules ...')
    molecules = {}
    for file_item in input_data['files']:
        path = input_directory + file_item + '.sdf'
        if not os.path.exists(path):
            logging.error('Missing file: %s' % file_item)
            raise Exception('Missing file.')
        molecules.update(_load_molecules(path))
    # Create representation of active molecules.
    actives = []
    for active in input_data['data']['train']['ligands']:
        if active['name'] not in molecules:
            continue
        actives.append(molecules[active['name']])
    # Screening.
    logging.info('Screening ...')
    scores = []
    counter = 0
    inexact = 0
    counter_max = len(input_data['data']['test'])
    counter_step = math.floor(counter_max / 100.0) + 1
    params = mcsutils._parse_config(config_file)
    time_begin = time.clock()
    for item in input_data['data']['test']:
        if item['name'] not in molecules:
            continue
        query = molecules[item['name']]
        similarity = max([mcsutils._similarity(query, active, inexact,
                                               input_data['info'], params)
                          for active in actives])
        scores.append({
            'name': item['name'],
            'similarity': similarity,
            'activity': item['activity']
        })
        if counter % counter_step == 0:
            logging.debug('%d/%d', counter, counter_max)
            # _flush_results(output_path, scores)
        counter += 1
        # logging.debug('counter: ' + str(counter))
    time_end = time.clock()
    # Evaluate screening.
    scores = sorted(scores, key=lambda m: m['similarity'], reverse=True)
    auc = Scoring.CalcAUC(scores, 'activity')
    ef = Scoring.CalcEnrichment(scores, 'activity', [0.005, 0.01, 0.02, 0.05])
    # Print results.
    print('Input file: ', input_dir)
    print('Difficulty: ', input_directory)
    print('AUC : ', auc)
    print('EF (0.5%, 1.0%, 2.0%, 5.0%) : ', ef)
    print('Execution time : %.2fs' % (time_end - time_begin))
    # Write result to a file.
    if output_path is not None and output_path != '':
        if not os.path.exists(os.path.dirname(output_path)):
            os.makedirs(os.path.dirname(output_path))
        with open(output_path, 'w') as output_stream:
            json.dump({
                'data': scores,
                'metadata': {
                    'auc': auc,
                    'ef': {
                        '0.005': ef[0],
                        '0.01': ef[1],
                        '0.02': ef[2],
                        '0.05': ef[3]
                    },
                    'fileName': os.path.basename(__file__),
                    'executionTime': time_end - time_begin,
                    'inexactMolecules': inexact,
                    'definition': {
                        'selection': input_data['info']['selection'],
                        'molecules': input_data['info']['molecules'],
                        'index': input_data['info']['index'],
                        'dataset': input_data['info']['dataset'],
                        'method': input_data['info']['method'],
                        'config': 'config_file'
                    }
                }
            }, output_stream, indent=2)
test_fps = [fps_act_morgan2[j] for j in test_indices_act[i]]
test_fps += [fps_inact_morgan2[j] for j in test_indices_inact[i]]
scores_rf_morgan2 = [[pp[1], s[0], s[1]]
                     for pp, s in zip(rf_morgan2.predict_proba(test_fps), simil)]
# assign ranks
scores_rf_rdk5 = cf.assignRanksWithInfo(scores_rf_rdk5)
scores_lr_rdk5 = cf.assignRanksWithInfo(scores_lr_rdk5)
scores_rf_morgan2 = cf.assignRanksWithInfo(scores_rf_morgan2)
# fusion
fusion_scores = []
for m1, m2, m3 in zip(scores_rf_rdk5, scores_lr_rdk5, scores_rf_morgan2):
    rank = max([m1[0], m2[0], m3[0]])    # max. rank
    proba = max([m1[1], m2[1], m3[1]])   # max. probability
    # store: [max rank, max proba, simil, info]
    fusion_scores.append([rank, proba, m1[2], m1[3]])
fusion_scores.sort(reverse=True)
# evaluation
auc = Scoring.CalcAUC(fusion_scores, -1)
ef = Scoring.CalcEnrichment(fusion_scores, -1, [0.05])
# write out
outfile.write("%i\t%.10f\t%.10f\n" % (i, auc, ef[0]))
infile1.close()
infile2.close()
outfile.close()
def run_ted(input_path, input_directory, prop, output_path):
    """
    Loads .sdf file, converts the molecules into trees with graph annotations,
    runs the TED, evaluates the results and saves them into a file.

    :param input_path:
    :param input_directory:
    :param prop:
    :param output_path:
    :return:
    """
    with open(input_path) as input_stream:
        input_data = json.load(input_stream)
    # Load molecules and convert them to tree graphs.
    logging.info('Loading molecules ...')
    molecules = {}
    sizes = {}
    bondSizes = {}
    for file_item in input_data['files']:
        path = input_directory + file_item + '.sdf'
        logging.debug(path)
        if not os.path.exists(path):
            logging.error('Missing file: %s' % file_item)
            raise Exception('Missing file.')
        molecules.update(_load_molecules(path, sizes, bondSizes, prop))
    # Screening.
    logging.info('Screening ...')
    scores = []
    counter = 0
    counter_max = len(input_data['data']['test'])
    counter_step = math.floor(counter_max / 100.0) + 1
    time_begin = time.clock()
    for item in input_data['data']['test']:
        if item['name'] not in molecules:
            continue
        query = molecules[item['name']]
        query_size = sizes[item['name']]
        query_bonds = bondSizes[item['name']]
        # Count pairwise similarity with all actives and choose the maximum.
        maxsim = 0
        minted = None  # TED of the most similar active; stays None if nothing scores above 0
        for active in input_data['data']['train']['ligands']:
            if active['name'] not in molecules:
                continue
            active_graph = molecules[active['name']]
            active_size = sizes[active['name']]
            active_bonds = bondSizes[active['name']]
            ted = _ted(query, active_graph, prop)
            sim = 1.00 - ted / float(query_size + active_size + query_bonds + active_bonds)
            if sim > maxsim:
                maxsim = sim
                minted = ted
        scores.append({
            'name': item['name'],
            'similarity': maxsim,
            'activity': item['activity'],
            'ted': minted
        })
        if counter % counter_step == 0:
            logging.debug('%d/%d', counter, counter_max)
            _flush_results(output_path, scores)
        counter += 1
        logging.debug('counter: ' + str(counter))
    time_end = time.clock()
    logging.debug("Reached the end.")
    # Evaluate screening.
    scores = sorted(scores, key=lambda m: m['similarity'], reverse=True)
    auc = Scoring.CalcAUC(scores, 'activity')
    ef = Scoring.CalcEnrichment(scores, 'activity', [0.005, 0.01, 0.02, 0.05])
    # Print results.
    print('AUC : ', auc)
    print('EF (0.5%, 1.0%, 2.0%, 5.0%) : ', ef)
    print('Execution time : %.2fs' % (time_end - time_begin))
    # Write result to a file.
    if output_path is not None and output_path != '':
        if not os.path.exists(os.path.dirname(output_path)):
            os.makedirs(os.path.dirname(output_path))
        with open(output_path, 'w') as output_stream:
            json.dump(
                {
                    'data': scores,
                    'metadata': {
                        'auc': auc,
                        'ef': {
                            '0.005': ef[0],
                            '0.01': ef[1],
                            '0.02': ef[2],
                            '0.05': ef[3]
                        },
                        'fileName': os.path.basename(__file__),
                        'executionTime': time_end - time_begin,
                        'definition': {
                            'selection': input_data['info']['selection'],
                            'molecules': input_data['info']['molecules'],
                            'index': input_data['info']['index'],
                            'dataset': input_data['info']['dataset'],
                            'method': input_data['info']['method'],
                            'config': 'config_file'
                        }
                    }
                },
                output_stream, indent=2)