def calculate_metrics(molecules, ensemble_lookup, filename, options):
    """
    Compute the AUC and enrichment-factor metrics for the ensemble stored under ``filename``.

    :param molecules: list [mol_object_1, mol_object_2, .... ]
    :param ensemble_lookup: container indexed by ``filename``; the entry found there is the ensemble
                            that gets scored
    :param filename: key into ``ensemble_lookup``; also forwarded to ``output.write_roc``
    :param options: interface object that makes command line arguments available. ``write_roc`` is
                    read here, and the object is forwarded to ``make_fpfList`` and ``output.write_roc``
    :return: list laid out as [(auc, auclow, auchigh), (fpf, ef, eflow, efhigh), ...]: the
             ``calculate_auc`` result first, then one ``calculate_ef`` result for every FPF value
             whose ef structure is non-empty
    """
    # scores are always ranked in ascending order here
    sort_order = 'asc'

    score_structure = classification.make_score_structure(molecules, ensemble_lookup[filename])
    auc_structure = classification.make_auc_structure(score_structure)

    # the AUC entry always leads the result list
    metrics = [classification.calculate_auc(auc_structure, sort_order)]

    for raw_fpf in make_fpfList(options):
        fpf = float(raw_fpf)
        ef_structure = classification.make_ef_structure(score_structure, fpf, sort_order)
        if not ef_structure:
            # nothing usable at this false positive fraction
            continue
        metrics.append(classification.calculate_ef(ef_structure, fpf, None, 'include_intervals'))

    if options.write_roc:
        output.write_roc(auc_structure, filename, options)

    return metrics
def calculate_performance(molecules, ensemble, sort_order, options):
    """
    Score one ensemble and collect its AUC and enrichment factors in an EnsembleStorage object.

    :param molecules: molecule objects handed to ``classification.make_score_structure``
    :param ensemble: the ensemble to evaluate; stored unchanged under the 'ensemble' property
    :param sort_order: string. either 'asc' (for binding energy estimates) or 'dsc' (for similarity scores)
    :param options: forwarded to ``classification.make_fpfList``
    :return: EnsembleStorage carrying 'ensemble', 'auc' and one 'ef'-tagged property for every FPF
             value whose ef structure is non-empty
    """
    storage = EnsembleStorage()
    storage.set_prop('ensemble', ensemble)

    # the score structure feeds both the AUC and the enrichment factor calculations
    score_structure = classification.make_score_structure(molecules, ensemble)

    auc_structure = classification.make_auc_structure(score_structure)
    storage.set_prop('auc', classification.calculate_auc(auc_structure, sort_order, 'no stats'))

    for raw_fpf in classification.make_fpfList(options, score_structure):
        fpf = float(raw_fpf)
        ef_structure = classification.make_ef_structure(score_structure, fpf, sort_order)
        if not ef_structure:
            continue
        ef = classification.calculate_ef(ef_structure, fpf)
        # ef[0] is used as the property name and ef[1] as its value
        storage.set_prop(ef[0], ef[1], 'ef')

    return storage
def calculate_performance(molecules, ensemble, sort_order, options):
    """
    Score one ensemble and collect its AUC and enrichment factors in an EnsembleStorage object.

    :param molecules: molecule objects handed to ``classification.make_score_structure``
    :param ensemble: the ensemble to evaluate; stored unchanged under the 'ensemble' property
    :param sort_order: string. either 'asc' (for binding energy estimates) or 'dsc' (for similarity scores)
    :param options: forwarded to ``classification.make_fpfList``
    :return: EnsembleStorage carrying 'ensemble', 'auc' and one 'ef'-tagged property for every FPF
             value whose ef structure is non-empty
    """
    storage = EnsembleStorage()
    storage.set_prop('ensemble', ensemble)

    # the score structure feeds both the AUC and the enrichment factor calculations
    score_structure = classification.make_score_structure(molecules, ensemble)

    auc_structure = classification.make_auc_structure(score_structure)
    storage.set_prop('auc', classification.calculate_auc(auc_structure, sort_order, 'no stats'))

    for raw_fpf in classification.make_fpfList(options, score_structure):
        fpf = float(raw_fpf)
        ef_structure = classification.make_ef_structure(score_structure, fpf, sort_order)
        if not ef_structure:
            continue
        ef = classification.calculate_ef(ef_structure, fpf)
        # ef[0] is used as the property name and ef[1] as its value
        storage.set_prop(ef[0], ef[1], 'ef')

    return storage
def calculate_metrics(molecules, ensemble_lookup, filename, options):
    """
    Compute the AUC and enrichment-factor metrics for the ensemble stored under ``filename``.

    :param molecules: list [mol_object_1, mol_object_2, .... ]
    :param ensemble_lookup: container indexed by ``filename``; the entry found there is the ensemble
                            that gets scored
    :param filename: key into ``ensemble_lookup``; also forwarded to ``output.write_roc``
    :param options: interface object that makes command line arguments available. ``write_roc`` is
                    read here, and the object is forwarded to ``make_fpfList`` and ``output.write_roc``
    :return: list laid out as [(auc, auclow, auchigh), (fpf, ef, eflow, efhigh), ...]: the
             ``calculate_auc`` result first, then one ``calculate_ef`` result for every FPF value
             whose ef structure is non-empty
    """
    # scores are always ranked in ascending order here
    sort_order = 'asc'

    score_structure = classification.make_score_structure(molecules, ensemble_lookup[filename])
    auc_structure = classification.make_auc_structure(score_structure)

    # the AUC entry always leads the result list
    metrics = [classification.calculate_auc(auc_structure, sort_order)]

    for raw_fpf in make_fpfList(options):
        fpf = float(raw_fpf)
        ef_structure = classification.make_ef_structure(score_structure, fpf, sort_order)
        if not ef_structure:
            # nothing usable at this false positive fraction
            continue
        metrics.append(classification.calculate_ef(ef_structure, fpf, None, 'include_intervals'))

    if options.write_roc:
        output.write_roc(auc_structure, filename, options)

    return metrics
def rank_queries(molecules, ensemble, sort_order, options):
    """
    Extend the ensemble by each unused query in turn and return the best extended ensemble.

    :param molecules: sequence of molecule objects; the keys of ``molecules[0].scores`` supply the
                      candidate queries
    :param ensemble: current ensemble; it is sliced and the copy is appended to, so a list is expected
    :param sort_order: forwarded to the AUC and enrichment factor calculations
    :param options: forwarded to ``classification.make_fpfList`` and ``screener.find_best_ensemble``
    :return: ``list(...)`` of whatever ``screener.find_best_ensemble`` selects from the candidates
    """
    # only queries that are not already part of the ensemble are candidates
    candidates = [name for name in list(molecules[0].scores.keys()) if name not in ensemble]

    results = {}
    for query in candidates:
        storage = EnsembleStorage()

        # copy the ensemble, add the candidate, and freeze it so it can serve as a dict key
        members = ensemble[0:]
        members.append(query)
        trial = tuple(members)
        storage.set_prop('ensemble', trial)

        score_structure = classification.make_score_structure(molecules, trial)

        auc_structure = classification.make_auc_structure(score_structure)
        storage.set_prop('auc', classification.calculate_auc(auc_structure, sort_order, 'no stats'))

        # enrichment factors are recorded for every FPF value that yields an ef structure
        for raw_fpf in classification.make_fpfList(options, score_structure):
            fpf = float(raw_fpf)
            ef_structure = classification.make_ef_structure(score_structure, fpf, sort_order)
            if not ef_structure:
                continue
            ef = classification.calculate_ef(ef_structure, fpf)
            storage.set_prop(ef[0], ef[1], 'ef')

        results[trial] = storage

    # pick the best performing candidate ensemble
    winner = screener.find_best_ensemble(results, options)
    return list(winner)
def rank_queries(molecules, ensemble, sort_order, options):
    """
    Extend the ensemble by each unused query in turn and return the best extended ensemble.

    :param molecules: sequence of molecule objects; the keys of ``molecules[0].scores`` supply the
                      candidate queries
    :param ensemble: current ensemble; it is sliced and the copy is appended to, so a list is expected
    :param sort_order: forwarded to the AUC and enrichment factor calculations
    :param options: forwarded to ``classification.make_fpfList`` and ``screener.find_best_ensemble``
    :return: ``list(...)`` of whatever ``screener.find_best_ensemble`` selects from the candidates
    """
    # only queries that are not already part of the ensemble are candidates
    candidates = [name for name in list(molecules[0].scores.keys()) if name not in ensemble]

    results = {}
    for query in candidates:
        storage = EnsembleStorage()

        # copy the ensemble, add the candidate, and freeze it so it can serve as a dict key
        members = ensemble[0:]
        members.append(query)
        trial = tuple(members)
        storage.set_prop('ensemble', trial)

        score_structure = classification.make_score_structure(molecules, trial)

        auc_structure = classification.make_auc_structure(score_structure)
        storage.set_prop('auc', classification.calculate_auc(auc_structure, sort_order, 'no stats'))

        # enrichment factors are recorded for every FPF value that yields an ef structure
        for raw_fpf in classification.make_fpfList(options, score_structure):
            fpf = float(raw_fpf)
            ef_structure = classification.make_ef_structure(score_structure, fpf, sort_order)
            if not ef_structure:
                continue
            ef = classification.calculate_ef(ef_structure, fpf)
            storage.set_prop(ef[0], ef[1], 'ef')

        results[trial] = storage

    # pick the best performing candidate ensemble
    winner = screener.find_best_ensemble(results, options)
    return list(winner)
def rank_queries(molecules, sort_order, options):
    """
    Evaluate every query on its own and return the individual results.

    :param molecules: sequence of molecule objects; the keys of ``molecules[0].scores`` are the queries
    :param sort_order: forwarded to the AUC and enrichment factor calculations
    :param options: forwarded to ``classification.make_fpfList``
    :return: dict mapping each one-element tuple ``(query,)`` to an EnsembleStorage holding its
             'ensemble', 'auc' and 'ef'-tagged properties
    """
    results = {}

    # iterate over a snapshot of the score keys
    for query in list(molecules[0].scores.keys()):
        # a single query is treated as an ensemble of size one
        single = (query,)

        storage = EnsembleStorage()
        storage.set_prop('ensemble', single)

        score_structure = classification.make_score_structure(molecules, single)

        auc_structure = classification.make_auc_structure(score_structure)
        storage.set_prop('auc', classification.calculate_auc(auc_structure, sort_order, 'no stats'))

        for raw_fpf in classification.make_fpfList(options, score_structure):
            fpf = float(raw_fpf)
            ef_structure = classification.make_ef_structure(score_structure, fpf, sort_order)
            if not ef_structure:
                continue
            ef = classification.calculate_ef(ef_structure, fpf)
            storage.set_prop(ef[0], ef[1], 'ef')

        results[single] = storage

    return results
def compare(molecules, ensemble_lookup, options):
    """
    Compare the AUC and enrichment factors of the first two ensembles (in sorted key order).

    The differences are handed to ``output.write_diff_summary``. ROC data and plots are written as
    well when ``options.write_roc`` / ``options.plot`` are set.

    :param molecules: molecule objects; each ensemble is scored on its own deep copy
    :param ensemble_lookup: mapping of ensemble name (its basename minus '.csv' is used as the column
                            header) to ensemble; at least two entries are required
    :param options: provides ``write_roc`` and ``plot`` and is forwarded to the helpers
    :return: None
    """
    print(" Analyzing differences ... ")
    print('')

    sort_order = classification.get_sort_order(molecules)

    ordered_names = sorted(ensemble_lookup.keys())
    ensemble1 = ordered_names[0]
    ensemble2 = ordered_names[1]

    # the 'header' row labels the columns of the summary
    stats = {
        'header': [
            ' ',
            os.path.basename(ensemble1).replace('.csv', ''),
            os.path.basename(ensemble2).replace('.csv', ''),
            'Difference',
            '95% CI',
            'p-value',
        ]
    }

    # each ensemble is scored against an independent copy of the molecules
    first_copy = copy.deepcopy(molecules)
    second_copy = copy.deepcopy(molecules)
    score_structure1 = classification.make_score_structure(first_copy, ensemble_lookup[ensemble1])
    score_structure2 = classification.make_score_structure(second_copy, ensemble_lookup[ensemble2])

    auc_structure_1 = classification.make_auc_structure(score_structure1)
    auc_structure_2 = classification.make_auc_structure(score_structure2)

    stats['AUC'] = classification.calculate_auc_diff(auc_structure_1, auc_structure_2, sort_order)

    # enrichment factor differences, one row per FPF value usable for both ensembles
    for raw_fpf in make_fpfList(options):
        fpf = float(raw_fpf)
        ef_structure1 = classification.make_ef_structure(score_structure1, fpf, sort_order)
        ef_structure2 = classification.make_ef_structure(score_structure2, fpf, sort_order)
        if not (ef_structure1 and ef_structure2):
            continue
        stats['E%s' % fpf] = classification.calculate_ef_diff(ef_structure1, ef_structure2, fpf)

    output.write_diff_summary(stats, options)

    if options.write_roc:
        print(" Writing ROC data ... ")
        print('')
        for auc_structure, name in ((auc_structure_1, ensemble1), (auc_structure_2, ensemble2)):
            output.write_roc(auc_structure, name, options)

    if options.plot:
        print(" Making plots ... ")
        print('')
        plotter(molecules, ensemble_lookup, options)
def _roc_curve_points(molecules, queries):
    """
    Build the ROC coordinates for ``queries``.

    :param molecules: molecule objects handed to ``classification.make_score_structure``
    :param queries: query collection to score
    :return: tuple ``(fpf, tpf)`` of lists, taken from positions 4 and 5 of every auc structure entry
    """
    score_structure = classification.make_score_structure(molecules, queries)
    auc_structure = classification.make_auc_structure(score_structure)

    fpf = []
    tpf = []
    # single pass so this also works if auc_structure can only be iterated once
    for mol in auc_structure:
        fpf.append(mol[4])
        tpf.append(mol[5])
    return fpf, tpf


def plotter(molecules, ensemble_lookup, options):
    """
    Plot ROC curves for the ensembles in ensemble_lookup, writing one PDF per ensemble.

    Every figure has two panels: the left one holds one curve per query of the ensemble (in sorted
    order), the right one the curve of the ensemble as a whole. The file is written to the current
    working directory as ``<options.outname>_<ensemble without '.csv'>.pdf``.

    Exits the process with status 1 when matplotlib cannot be imported.

    :param molecules: molecule objects handed to ``classification.make_score_structure``
    :param ensemble_lookup: mapping of ensemble name to its collection of queries
    :param options: provides ``outname``, the prefix of the figure file names
    :return: None
    """
    try:
        import matplotlib

        # select the non-interactive backend before pyplot is imported
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt
    except ImportError:
        print("\n Plotting requires matplotlib to be installed\n")
        sys.exit(1)

    for ensemble in ensemble_lookup.keys():
        fig = plt.figure()
        try:
            # left panel: one ROC curve per query
            ax1 = fig.add_subplot(121)
            ax1.set_xlabel('FPF')
            ax1.set_ylabel('TPF')
            ax1.set_title('query performance')
            for query in sorted(ensemble_lookup[ensemble]):
                fpf, tpf = _roc_curve_points(molecules, [query])
                ax1.plot(fpf, tpf, lw=3, label=query)
            # the original drew a reversed legend and immediately replaced it with this one,
            # so only the effective call is kept
            handles, labels = ax1.get_legend_handles_labels()
            ax1.legend(handles, labels, loc='best')

            # right panel: ROC curve of the whole ensemble
            fpf, tpf = _roc_curve_points(molecules, ensemble_lookup[ensemble])
            ax2 = fig.add_subplot(122)
            ax2.set_xlabel('FPF')
            ax2.set_ylabel('TPF')
            ax2.set_title('ensemble performance')
            ax2.plot(fpf, tpf, lw=3, label='ensemble')
            handles, labels = ax2.get_legend_handles_labels()
            ax2.legend(handles, labels, loc='best')

            figurename = options.outname + '_' + ensemble.replace('.csv', '') + '.pdf'
            filename = os.path.join(os.getcwd(), figurename)
            # 'bbox_inches' is the real savefig keyword; the former 'bbobx' was a typo
            fig.savefig(filename, bbox_inches='tight', format='pdf')
        finally:
            # release the figure so figures do not pile up across ensembles
            plt.close(fig)
def compare(molecules, ensemble_lookup, options):
    """
    Compare the AUC and enrichment factors of the first two ensembles (in sorted key order).

    The differences are handed to ``output.write_diff_summary``. ROC data and plots are written as
    well when ``options.write_roc`` / ``options.plot`` are set.

    :param molecules: molecule objects; each ensemble is scored on its own deep copy
    :param ensemble_lookup: mapping of ensemble name (its basename minus '.csv' is used as the column
                            header) to ensemble; at least two entries are required
    :param options: provides ``write_roc`` and ``plot`` and is forwarded to the helpers
    :return: None
    """
    print(" Analyzing differences ... ")
    print('')

    sort_order = classification.get_sort_order(molecules)

    ordered_names = sorted(ensemble_lookup.keys())
    ensemble1 = ordered_names[0]
    ensemble2 = ordered_names[1]

    # the 'header' row labels the columns of the summary
    stats = {
        'header': [
            ' ',
            os.path.basename(ensemble1).replace('.csv', ''),
            os.path.basename(ensemble2).replace('.csv', ''),
            'Difference',
            '95% CI',
            'p-value',
        ]
    }

    # each ensemble is scored against an independent copy of the molecules
    first_copy = copy.deepcopy(molecules)
    second_copy = copy.deepcopy(molecules)
    score_structure1 = classification.make_score_structure(first_copy, ensemble_lookup[ensemble1])
    score_structure2 = classification.make_score_structure(second_copy, ensemble_lookup[ensemble2])

    auc_structure_1 = classification.make_auc_structure(score_structure1)
    auc_structure_2 = classification.make_auc_structure(score_structure2)

    stats['AUC'] = classification.calculate_auc_diff(auc_structure_1, auc_structure_2, sort_order)

    # enrichment factor differences, one row per FPF value usable for both ensembles
    for raw_fpf in make_fpfList(options):
        fpf = float(raw_fpf)
        ef_structure1 = classification.make_ef_structure(score_structure1, fpf, sort_order)
        ef_structure2 = classification.make_ef_structure(score_structure2, fpf, sort_order)
        if not (ef_structure1 and ef_structure2):
            continue
        stats['E%s' % fpf] = classification.calculate_ef_diff(ef_structure1, ef_structure2, fpf)

    output.write_diff_summary(stats, options)

    if options.write_roc:
        print(" Writing ROC data ... ")
        print('')
        for auc_structure, name in ((auc_structure_1, ensemble1), (auc_structure_2, ensemble2)):
            output.write_roc(auc_structure, name, options)

    if options.plot:
        print(" Making plots ... ")
        print('')
        plotter(molecules, ensemble_lookup, options)
def _roc_curve_points(molecules, queries):
    """
    Build the ROC coordinates for ``queries``.

    :param molecules: molecule objects handed to ``classification.make_score_structure``
    :param queries: query collection to score
    :return: tuple ``(fpf, tpf)`` of lists, taken from positions 4 and 5 of every auc structure entry
    """
    score_structure = classification.make_score_structure(molecules, queries)
    auc_structure = classification.make_auc_structure(score_structure)

    fpf = []
    tpf = []
    # single pass so this also works if auc_structure can only be iterated once
    for mol in auc_structure:
        fpf.append(mol[4])
        tpf.append(mol[5])
    return fpf, tpf


def plotter(molecules, ensemble_lookup, options):
    """
    Plot ROC curves for the ensembles in ensemble_lookup, writing one PDF per ensemble.

    Every figure has two panels: the left one holds one curve per query of the ensemble (in sorted
    order), the right one the curve of the ensemble as a whole. The file is written to the current
    working directory as ``<options.outname>_<ensemble without '.csv'>.pdf``.

    Exits the process with status 1 when matplotlib cannot be imported.

    :param molecules: molecule objects handed to ``classification.make_score_structure``
    :param ensemble_lookup: mapping of ensemble name to its collection of queries
    :param options: provides ``outname``, the prefix of the figure file names
    :return: None
    """
    try:
        import matplotlib

        # select the non-interactive backend before pyplot is imported
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt
    except ImportError:
        print("\n Plotting requires matplotlib to be installed\n")
        sys.exit(1)

    for ensemble in ensemble_lookup.keys():
        fig = plt.figure()
        try:
            # left panel: one ROC curve per query
            ax1 = fig.add_subplot(121)
            ax1.set_xlabel('FPF')
            ax1.set_ylabel('TPF')
            ax1.set_title('query performance')
            for query in sorted(ensemble_lookup[ensemble]):
                fpf, tpf = _roc_curve_points(molecules, [query])
                ax1.plot(fpf, tpf, lw=3, label=query)
            # the original drew a reversed legend and immediately replaced it with this one,
            # so only the effective call is kept
            handles, labels = ax1.get_legend_handles_labels()
            ax1.legend(handles, labels, loc='best')

            # right panel: ROC curve of the whole ensemble
            fpf, tpf = _roc_curve_points(molecules, ensemble_lookup[ensemble])
            ax2 = fig.add_subplot(122)
            ax2.set_xlabel('FPF')
            ax2.set_ylabel('TPF')
            ax2.set_title('ensemble performance')
            ax2.plot(fpf, tpf, lw=3, label='ensemble')
            handles, labels = ax2.get_legend_handles_labels()
            ax2.legend(handles, labels, loc='best')

            figurename = options.outname + '_' + ensemble.replace('.csv', '') + '.pdf'
            filename = os.path.join(os.getcwd(), figurename)
            # 'bbox_inches' is the real savefig keyword; the former 'bbobx' was a typo
            fig.savefig(filename, bbox_inches='tight', format='pdf')
        finally:
            # release the figure so figures do not pile up across ensembles
            plt.close(fig)