Exemplo n.º 1
0
def calculate_metrics(molecules, ensemble_lookup, filename, options):
    """
    Compute the AUC and enrichment-factor metrics for one ensemble.

    The ensemble is taken from ensemble_lookup[filename] and is always
    evaluated with the 'asc' sort order.

    :param molecules: list [mol_object_1, mol_object_2, .... ]
    :param ensemble_lookup: mapping from filename to an ensemble, e.g. a
        tuple (receptor_x, receptor_y, .... )
    :param filename: key of the ensemble in ensemble_lookup; also handed to
        output.write_roc when ROC data is written
    :param options: interface object that makes command line arguments
        available; options.write_roc switches ROC output on
    :return: list starting with the classification.calculate_auc result,
        followed by one classification.calculate_ef result for every fpf value
        that yielded an enrichment-factor structure. Expected layout:
        [(auc, auclow, auchigh), (fpf, ef, eflow, efhigh), ...]
    """
    ranking = 'asc'

    # score data shared by the auc and the enrichment factor calculations
    scores = classification.make_score_structure(
        molecules, ensemble_lookup[filename])

    # the auc result always comes first in the returned list
    roc_data = classification.make_auc_structure(scores)
    metrics = [classification.calculate_auc(roc_data, ranking)]

    # one enrichment factor entry per usable fpf value
    for raw_fpf in make_fpfList(options):
        threshold = float(raw_fpf)
        ef_data = classification.make_ef_structure(scores, threshold, ranking)
        if not ef_data:
            # no enrichment-factor structure for this fpf value, skip it
            continue
        metrics.append(
            classification.calculate_ef(ef_data, threshold, None,
                                        'include_intervals'))

    if options.write_roc:
        output.write_roc(roc_data, filename, options)

    return metrics
Exemplo n.º 2
0
def calculate_performance(molecules, ensemble, sort_order, options):
    """
    Evaluate the virtual screening performance of an ensemble and collect the
    results in an EnsembleStorage object.

    :param molecules: molecule objects passed to
        classification.make_score_structure
    :param ensemble: the ensemble to evaluate; stored under the 'ensemble'
        property of the result
    :param sort_order: string. either 'asc' (for binding energy estimates) or
        'dsc' (for similarity scores)
    :param options: options object passed to classification.make_fpfList
    :return: EnsembleStorage holding the 'ensemble' and 'auc' properties plus
        one 'ef' property for every fpf value that yielded an
        enrichment-factor structure
    """
    storage = EnsembleStorage()
    storage.set_prop('ensemble', ensemble)

    # score data shared by the auc and the enrichment factor calculations
    scores = classification.make_score_structure(molecules, ensemble)

    # auc, computed without statistics
    roc_data = classification.make_auc_structure(scores)
    storage.set_prop(
        'auc', classification.calculate_auc(roc_data, sort_order, 'no stats'))

    # enrichment factors, one per usable fpf value
    for raw_fpf in classification.make_fpfList(options, scores):
        threshold = float(raw_fpf)
        ef_data = classification.make_ef_structure(scores, threshold, sort_order)
        if not ef_data:
            continue
        ef_result = classification.calculate_ef(ef_data, threshold)
        # first item is used as the property name, second as its value
        storage.set_prop(ef_result[0], ef_result[1], 'ef')

    return storage
Exemplo n.º 3
0
def calculate_performance(molecules, ensemble, sort_order, options):
    """
    Evaluate the virtual screening performance of an ensemble and collect the
    results in an EnsembleStorage object.

    :param molecules: molecule objects passed to
        classification.make_score_structure
    :param ensemble: the ensemble to evaluate; stored under the 'ensemble'
        property of the result
    :param sort_order: string. either 'asc' (for binding energy estimates) or
        'dsc' (for similarity scores)
    :param options: options object passed to classification.make_fpfList
    :return: EnsembleStorage holding the 'ensemble' and 'auc' properties plus
        one 'ef' property for every fpf value that yielded an
        enrichment-factor structure
    """
    storage = EnsembleStorage()
    storage.set_prop('ensemble', ensemble)

    # score data shared by the auc and the enrichment factor calculations
    scores = classification.make_score_structure(molecules, ensemble)

    # auc, computed without statistics
    roc_data = classification.make_auc_structure(scores)
    storage.set_prop(
        'auc', classification.calculate_auc(roc_data, sort_order, 'no stats'))

    # enrichment factors, one per usable fpf value
    for raw_fpf in classification.make_fpfList(options, scores):
        threshold = float(raw_fpf)
        ef_data = classification.make_ef_structure(scores, threshold, sort_order)
        if not ef_data:
            continue
        ef_result = classification.calculate_ef(ef_data, threshold)
        # first item is used as the property name, second as its value
        storage.set_prop(ef_result[0], ef_result[1], 'ef')

    return storage
Exemplo n.º 4
0
def calculate_metrics(molecules, ensemble_lookup, filename, options):
    """
    Compute the AUC and enrichment-factor metrics for one ensemble.

    The ensemble is taken from ensemble_lookup[filename] and is always
    evaluated with the 'asc' sort order.

    :param molecules: list [mol_object_1, mol_object_2, .... ]
    :param ensemble_lookup: mapping from filename to an ensemble, e.g. a
        tuple (receptor_x, receptor_y, .... )
    :param filename: key of the ensemble in ensemble_lookup; also handed to
        output.write_roc when ROC data is written
    :param options: interface object that makes command line arguments
        available; options.write_roc switches ROC output on
    :return: list starting with the classification.calculate_auc result,
        followed by one classification.calculate_ef result for every fpf value
        that yielded an enrichment-factor structure. Expected layout:
        [(auc, auclow, auchigh), (fpf, ef, eflow, efhigh), ...]
    """
    ranking = 'asc'

    # score data shared by the auc and the enrichment factor calculations
    scores = classification.make_score_structure(
        molecules, ensemble_lookup[filename])

    # the auc result always comes first in the returned list
    roc_data = classification.make_auc_structure(scores)
    metrics = [classification.calculate_auc(roc_data, ranking)]

    # one enrichment factor entry per usable fpf value
    for raw_fpf in make_fpfList(options):
        threshold = float(raw_fpf)
        ef_data = classification.make_ef_structure(scores, threshold, ranking)
        if not ef_data:
            # no enrichment-factor structure for this fpf value, skip it
            continue
        metrics.append(
            classification.calculate_ef(ef_data, threshold, None,
                                        'include_intervals'))

    if options.write_roc:
        output.write_roc(roc_data, filename, options)

    return metrics
Exemplo n.º 5
0
def rank_queries(molecules, ensemble, sort_order, options):
    """
    Try every unused query as an addition to the ensemble and return the best
    resulting ensemble.

    Candidate queries are the keys of molecules[0].scores that are not yet in
    the ensemble. Each candidate is appended to a copy of the ensemble, the
    extended ensemble is scored (auc plus enrichment factors), and
    screener.find_best_ensemble picks the winner.

    :param molecules: molecule objects; the first one must expose a `scores`
        mapping keyed by query name
    :param ensemble: current ensemble; it is sliced and appended to, so a
        list is expected
    :param sort_order: sort order forwarded to the classification routines
    :param options: options object forwarded to classification.make_fpfList
        and screener.find_best_ensemble
    :return: list built from the value returned by
        screener.find_best_ensemble
    """
    # queries that are not part of the ensemble yet
    candidates = [
        name for name in molecules[0].scores.keys() if name not in ensemble
    ]

    # extended ensemble (tuple) -> EnsembleStorage with its metrics
    evaluated = {}

    for candidate in candidates:
        storage = EnsembleStorage()

        # extend a copy so the caller's ensemble is left untouched;
        # the tuple form is hashable and serves as the dictionary key
        trial = ensemble[0:]
        trial.append(candidate)
        trial = tuple(trial)
        storage.set_prop('ensemble', trial)

        # score data for the extended ensemble
        scores = classification.make_score_structure(molecules, trial)

        # auc, computed without statistics
        roc_data = classification.make_auc_structure(scores)
        storage.set_prop(
            'auc',
            classification.calculate_auc(roc_data, sort_order, 'no stats'))

        # enrichment factors, one per usable fpf value
        for raw_fpf in classification.make_fpfList(options, scores):
            threshold = float(raw_fpf)
            ef_data = classification.make_ef_structure(
                scores, threshold, sort_order)
            if not ef_data:
                continue
            ef_result = classification.calculate_ef(ef_data, threshold)
            storage.set_prop(ef_result[0], ef_result[1], 'ef')

        evaluated[trial] = storage

    # keep only the best performing extended ensemble
    winner = screener.find_best_ensemble(evaluated, options)

    return list(winner)
Exemplo n.º 6
0
def rank_queries(molecules, ensemble, sort_order, options):
    """
    Try every unused query as an addition to the ensemble and return the best
    resulting ensemble.

    Candidate queries are the keys of molecules[0].scores that are not yet in
    the ensemble. Each candidate is appended to a copy of the ensemble, the
    extended ensemble is scored (auc plus enrichment factors), and
    screener.find_best_ensemble picks the winner.

    :param molecules: molecule objects; the first one must expose a `scores`
        mapping keyed by query name
    :param ensemble: current ensemble; it is sliced and appended to, so a
        list is expected
    :param sort_order: sort order forwarded to the classification routines
    :param options: options object forwarded to classification.make_fpfList
        and screener.find_best_ensemble
    :return: list built from the value returned by
        screener.find_best_ensemble
    """
    # queries that are not part of the ensemble yet
    candidates = [
        name for name in molecules[0].scores.keys() if name not in ensemble
    ]

    # extended ensemble (tuple) -> EnsembleStorage with its metrics
    evaluated = {}

    for candidate in candidates:
        storage = EnsembleStorage()

        # extend a copy so the caller's ensemble is left untouched;
        # the tuple form is hashable and serves as the dictionary key
        trial = ensemble[0:]
        trial.append(candidate)
        trial = tuple(trial)
        storage.set_prop('ensemble', trial)

        # score data for the extended ensemble
        scores = classification.make_score_structure(molecules, trial)

        # auc, computed without statistics
        roc_data = classification.make_auc_structure(scores)
        storage.set_prop(
            'auc',
            classification.calculate_auc(roc_data, sort_order, 'no stats'))

        # enrichment factors, one per usable fpf value
        for raw_fpf in classification.make_fpfList(options, scores):
            threshold = float(raw_fpf)
            ef_data = classification.make_ef_structure(
                scores, threshold, sort_order)
            if not ef_data:
                continue
            ef_result = classification.calculate_ef(ef_data, threshold)
            storage.set_prop(ef_result[0], ef_result[1], 'ef')

        evaluated[trial] = storage

    # keep only the best performing extended ensemble
    winner = screener.find_best_ensemble(evaluated, options)

    return list(winner)
Exemplo n.º 7
0
def rank_queries(molecules, sort_order, options):
    """
    Score every query on its own, as a single-member ensemble.

    :param molecules: molecule objects; the first one must expose a `scores`
        mapping keyed by query name
    :param sort_order: sort order forwarded to the classification routines
    :param options: options object forwarded to classification.make_fpfList
    :return: dict mapping each one-element tuple (query,) to an
        EnsembleStorage holding its 'ensemble', 'auc' and 'ef' properties
    """
    per_query = {}

    # iterate over a snapshot of the query names
    for name in list(molecules[0].scores.keys()):
        # a one-element tuple is hashable and doubles as the result key
        single = (name,)

        storage = EnsembleStorage()
        storage.set_prop('ensemble', single)

        scores = classification.make_score_structure(molecules, single)

        # auc, computed without statistics
        roc_data = classification.make_auc_structure(scores)
        storage.set_prop(
            'auc',
            classification.calculate_auc(roc_data, sort_order, 'no stats'))

        # enrichment factors, one per usable fpf value
        for raw_fpf in classification.make_fpfList(options, scores):
            threshold = float(raw_fpf)
            ef_data = classification.make_ef_structure(
                scores, threshold, sort_order)
            if not ef_data:
                continue
            ef_result = classification.calculate_ef(ef_data, threshold)
            storage.set_prop(ef_result[0], ef_result[1], 'ef')

        per_query[single] = storage

    return per_query
Exemplo n.º 8
0
def compare(molecules, ensemble_lookup, options):
    """
    Compare the performance of the first two ensembles (in sorted key order)
    of ensemble_lookup and write a summary of the differences.

    The AUC difference and, for every usable fpf value, the enrichment factor
    difference are collected and handed to output.write_diff_summary. ROC
    data and plots are produced when options.write_roc / options.plot are set.

    :param molecules: molecule objects; each ensemble is scored on its own
        deep copy
    :param ensemble_lookup: mapping from ensemble name to ensemble; at least
        two entries are required
    :param options: options object providing write_roc and plot, and passed
        on to the helper routines
    :return: None
    """
    print(" Analyzing differences ... ")
    print('')
    sort_order = classification.get_sort_order(molecules)

    # the two ensembles under comparison are the first two sorted keys
    ordered_names = sorted(ensemble_lookup.keys())
    first_name = ordered_names[0]
    second_name = ordered_names[1]

    # summary table; the header row carries the ensemble names without
    # directory part and without '.csv'
    stats = {
        'header': [
            ' ',
            os.path.basename(first_name).replace('.csv', ''),
            os.path.basename(second_name).replace('.csv', ''),
            'Difference',
            '95% CI',
            'p-value',
        ]
    }

    # each ensemble is scored on an independent copy of the molecules
    first_molecules = copy.deepcopy(molecules)
    second_molecules = copy.deepcopy(molecules)

    first_scores = classification.make_score_structure(
        first_molecules, ensemble_lookup[first_name])
    second_scores = classification.make_score_structure(
        second_molecules, ensemble_lookup[second_name])

    first_roc = classification.make_auc_structure(first_scores)
    second_roc = classification.make_auc_structure(second_scores)

    # auc difference
    stats['AUC'] = classification.calculate_auc_diff(first_roc, second_roc,
                                                     sort_order)

    # enrichment factor differences, only where both ensembles yield a
    # structure for the fpf value
    for raw_fpf in make_fpfList(options):
        fpf = float(raw_fpf)
        first_ef = classification.make_ef_structure(first_scores, fpf,
                                                    sort_order)
        second_ef = classification.make_ef_structure(second_scores, fpf,
                                                     sort_order)
        if not (first_ef and second_ef):
            continue
        stats['E%s' % fpf] = classification.calculate_ef_diff(
            first_ef, second_ef, fpf)

    # write results summary
    output.write_diff_summary(stats, options)

    # write roc curves
    if options.write_roc:
        print(" Writing ROC data ... ")
        print('')
        output.write_roc(first_roc, first_name, options)
        output.write_roc(second_roc, second_name, options)

    # plot
    if options.plot:
        print(" Making plots ... ")
        print('')
        plotter(molecules, ensemble_lookup, options)
Exemplo n.º 9
0
def plotter(molecules, ensemble_lookup, options):
    """
    Plot ROC curves for every ensemble in ensemble_lookup and save each
    figure as a PDF.

    One figure with two side-by-side panels is written per ensemble: the left
    panel shows one curve per query of the ensemble, the right panel shows
    the curve of the ensemble as a whole. The file is saved in the current
    working directory as options.outname + '_' + <ensemble name with '.csv'
    removed> + '.pdf'.

    The process exits with status 1 if matplotlib cannot be imported.

    :param molecules: molecule objects passed to
        classification.make_score_structure
    :param ensemble_lookup: mapping from ensemble name to an iterable of
        sortable query names
    :param options: options object; only options.outname is read here
    :return: None
    """

    try:
        import matplotlib

        # non-interactive backend, so the figures can be written without a display
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt
    except ImportError:
        print("\n Plotting requires matplotlib to be installed\n")
        sys.exit(1)

    def roc_points(queries):
        """Return the (fpf, tpf) point lists of the ROC curve for `queries`."""
        score_structure = classification.make_score_structure(
            molecules, queries)
        auc_structure = classification.make_auc_structure(score_structure)
        fpf = []
        tpf = []
        for mol in auc_structure:
            # NOTE(review): columns 4 and 5 are taken to hold FPF and TPF,
            # following the original variable names - confirm against
            # classification.make_auc_structure
            fpf.append(mol[4])
            tpf.append(mol[5])
        return fpf, tpf

    for ensemble in ensemble_lookup.keys():

        fig = plt.figure()
        try:
            # left panel: one curve per query of the ensemble
            ax1 = fig.add_subplot(121)
            ax1.set_xlabel('FPF')
            ax1.set_ylabel('TPF')
            ax1.set_title('query performance')

            for query in sorted(ensemble_lookup[ensemble]):
                fpf, tpf = roc_points([query])
                ax1.plot(fpf, tpf, lw=3, label=query)

            handles, labels = ax1.get_legend_handles_labels()
            ax1.legend(handles, labels, loc='best')

            # right panel: the ensemble as a whole
            fpf, tpf = roc_points(ensemble_lookup[ensemble])

            ax2 = fig.add_subplot(122)
            ax2.set_xlabel('FPF')
            ax2.set_ylabel('TPF')
            ax2.set_title('ensemble performance')
            ax2.plot(fpf, tpf, lw=3, label='ensemble')

            handles, labels = ax2.get_legend_handles_labels()
            ax2.legend(handles, labels, loc='best')

            # save figure; bbox_inches='tight' trims the surrounding whitespace
            figurename = options.outname + '_' + ensemble.replace('.csv',
                                                                  '') + '.pdf'
            filename = os.path.join(os.getcwd(), figurename)
            fig.savefig(filename, bbox_inches='tight', format='pdf')
        finally:
            # release the figure so open figures do not pile up across ensembles
            plt.close(fig)
Exemplo n.º 10
0
def compare(molecules, ensemble_lookup, options):
    """
    Compare the performance of the first two ensembles (in sorted key order)
    of ensemble_lookup and write a summary of the differences.

    The AUC difference and, for every usable fpf value, the enrichment factor
    difference are collected and handed to output.write_diff_summary. ROC
    data and plots are produced when options.write_roc / options.plot are set.

    :param molecules: molecule objects; each ensemble is scored on its own
        deep copy
    :param ensemble_lookup: mapping from ensemble name to ensemble; at least
        two entries are required
    :param options: options object providing write_roc and plot, and passed
        on to the helper routines
    :return: None
    """
    print(" Analyzing differences ... ")
    print('')
    sort_order = classification.get_sort_order(molecules)

    # the two ensembles under comparison are the first two sorted keys
    ordered_names = sorted(ensemble_lookup.keys())
    first_name = ordered_names[0]
    second_name = ordered_names[1]

    # summary table; the header row carries the ensemble names without
    # directory part and without '.csv'
    stats = {
        'header': [
            ' ',
            os.path.basename(first_name).replace('.csv', ''),
            os.path.basename(second_name).replace('.csv', ''),
            'Difference',
            '95% CI',
            'p-value',
        ]
    }

    # each ensemble is scored on an independent copy of the molecules
    first_molecules = copy.deepcopy(molecules)
    second_molecules = copy.deepcopy(molecules)

    first_scores = classification.make_score_structure(
        first_molecules, ensemble_lookup[first_name])
    second_scores = classification.make_score_structure(
        second_molecules, ensemble_lookup[second_name])

    first_roc = classification.make_auc_structure(first_scores)
    second_roc = classification.make_auc_structure(second_scores)

    # auc difference
    stats['AUC'] = classification.calculate_auc_diff(first_roc, second_roc,
                                                     sort_order)

    # enrichment factor differences, only where both ensembles yield a
    # structure for the fpf value
    for raw_fpf in make_fpfList(options):
        fpf = float(raw_fpf)
        first_ef = classification.make_ef_structure(first_scores, fpf,
                                                    sort_order)
        second_ef = classification.make_ef_structure(second_scores, fpf,
                                                     sort_order)
        if not (first_ef and second_ef):
            continue
        stats['E%s' % fpf] = classification.calculate_ef_diff(
            first_ef, second_ef, fpf)

    # write results summary
    output.write_diff_summary(stats, options)

    # write roc curves
    if options.write_roc:
        print(" Writing ROC data ... ")
        print('')
        output.write_roc(first_roc, first_name, options)
        output.write_roc(second_roc, second_name, options)

    # plot
    if options.plot:
        print(" Making plots ... ")
        print('')
        plotter(molecules, ensemble_lookup, options)
Exemplo n.º 11
0
def plotter(molecules, ensemble_lookup, options):
    """
    Plot ROC curves for every ensemble in ensemble_lookup and save each
    figure as a PDF.

    One figure with two side-by-side panels is written per ensemble: the left
    panel shows one curve per query of the ensemble, the right panel shows
    the curve of the ensemble as a whole. The file is saved in the current
    working directory as options.outname + '_' + <ensemble name with '.csv'
    removed> + '.pdf'.

    The process exits with status 1 if matplotlib cannot be imported.

    :param molecules: molecule objects passed to
        classification.make_score_structure
    :param ensemble_lookup: mapping from ensemble name to an iterable of
        sortable query names
    :param options: options object; only options.outname is read here
    :return: None
    """

    try:
        import matplotlib

        # non-interactive backend, so the figures can be written without a display
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt
    except ImportError:
        print("\n Plotting requires matplotlib to be installed\n")
        sys.exit(1)

    def roc_points(queries):
        """Return the (fpf, tpf) point lists of the ROC curve for `queries`."""
        score_structure = classification.make_score_structure(molecules, queries)
        auc_structure = classification.make_auc_structure(score_structure)
        fpf = []
        tpf = []
        for mol in auc_structure:
            # NOTE(review): columns 4 and 5 are taken to hold FPF and TPF,
            # following the original variable names - confirm against
            # classification.make_auc_structure
            fpf.append(mol[4])
            tpf.append(mol[5])
        return fpf, tpf

    for ensemble in ensemble_lookup.keys():

        fig = plt.figure()
        try:
            # left panel: one curve per query of the ensemble
            ax1 = fig.add_subplot(121)
            ax1.set_xlabel('FPF')
            ax1.set_ylabel('TPF')
            ax1.set_title('query performance')

            for query in sorted(ensemble_lookup[ensemble]):
                fpf, tpf = roc_points([query])
                ax1.plot(fpf, tpf, lw=3, label=query)

            handles, labels = ax1.get_legend_handles_labels()
            ax1.legend(handles, labels, loc='best')

            # right panel: the ensemble as a whole
            fpf, tpf = roc_points(ensemble_lookup[ensemble])

            ax2 = fig.add_subplot(122)
            ax2.set_xlabel('FPF')
            ax2.set_ylabel('TPF')
            ax2.set_title('ensemble performance')
            ax2.plot(fpf, tpf, lw=3, label='ensemble')

            handles, labels = ax2.get_legend_handles_labels()
            ax2.legend(handles, labels, loc='best')

            # save figure; bbox_inches='tight' trims the surrounding whitespace
            figurename = options.outname + '_' + ensemble.replace('.csv', '') + '.pdf'
            filename = os.path.join(os.getcwd(), figurename)
            fig.savefig(filename, bbox_inches='tight', format='pdf')
        finally:
            # release the figure so open figures do not pile up across ensembles
            plt.close(fig)