Exemplo n.º 1
0
def optimizor(molecules, sort_order, ensemble_size, options):
    """
    Score every candidate ensemble of a fixed size, write out the best one,
    and return the AUC / EF values of all candidates.

    :param molecules: forwarded to make_ensemble_list and evaluate
    :param sort_order: forwarded to evaluate
    :param ensemble_size: size of each candidate ensemble
    :param options: supplies ``ncpu`` and ``score_field``; also forwarded to
        evaluate, screener.find_best_ensemble and output.write_ensemble
    :return: tuple ``(auc_list, ef_list)``, one value per evaluated ensemble
    """
    requested_cpus = options.ncpu
    score_field = options.score_field

    # every candidate ensemble that will be scored
    candidates = make_ensemble_list(molecules, score_field, ensemble_size)

    # a falsy ncpu means "use every available core"
    ncpu = requested_cpus or multiprocessing.cpu_count()

    if ncpu <= 1:
        # serial path: evaluate everything in this process
        print("Determining the performance of {d} ensembles using {n} processor".format(d=len(candidates), n=ncpu))
        results = evaluate(molecules, candidates, sort_order, options)
    else:
        # NOTE: the message reports the processor count before it is capped below
        print("Determining the performance of {d} ensembles using {n} processors".format(d=len(candidates), n=ncpu))

        # never ask for more chunks than there are ensembles
        ncpu = min(ncpu, len(candidates))

        output_queue = multiprocessing.Queue()
        workers = []
        for chunk in chunker(candidates, ncpu):
            worker = multiprocessing.Process(
                target=evaluate,
                args=(molecules, chunk, sort_order, options, output_queue),
            )
            workers.append(worker)
            worker.start()

        # one result dictionary is read from the queue per worker; the queue
        # is drained before the workers are joined
        results = {}
        for _ in workers:
            results.update(output_queue.get())

        for worker in workers:
            worker.join()

    # pick the best performing ensemble and write it out
    best = screener.find_best_ensemble(results, options)
    output.write_ensemble(list(best), options)

    # temporary (2/9/15): collect auc values and ef at fpf = 0.001 for every
    # ensemble so that a histogram can be built from them
    entries = [results[key] for key in results.keys()]
    ef_keys = [entry.ef.keys() for entry in entries]
    # largest key of the first entry's ef mapping
    nd = max(ef_keys[0])
    scaled = 0.001 * nd
    n = int(round(scaled))
    ef_list = [entry.get_prop(n, 'ef') for entry in entries]
    auc_list = [entry.get_prop('auc') for entry in entries]
    return auc_list, ef_list
Exemplo n.º 2
0
def optimizor(molecules, sort_order, ensemble_size, options):
    """
    Evaluate all ensembles of ``ensemble_size`` queries, write the best one
    to the output, and return the AUC and EF values of every ensemble.

    :param molecules: passed to make_ensemble_list and evaluate
    :param sort_order: passed to evaluate
    :param ensemble_size: number of queries per candidate ensemble
    :param options: read for ``ncpu`` and ``score_field``; also passed to
        evaluate, screener.find_best_ensemble and output.write_ensemble
    :return: ``(auc_list, ef_list)``
    """
    cpu_setting = options.ncpu
    score_field = options.score_field

    # exhaustive list of candidate ensembles
    all_ensembles = make_ensemble_list(molecules, score_field, ensemble_size)
    total = len(all_ensembles)

    # fall back to the machine's core count when no ncpu was given
    ncpu = cpu_setting if cpu_setting else multiprocessing.cpu_count()

    if ncpu <= 1:
        message = "Determining the performance of {d} ensembles using {n} processor"
        print(message.format(d=total, n=ncpu))
        results = evaluate(molecules, all_ensembles, sort_order, options)
    else:
        # NOTE: printed before ncpu is capped to the number of ensembles
        message = "Determining the performance of {d} ensembles using {n} processors"
        print(message.format(d=total, n=ncpu))

        ncpu = min(ncpu, len(all_ensembles))

        output_queue = multiprocessing.Queue()
        processes = []
        for piece in chunker(all_ensembles, ncpu):
            proc = multiprocessing.Process(
                target=evaluate,
                args=(molecules, piece, sort_order, options, output_queue),
            )
            processes.append(proc)
            proc.start()

        # merge one partial result dictionary per started process
        results = {}
        for _ in processes:
            partial = output_queue.get()
            results.update(partial)

        # wait for every process to finish
        for proc in processes:
            proc.join()

    # select and write out the best performing ensemble
    winner = screener.find_best_ensemble(results, options)
    output.write_ensemble(list(winner), options)

    # temporary (2/9/15): gather auc values and ef at fpf = 0.001 across all
    # ensembles to build up a histogram
    stored = [results[key] for key in results.keys()]
    key_lists = [list(item.ef.keys()) for item in stored]
    # largest key in the first ensemble's ef mapping
    nd = max(key_lists[0])
    n = int(round(0.001 * nd))
    ef_list = [item.get_prop(n, 'ef') for item in stored]
    auc_list = [item.get_prop('auc') for item in stored]
    return auc_list, ef_list
Exemplo n.º 3
0
def rank_queries(molecules, ensemble, sort_order, options):
    """
    Find the query that adds the most value to an existing ensemble.

    Every key of ``molecules[0].scores`` that is not already in ``ensemble``
    is appended to a copy of the ensemble. Each extended ensemble gets an AUC
    value and one EF value per fpf from classification.make_fpfList, and
    screener.find_best_ensemble chooses among them.

    :param molecules: sequence whose first item exposes a ``scores`` mapping;
        forwarded to classification.make_score_structure
    :param ensemble: list of queries already selected (left unmodified)
    :param sort_order: forwarded to the auc and ef calculations
    :param options: forwarded to classification.make_fpfList and
        screener.find_best_ensemble
    :return: the best extended ensemble, as a list
    """
    results = {}

    # snapshot the available queries before looping
    for query in list(molecules[0].scores.keys()):
        if query in ensemble:
            continue

        storage = EnsembleStorage()

        # existing ensemble plus the query under test, as a hashable tuple
        test_ensemble = tuple(ensemble[0:] + [query])
        storage.set_prop('ensemble', test_ensemble)

        # score the extended ensemble
        score_structure = classification.make_score_structure(
            molecules, test_ensemble)

        # area under the curve
        auc_structure = classification.make_auc_structure(score_structure)
        storage.set_prop(
            'auc',
            classification.calculate_auc(auc_structure, sort_order, 'no stats'),
        )

        # enrichment factor at each requested false positive fraction
        for raw_fpf in classification.make_fpfList(options, score_structure):
            fpf = float(raw_fpf)
            ef_structure = classification.make_ef_structure(
                score_structure, fpf, sort_order)
            if not ef_structure:
                continue
            ef_result = classification.calculate_ef(ef_structure, fpf)
            storage.set_prop(ef_result[0], ef_result[1], 'ef')

        # keep the metrics, keyed by the ensemble they describe
        results[test_ensemble] = storage

    # the best performing extended ensemble wins
    return list(screener.find_best_ensemble(results, options))
Exemplo n.º 4
0
def construct_ensemble(results, ensemble, options):
    """
    Move the best entry of ``results`` into ``ensemble``.

    :param results: {'(query)' : EB.builder.utilities.ensemble_storage.EnsembleStorage object};
        the winning key is removed from it in place
    :param ensemble: list that the first element of the winning key is
        appended to, in place
    :param options: forwarded to screener.find_best_ensemble
    :return: a new list holding the contents of the updated ensemble
    """
    winner = screener.find_best_ensemble(results, options)
    # the winner is a key of results; its first element is the query to add
    ensemble.append(winner[0])
    # drop the winner so it is not selected again
    results.pop(winner)
    return list(ensemble)
Exemplo n.º 5
0
def rank_queries(molecules, ensemble, sort_order, options):
    """
    Rank the unused queries by the value they add to an existing ensemble
    and return the best resulting ensemble.

    Unused queries are the keys of ``molecules[0].scores`` not present in
    ``ensemble``. Each one is tried in turn: the ensemble plus that query is
    scored (AUC, plus an EF per fpf from classification.make_fpfList) and
    stored; screener.find_best_ensemble then selects the winner.

    :param molecules: sequence whose first item exposes a ``scores`` mapping;
        passed to classification.make_score_structure
    :param ensemble: list of already chosen queries; it is copied, not changed
    :param sort_order: passed to the auc and ef calculations
    :param options: passed to classification.make_fpfList and
        screener.find_best_ensemble
    :return: list form of the best performing extended ensemble
    """
    # queries that are not yet part of the ensemble
    available = list(molecules[0].scores.keys())
    unused_queries = [name for name in available if name not in ensemble]

    scored = {}

    for candidate in unused_queries:
        record = EnsembleStorage()

        # copy the ensemble and extend the copy with the candidate
        members = ensemble[0:]
        members.append(candidate)
        trial = tuple(members)
        record.set_prop('ensemble', trial)

        # performance of the trial ensemble
        score_structure = classification.make_score_structure(molecules, trial)

        # auc value
        auc_structure = classification.make_auc_structure(score_structure)
        auc_value = classification.calculate_auc(auc_structure, sort_order, 'no stats')
        record.set_prop('auc', auc_value)

        # one enrichment factor per false positive fraction, when available
        for fpf_item in classification.make_fpfList(options, score_structure):
            fpf = float(fpf_item)
            ef_structure = classification.make_ef_structure(score_structure, fpf, sort_order)
            if not ef_structure:
                continue
            ef_pair = classification.calculate_ef(ef_structure, fpf)
            record.set_prop(ef_pair[0], ef_pair[1], 'ef')

        # file the metrics under the trial ensemble
        scored[trial] = record

    # pick the best performing trial ensemble
    top = screener.find_best_ensemble(scored, options)
    return list(top)