Example #1
def run_cross_validation(settings, targets, classifiers, pipelines):
    print 'Cross-validation task'
    print 'Targets', ', '.join(targets)
    print 'Pipelines:\n ', '\n  '.join([p.get_name() for p in pipelines])
    print 'Classifiers', ', '.join([c[1] for c in classifiers])

    run_prepare_data_for_cross_validation(settings, targets, pipelines)

    # run on pool first, then show results after
    pool = Pool(settings.N_jobs)
    for i, pipeline in enumerate(pipelines):
        for j, (classifier, classifier_name) in enumerate(classifiers):
            for k, target in enumerate(targets):
                progress_str = 'P=%d/%d C=%d/%d T=%d/%d' % (i+1, len(pipelines), j+1, len(classifiers), k+1, len(targets))
                cross_validation_score(settings, target, pipeline, classifier, classifier_name,
                    strategy=cross_validation_strategy, pool=pool, progress_str=progress_str, return_data=False, quiet=True)
    pool.close()
    pool.join()

    summaries = []
    best = {}
    for p_num, pipeline in enumerate(pipelines):
        for c_num, (classifier, classifier_name) in enumerate(classifiers):
            mean_scores = []
            median_scores = []
            datas = []
            for target in targets:
                print 'Running %s pipeline %s classifier %s' % (target, pipeline.get_name(), classifier_name)
                data = cross_validation_score(settings, target, pipeline, classifier, classifier_name,
                    strategy=cross_validation_strategy, quiet=True)
                datas.append(data)
                if data.mean_score != data.median_score:
                    print '%.3f (mean)' % data.mean_score, data.mean_scores
                    print '%.3f (median)' % data.median_score, data.median_scores
                else:
                    print '%.3f' % data.mean_score
                mean_scores.append(data.mean_score)
                median_scores.append(data.median_score)

                best_score = best.get(target, [0, None, None, None])[0]
                cur_score = max(data.mean_score, data.median_score)
                if cur_score > best_score:
                    best[target] = [cur_score, pipeline, classifier, classifier_name]

            name = 'p=%d c=%d %s mean %s' % (p_num, c_num, classifier_name, pipeline.get_name())
            summary = get_score_summary(name, mean_scores)
            summaries.append((summary, np.mean(mean_scores)))
            print summary
            name = 'p=%d c=%d %s median %s' % (p_num, c_num, classifier_name, pipeline.get_name())
            summary = get_score_summary(name, median_scores)
            summaries.append((summary, np.mean(median_scores)))
            print summary

    print_results(summaries)

    print '\nbest'
    for target in targets:
        pipeline = best[target][1]
        classifier_name = best[target][3]
        print target, best[target][0], classifier_name, pipeline.get_names()
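
Note: in Example #1 the first pass only schedules work (pool=pool, return_data=False) so that cross_validation_score can compute, and presumably cache, its results in parallel; the second pass then repeats the same calls serially just to read the scores back. The stand-alone sketch below shows only that "schedule everything on the pool, then read the results" pattern; slow_score and the target names are illustrative stand-ins, not part of the original code.

import multiprocessing as mp

def slow_score(target):
    # stand-in for cross_validation_score: pretend to compute a per-target score
    return target, (sum(ord(c) for c in target) % 100) / 100.0

if __name__ == '__main__':
    targets = ['target_a', 'target_b', 'target_c']
    pool = mp.Pool(processes=4)
    # schedule all jobs first...
    async_results = [pool.apply_async(slow_score, (t,)) for t in targets]
    pool.close()
    pool.join()
    # ...then report the results in order once everything has finished
    for res in async_results:
        target, score = res.get()
        print('%s %.3f' % (target, score))
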
Example #2
def run_cross_validation(settings, targets, pipelines, mask_range, split_ratios, classifiers):
    pool = Pool(settings.N_jobs)
    for i, pipeline in enumerate(pipelines):
        for j, (classifier, classifier_name) in enumerate(classifiers):
            for k, target in enumerate(targets):
                pool.apply_async(cross_validation_score, [settings, target, pipeline, classifier, classifier_name], {'quiet': True})
                for split_num, split_ratio in enumerate(split_ratios):
                    masks = generate_feature_masks(settings, target, pipeline, np.max(mask_range), split_ratio, random_state=0, quiet=True)
                    for mask_num, mask in enumerate(masks):
                        progress_str = 'P=%d/%d C=%d/%d T=%d/%d S=%d/%d M=%d/%d' % (i+1, len(pipelines), j+1, len(classifiers), k+1, len(targets), split_num+1, len(split_ratios), mask_num+1, len(masks))
                        cross_validation_score(settings, target, pipeline, classifier, classifier_name, feature_mask=mask, quiet=True, return_data=False, pool=pool, progress_str=progress_str)
    pool.close()
    pool.join()
    print 'Finished cross validation mp'

    summaries = []
    for p_num, pipeline in enumerate(pipelines):
        for classifier, classifier_name in classifiers:
            scores_full = []
            scores_masked = [[[] for y in mask_range] for x in split_ratios]
            for i, target in enumerate(targets):
                run_prepare_data_for_cross_validation(settings, [target], [pipeline], quiet=True)
                data = cross_validation_score(settings, target, pipeline, classifier, classifier_name, pool=None, quiet=True)
                scores_full.append(data.mean_score)

                for split_index, split_ratio in enumerate(split_ratios):
                    masks = generate_feature_masks(settings, target, pipeline, np.max(mask_range), split_ratio, random_state=0, quiet=True)
                    for mask_index, num_masks in enumerate(mask_range):
                        predictions = []
                        y_cvs = None
                        for mask in masks[0:num_masks]:
                            data = cross_validation_score(settings, target, pipeline, classifier, classifier_name, feature_mask=mask, pool=None, quiet=True)
                            predictions.append(data.mean_predictions)
                            if y_cvs is None:
                                y_cvs = data.y_cvs
                            else:
                                for y_cv_1, y_cv_2 in zip(y_cvs, data.y_cvs):
                                    assert np.alltrue(y_cv_1 == y_cv_2)

                        predictions = np.mean(predictions, axis=0)
                        scores = [roc_auc_score(y_cv, p) for p, y_cv in zip(predictions, y_cvs)]
                        score = np.mean(scores)
                        scores_masked[split_index][mask_index].append(score)

            summary = get_score_summary('%s p=%d full' % (classifier_name, p_num), scores_full, np.mean(scores_full), targets)
            summaries.append((summary, np.mean(scores_full)))
            for split_index, split_ratio in enumerate(split_ratios):
                for mask_index, num_masks in enumerate(mask_range):
                    scores = scores_masked[split_index][mask_index]
                    summary = get_score_summary('%s p=%d split_ratio=%s masks=%d' % (classifier_name, p_num, split_ratio, num_masks), scores, np.mean(scores), targets)
                    summaries.append((summary, np.mean(scores)))
                    print summary

    print_results(summaries)
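
Note: Examples #2 and #3 ensemble several feature-masked runs by averaging their predictions across masks and only then computing the ROC AUC. Below is a minimal self-contained sketch of that averaging step for a single cross-validation fold, using synthetic labels and predictions (the real code keeps one prediction vector per fold); all names here are illustrative.

import numpy as np
from sklearn.metrics import roc_auc_score

rng = np.random.RandomState(0)

# one toy CV fold: true labels plus one prediction vector per feature mask
y_cv = rng.randint(0, 2, size=50)
mask_predictions = [y_cv * 0.6 + rng.rand(50) * 0.4 for _ in range(5)]

# average over the masks, then score the ensemble once
ensemble = np.mean(mask_predictions, axis=0)
print('per-mask AUCs: %s' % ', '.join('%.3f' % roc_auc_score(y_cv, p) for p in mask_predictions))
print('ensemble AUC: %.3f' % roc_auc_score(y_cv, ensemble))
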
Example #3
def run_cross_validation(settings, targets, pipelines, mask_range, split_ratios, classifiers):
    pool = Pool(settings.N_jobs)
    for i, pipeline in enumerate(pipelines):
        for j, (classifier, classifier_name) in enumerate(classifiers):
            for k, target in enumerate(targets):
                pool.apply_async(cross_validation_score, [settings, target, pipeline, classifier, classifier_name], {'quiet': True})
                for split_num, split_ratio in enumerate(split_ratios):
                    masks = generate_feature_masks(settings, target, pipeline, np.max(mask_range), split_ratio, random_state=0, quiet=True)
                    for mask_num, mask in enumerate(masks):
                        progress_str = 'P=%d/%d C=%d/%d T=%d/%d S=%d/%d M=%d/%d' % (i+1, len(pipelines), j+1, len(classifiers), k+1, len(targets), split_num+1, len(split_ratios), mask_num+1, len(masks))
                        cross_validation_score(settings, target, pipeline, classifier, classifier_name, feature_mask=mask, quiet=True, return_data=False, pool=pool, progress_str=progress_str)
    pool.close()
    pool.join()
    print('Finished cross validation mp')

    summaries = []
    for p_num, pipeline in enumerate(pipelines):
        for classifier, classifier_name in classifiers:
            scores_full = []
            scores_masked = [[[] for y in mask_range] for x in split_ratios]
            for i, target in enumerate(targets):
                run_prepare_data_for_cross_validation(settings, [target], [pipeline], quiet=True)
                data = cross_validation_score(settings, target, pipeline, classifier, classifier_name, pool=None, quiet=True)
                scores_full.append(data.mean_score)

                for split_index, split_ratio in enumerate(split_ratios):
                    masks = generate_feature_masks(settings, target, pipeline, np.max(mask_range), split_ratio, random_state=0, quiet=True)
                    for mask_index, num_masks in enumerate(mask_range):
                        predictions = []
                        y_cvs = None
                        for mask in masks[0:num_masks]:
                            data = cross_validation_score(settings, target, pipeline, classifier, classifier_name, feature_mask=mask, pool=None, quiet=True)
                            predictions.append(data.mean_predictions)
                            if y_cvs is None:
                                y_cvs = data.y_cvs
                            else:
                                for y_cv_1, y_cv_2 in zip(y_cvs, data.y_cvs):
                                    assert np.alltrue(y_cv_1 == y_cv_2)

                        predictions = np.mean(predictions, axis=0)
                        scores = [roc_auc_score(y_cv, p) for p, y_cv in zip(predictions, y_cvs)]
                        score = np.mean(scores)
                        scores_masked[split_index][mask_index].append(score)

            summary = get_score_summary('%s p=%d full' % (classifier_name, p_num), scores_full)
            summaries.append((summary, np.mean(scores_full)))
            for split_index, split_ratio in enumerate(split_ratios):
                for mask_index, num_masks in enumerate(mask_range):
                    scores = scores_masked[split_index][mask_index]
                    summary = get_score_summary('%s p=%d split_ratio=%s masks=%d' % (classifier_name, p_num, split_ratio, num_masks), scores)
                    summaries.append((summary, np.mean(scores)))
                    print(summary)

    print_results(summaries)
Example #4
def evaluate_fitness_score(settings, target, pipeline, classifier, classifier_name, quiet, arg):
    individual, best_score = arg
    if np.sum(individual) == 0:
        score = 0.0
    else:
        score = float(cross_validation_score(settings, target, pipeline, classifier, classifier_name,
            strategy=cross_validation_strategy, feature_mask=individual, quiet=True).mean_score)

    if score > best_score:
        if not quiet: print score, np.sum(individual)
    return score,
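
Note: evaluate_fitness_score returns a one-element tuple because DEAP stores fitness values as tuples (one value per weight), and its leading arguments (settings, target, pipeline, ...) are bound when the function is registered on the toolbox, so DEAP only has to pass the (individual, best_score) pair. A tiny sketch of that wiring, with a placeholder scoring rule instead of cross_validation_score and illustrative argument values:

from deap import base

def evaluate(settings, target, arg):
    individual, best_score = arg
    # placeholder: score is just the fraction of features kept
    score = float(sum(individual)) / len(individual)
    return score,  # DEAP expects a tuple, one value per fitness weight

toolbox = base.Toolbox()
# leading arguments are bound here; later calls only supply the (individual, best_score) pair
toolbox.register('evaluate', evaluate, {'n_jobs': 4}, 'target_a')
print(toolbox.evaluate(([1, 0, 1, 1], 0.5)))
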
Example #5
def evaluate_fitness_score(settings, target, pipeline, classifier,
                           classifier_name, quiet, arg):
    individual, best_score = arg
    if np.sum(individual) == 0:
        score = 0.0
    else:
        score = float(
            cross_validation_score(settings,
                                   target,
                                   pipeline,
                                   classifier,
                                   classifier_name,
                                   strategy=cross_validation_strategy,
                                   feature_mask=individual,
                                   quiet=True).mean_score)

    if score > best_score:
        if not quiet: print score, np.sum(individual)
    return score,
Example #6
def process_target(settings,
                   target,
                   pipeline,
                   classifier,
                   classifier_name,
                   ratio,
                   ngen,
                   quiet,
                   threshold=400):
    # make results repeatable
    random.seed(0)

    num_features, num_training_examples = get_pipeline_data(
        settings, target, pipeline)

    # Sub-feature selection for the human patients appears to perform worse than using the
    # full feature set, possibly because there are not enough training samples for this
    # technique to work effectively. So do not run the GA if there are too few training
    # samples; the threshold parameter can be tweaked with more testing.
    if num_training_examples < threshold:
        score = float(
            cross_validation_score(settings,
                                   target,
                                   pipeline,
                                   classifier,
                                   classifier_name,
                                   strategy=cross_validation_strategy,
                                   quiet=True).mean_score)
        return score, [[1] * num_features]

    num_wanted_features = int(num_features * ratio)
    if not quiet: print 'ratio', ratio
    if not quiet: print 'num features', num_features
    if not quiet: print 'num wanted features', num_wanted_features

    if not quiet: print target, classifier_name

    pool = Pool(settings.N_jobs)

    toolbox = base.Toolbox()
    toolbox.register("map", pool.map)
    toolbox.register("attr_bool", random_bool, ratio)
    toolbox.register("individual", tools.initRepeat, creator.Individual,
                     toolbox.attr_bool, num_features)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)

    toolbox.register("evaluate", evaluate_fitness_score, settings, target,
                     pipeline, classifier, classifier_name, quiet)
    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
    toolbox.register("select", tools.selTournament, tournsize=3)

    pop = toolbox.population(n=30)
    CXPB, MUTPB, NGEN = 0.5, 0.2, ngen

    best_score = 0
    best_feature_mask = None
    all_feature_masks = {}

    # Evaluate the entire population
    if not quiet: print 'evaluating pop %d' % len(pop)
    fitnesses = toolbox.map(toolbox.evaluate, [(ind, 1.0) for ind in pop])
    if not quiet: print 'done evaluating'

    for ind, fit in zip(pop, fitnesses):
        ind.fitness.values = fit
        all_feature_masks[calc_feature_mask_string(ind)] = (list(ind), fit[0])

    # calc first best
    fits = [ind.fitness.values[0] for ind in pop]
    best_index = np.argmax(fits)
    score = fits[best_index]
    if score > best_score:
        best_score = score
        best_feature_mask = pop[best_index]
        if not quiet: print 'new best', best_score, np.sum(best_feature_mask)

    # Begin the evolution
    for g in range(NGEN):
        if not quiet: print("-- %s: Generation %i --" % (target, g))

        # Select the next generation individuals
        offspring = toolbox.select(pop, int(len(pop)))
        # Clone the selected individuals
        offspring = list(toolbox.map(toolbox.clone, offspring))

        # Apply crossover and mutation on the offspring
        for child1, child2 in zip(offspring[::2], offspring[1::2]):
            if random.random() < CXPB:
                toolbox.mate(child1, child2)
                del child1.fitness.values
                del child2.fitness.values

        for mutant in offspring:
            if random.random() < MUTPB:
                toolbox.mutate(mutant)
                del mutant.fitness.values

        # Evaluate the individuals with an invalid fitness
        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
        fitnesses = toolbox.map(toolbox.evaluate,
                                [(ind, best_score) for ind in invalid_ind])
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit
            all_feature_masks[calc_feature_mask_string(ind)] = (list(ind),
                                                                fit[0])

        if not quiet:
            print("  Evaluated %i individuals (pop size %d)" %
                  (len(invalid_ind), len(offspring)))

        # The population is entirely replaced by the offspring
        pop[:] = offspring

        # Gather all the fitnesses in one list and print the stats
        fits = [ind.fitness.values[0] for ind in pop]
        best_index = np.argmax(fits)
        all_f = [np.sum(ind) for ind in pop]
        if not quiet:
            print '  %s, %s, %s (%d-%d)' % (target, fits[best_index],
                                            np.sum(pop[best_index]),
                                            np.min(all_f), np.max(all_f))

        length = len(pop)
        mean = sum(fits) / length

        if not quiet: print("  Min %s" % min(fits))
        if not quiet: print("  Max %s" % max(fits))
        if not quiet: print("  Avg %s" % mean)

        score = fits[best_index]
        if score > best_score:
            best_score = score
            best_feature_mask = pop[best_index]
            if not quiet:
                print 'new best', best_score, np.sum(best_feature_mask)

    if not quiet: print("-- End of (successful) evolution --")

    best_ind = tools.selBest(pop, 1)[0]
    if not quiet:
        print "-- Finished --\n%s\n%s\n%s" % (
            target, best_ind.fitness.values[0], best_ind)

    pop = list(all_feature_masks.values())
    pop.sort(cmp=lambda x1, x2: cmp(x2[1], x1[1]))
    sorted_pop = [ind for ind, score in pop]
    print target, 'best', pop[0][1], 'worst', pop[-1][1]

    return best_score, sorted_pop
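
Note: Examples #6 and #7 run a standard DEAP generational loop (tournament selection, two-point crossover, bit-flip mutation) over boolean feature masks, with the fitness coming from cross_validation_score. The self-contained sketch below reproduces the same loop with a toy fitness and no multiprocessing so it can be run as-is; the creator.create calls for FitnessMax/Individual, which the original presumably performs elsewhere, are included here, and mask_fitness / NUM_FEATURES are illustrative only.

import random
from deap import base, creator, tools

NUM_FEATURES = 20

def mask_fitness(individual):
    # toy objective: reward keeping the first ten ("useful") features, penalise the rest
    useful = sum(individual[:10])
    noisy = sum(individual[10:])
    return useful - 0.5 * noisy,

random.seed(0)

creator.create('FitnessMax', base.Fitness, weights=(1.0,))
creator.create('Individual', list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
toolbox.register('attr_bool', random.randint, 0, 1)
toolbox.register('individual', tools.initRepeat, creator.Individual,
                 toolbox.attr_bool, NUM_FEATURES)
toolbox.register('population', tools.initRepeat, list, toolbox.individual)
toolbox.register('evaluate', mask_fitness)
toolbox.register('mate', tools.cxTwoPoint)
toolbox.register('mutate', tools.mutFlipBit, indpb=0.05)
toolbox.register('select', tools.selTournament, tournsize=3)

pop = toolbox.population(n=30)
CXPB, MUTPB, NGEN = 0.5, 0.2, 10

# evaluate the initial population
for ind, fit in zip(pop, map(toolbox.evaluate, pop)):
    ind.fitness.values = fit

for gen in range(NGEN):
    # select and clone the next generation
    offspring = list(map(toolbox.clone, toolbox.select(pop, len(pop))))
    # crossover and mutation invalidate the affected fitnesses
    for child1, child2 in zip(offspring[::2], offspring[1::2]):
        if random.random() < CXPB:
            toolbox.mate(child1, child2)
            del child1.fitness.values
            del child2.fitness.values
    for mutant in offspring:
        if random.random() < MUTPB:
            toolbox.mutate(mutant)
            del mutant.fitness.values
    # re-evaluate only the individuals whose fitness was invalidated
    invalid = [ind for ind in offspring if not ind.fitness.valid]
    for ind, fit in zip(invalid, map(toolbox.evaluate, invalid)):
        ind.fitness.values = fit
    # the population is entirely replaced by the offspring
    pop[:] = offspring

best = tools.selBest(pop, 1)[0]
print('best fitness %.2f, mask keeps %d/%d features' %
      (best.fitness.values[0], sum(best), NUM_FEATURES))
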
Example #7
def process_target(settings, target, pipeline, classifier, classifier_name, ratio, ngen, quiet, threshold=400):
    # make results repeatable
    random.seed(0)

    num_features, num_training_examples = get_pipeline_data(settings, target, pipeline)

    # Sub-feature selection for the human patients appears to perform worse than using the
    # full feature set, possibly because there are not enough training samples for this
    # technique to work effectively. So do not run the GA if there are too few training
    # samples; the threshold parameter can be tweaked with more testing.
    if num_training_examples < threshold:
        score = float(cross_validation_score(settings, target, pipeline, classifier, classifier_name,
            strategy=cross_validation_strategy, quiet=True).mean_score)
        return score, [[1] * num_features]

    num_wanted_features = int(num_features * ratio)
    if not quiet: print 'ratio', ratio
    if not quiet: print 'num features', num_features
    if not quiet: print 'num wanted features', num_wanted_features

    if not quiet: print target, classifier_name

    pool = Pool(settings.N_jobs)

    toolbox = base.Toolbox()
    toolbox.register("map", pool.map)
    toolbox.register("attr_bool", random_bool, ratio)
    toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, num_features)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)

    toolbox.register("evaluate", evaluate_fitness_score, settings, target, pipeline, classifier, classifier_name, quiet)
    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
    toolbox.register("select", tools.selTournament, tournsize=3)

    pop = toolbox.population(n=30)
    CXPB, MUTPB, NGEN = 0.5, 0.2, ngen

    best_score = 0
    best_feature_mask = None
    all_feature_masks = {}

    # Evaluate the entire population
    if not quiet: print 'evaluating pop %d' % len(pop)
    fitnesses = toolbox.map(toolbox.evaluate, [(ind, 1.0) for ind in pop])
    if not quiet: print 'done evaluating'

    for ind, fit in zip(pop, fitnesses):
        ind.fitness.values = fit
        all_feature_masks[calc_feature_mask_string(ind)] = (list(ind), fit[0])

    # calc first best
    fits = [ind.fitness.values[0] for ind in pop]
    best_index = np.argmax(fits)
    score = fits[best_index]
    if score > best_score:
        best_score = score
        best_feature_mask = pop[best_index]
        if not quiet: print 'new best', best_score, np.sum(best_feature_mask)

    # Begin the evolution
    for g in range(NGEN):
        if not quiet: print("-- %s: Generation %i --" % (target, g))

        # Select the next generation individuals
        offspring = toolbox.select(pop, int(len(pop)))
        # Clone the selected individuals
        offspring = list(toolbox.map(toolbox.clone, offspring))

        # Apply crossover and mutation on the offspring
        for child1, child2 in zip(offspring[::2], offspring[1::2]):
            if random.random() < CXPB:
                toolbox.mate(child1, child2)
                del child1.fitness.values
                del child2.fitness.values

        for mutant in offspring:
            if random.random() < MUTPB:
                toolbox.mutate(mutant)
                del mutant.fitness.values

        # Evaluate the individuals with an invalid fitness
        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
        fitnesses = toolbox.map(toolbox.evaluate, [(ind, best_score) for ind in invalid_ind])
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit
            all_feature_masks[calc_feature_mask_string(ind)] = (list(ind), fit[0])

        if not quiet: print("  Evaluated %i individuals (pop size %d)" % (len(invalid_ind), len(offspring)))

        # The population is entirely replaced by the offspring
        pop[:] = offspring

        # Gather all the fitnesses in one list and print the stats
        fits = [ind.fitness.values[0] for ind in pop]
        best_index = np.argmax(fits)
        all_f = [np.sum(ind) for ind in pop]
        if not quiet: print '  %s, %s, %s (%d-%d)' % (target, fits[best_index], np.sum(pop[best_index]), np.min(all_f), np.max(all_f))

        length = len(pop)
        mean = sum(fits) / length

        if not quiet: print("  Min %s" % min(fits))
        if not quiet: print("  Max %s" % max(fits))
        if not quiet: print("  Avg %s" % mean)

        score = fits[best_index]
        if score > best_score:
            best_score = score
            best_feature_mask = pop[best_index]
            if not quiet: print 'new best', best_score, np.sum(best_feature_mask)

    if not quiet: print("-- End of (successful) evolution --")

    best_ind = tools.selBest(pop, 1)[0]
    if not quiet: print "-- Finished --\n%s\n%s\n%s" % (target, best_ind.fitness.values[0], best_ind)

    pop = list(all_feature_masks.values())
    pop.sort(cmp=lambda x1, x2: cmp(x2[1], x1[1]))
    sorted_pop = [ind for ind, score in pop]
    print target, 'best', pop[0][1], 'worst', pop[-1][1]

    return best_score, sorted_pop
Example #8
def run_cross_validation(settings, targets, classifiers, pipelines):
    print 'Cross-validation task'
    print 'Targets', ', '.join(targets)
    print 'Pipelines:\n ', '\n  '.join([p.get_name() for p in pipelines])
    print 'Classifiers', ', '.join([c[1] for c in classifiers])

    run_prepare_data_for_cross_validation(settings, targets, pipelines)

    # run on pool first, then show results after
    pool = Pool(settings.N_jobs)
    for i, pipeline in enumerate(pipelines):
        for j, (classifier, classifier_name) in enumerate(classifiers):
            for k, target in enumerate(targets):
                progress_str = 'P=%d/%d C=%d/%d T=%d/%d' % (
                    i + 1, len(pipelines), j + 1, len(classifiers), k + 1,
                    len(targets))
                cross_validation_score(settings,
                                       target,
                                       pipeline,
                                       classifier,
                                       classifier_name,
                                       strategy=cross_validation_strategy,
                                       pool=pool,
                                       progress_str=progress_str,
                                       return_data=False,
                                       quiet=True)
    pool.close()
    pool.join()

    summaries = []
    best = {}
    for p_num, pipeline in enumerate(pipelines):
        for c_num, (classifier, classifier_name) in enumerate(classifiers):
            mean_scores = []
            median_scores = []
            datas = []
            for target in targets:
                print 'Running %s pipeline %s classifier %s' % (
                    target, pipeline.get_name(), classifier_name)
                data = cross_validation_score(
                    settings,
                    target,
                    pipeline,
                    classifier,
                    classifier_name,
                    strategy=cross_validation_strategy,
                    quiet=True)
                datas.append(data)
                if data.mean_score != data.median_score:
                    print '%.3f (mean)' % data.mean_score, data.mean_scores
                    print '%.3f (median)' % data.median_score, data.median_scores
                else:
                    print '%.3f' % data.mean_score
                mean_scores.append(data.mean_score)
                median_scores.append(data.median_score)

                best_score = best.get(target, [0, None, None, None])[0]
                cur_score = max(data.mean_score, data.median_score)
                if cur_score > best_score:
                    best[target] = [
                        cur_score, pipeline, classifier, classifier_name
                    ]

            name = 'p=%d c=%d %s mean %s' % (p_num, c_num, classifier_name,
                                             pipeline.get_name())
            summary = get_score_summary(name, mean_scores)
            summaries.append((summary, np.mean(mean_scores)))
            print summary
            name = 'p=%d c=%d %s median %s' % (p_num, c_num, classifier_name,
                                               pipeline.get_name())
            summary = get_score_summary(name, median_scores)
            summaries.append((summary, np.mean(median_scores)))
            print summary

    print_results(summaries)

    print '\nbest'
    for target in targets:
        pipeline = best[target][1]
        classifier_name = best[target][3]
        print target, best[target][0], classifier_name, pipeline.get_names()
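
Note: Examples #1 and #8 keep a per-target "best" record, scoring each pipeline/classifier combination by whichever of its mean or median cross-validation score is higher. The same bookkeeping on its own, with made-up target, pipeline and classifier names:

# toy results: (target, pipeline_name, classifier_name, mean_score, median_score)
results = [
    ('target_a', 'pipeline_x', 'lr', 0.71, 0.69),
    ('target_a', 'pipeline_x', 'svc', 0.74, 0.76),
    ('target_b', 'pipeline_x', 'lr', 0.80, 0.78),
]

best = {}
for target, pipeline_name, classifier_name, mean_score, median_score in results:
    best_score = best.get(target, (0, None, None))[0]
    cur_score = max(mean_score, median_score)  # keep whichever summary is higher
    if cur_score > best_score:
        best[target] = (cur_score, pipeline_name, classifier_name)

for target in sorted(best):
    score, pipeline_name, classifier_name = best[target]
    print('%s %.3f %s %s' % (target, score, classifier_name, pipeline_name))
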