Example #1
def evaluateIndividualCorrelation(individual,
                                  afs_tool):  # evaluation function for DEAP
    debug_pr(
        "Getting feature {}...".format(
            utils.string_replace_using_dict(str(individual),
                                            afs_tool.arg_dict)), 4)
    col = afs_tool.get_feature_col(individual)
    toc(debug_level=4)
    debug_pr("Computing correlation...".format(str(individual)), 4)
    if isinstance(col, np.ndarray):
        type_ = type(col[0])
    else:
        type_ = type(col.values[0])
    if issubclass(type_, np.number):
        corr_coef = compute_correlation(col, afs_tool.target)
    elif issubclass(type_, (bool, np.bool_)):
        col = col.astype(float)
        corr_coef = compute_correlation(col, afs_tool.target)
    else:  # string-like / categorical column
        if col.nunique() > 16:  # TODO: parameterize the maximum number of columns
            return (0., )

        ohe_df = pd.get_dummies(col)

        # Taking the maximum correlation coefficient among OHE columns
        corr_coef = -1
        for col_key in ohe_df:
            temp = compute_correlation(ohe_df[col_key], afs_tool.target)
            if (temp > corr_coef):
                corr_coef = temp
    toc(debug_level=4)
    return (corr_coef, )
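compute_correlation is called above but not defined in these snippets. Below is a minimal sketch of one plausible implementation, using the absolute Pearson correlation on numeric 1-D inputs; the helper name, signature, and behavior here are assumptions, not the original code.

import numpy as np


def compute_correlation(col, target):
    # Hypothetical helper (assumed, not the original implementation):
    # absolute Pearson correlation between a feature column and the target.
    col = np.asarray(col, dtype=float)
    target = np.asarray(target, dtype=float)
    if np.std(col) == 0 or np.std(target) == 0:
        return 0.0  # a constant column carries no signal
    return float(abs(np.corrcoef(col, target)[0, 1]))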
Example #2
def evaluateCorrelation(population, afs_tool, comparison_set='bfs'):
    """
    Method for evaluation population fitness using correlation with the target variable
    
    @param population: population to evaluate
    @param afs_tool: AFSTool object (holds the data for the given population)
    @param comparison_set: collection of individuals to compare for correlation penalty calculations
        None: no comparison, compute a univariate correlation
        'bfs': compare to current Best Feature Set
        'hof': compare to the current Hall of Fame

    Return number of .apply operations needed (params population and afs_tool are modified)
    
    """
    evals_before = afs_tool.eval_counter
    debug_pr("Starting population evaluate.", 3)
    if comparison_set is None:
        fitnesses = map(lambda x: evaluateIndividualCorrelation(x, afs_tool),
                        population)
    else:
        fitnesses = map(
            lambda x: evaluateIndividualDiversityScore(
                x, afs_tool, comparison_set), population)
    for ind, fit in zip(population, fitnesses):
        ind.fitness.values = fit
    evals_after = afs_tool.eval_counter

    return evals_after - evals_before
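A brief usage sketch for evaluateCorrelation; `population`, `afs_tool`, and `toolbox` are assumed to be set up elsewhere, and `debug_pr` is the logging helper used throughout these snippets.

# Assumed call site: population, afs_tool, and toolbox exist elsewhere.
apply_ops = evaluateCorrelation(population, afs_tool, comparison_set='bfs')
debug_pr("apply operations this generation: {}".format(apply_ops), 3)
population[:] = toolbox.select(population, len(population))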
Example #3
def evaluateModelWrapper(population, afs_tool, classifier):
    """
    Method for evaluation population fitness using feature importance from a model
    
    Parameters
    ----------
    population: population to evaluate
    afs_tool: AFSTool object (holds the data for the given population)
    classifier: sklearn classifier (i.e. random forest)

    Returns
    -------
    Return number of .apply operations needed (params population and afs_tool are modified)
    
    """

    evals_before = afs_tool.eval_counter

    wrapper = ModelValidationWrapper(classifier)
    data_frame = afs_tool.get_feature_set_dataframe(population)
    wrapper.train(data_frame, afs_tool.target, pos_label=1)

    fitnesses = (wrapper.cross_validation_output
                 .feature_importance_summary.coef_abs.values.tolist())

    # Map each column name to its importance score
    fitness_dict = dict(zip(data_frame, fitnesses))

    for ind in population:
        key = str(ind)
        if (key in fitness_dict):
            fit_tuple = (fitness_dict[key], )
        else:
            fit_tuple = (0.0, )
            debug_pr(
                'WARNING: feature {} did not have a fitness score.'.format(
                    key), 1)
        ind.fitness.values = fit_tuple
    evals_after = afs_tool.eval_counter

    return evals_after - evals_before
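A short call-site sketch, assuming `population` and `afs_tool` already exist; the random forest here is one example of a classifier whose cross-validation output exposes feature importances.

from sklearn.ensemble import RandomForestClassifier

# Assumed call site: population and afs_tool come from the surrounding run.
clf = RandomForestClassifier(n_estimators=100, random_state=0)
apply_ops = evaluateModelWrapper(population, afs_tool, clf)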
Example #4
def cxJoinWithRandomOp(ind1, ind2, pset):
    """Combine two subtrees.
    :param ind1: First tree participating in the crossover.
    :param ind2: Second tree participating in the crossover.
    :returns: A tuple of one tree. If ind1 and ind2 can be combined
     by a valid operator, that result is returned. If they cannot be
     combined, ind1 is returned.
    """

    # TODO: if commutative, swap (if necessary) to make ind1 come first lexicographically

    index = 0  # the root index
    node1 = ind1[index]
    node2 = ind2[index]
    choice = random.choice
    type_ = pset.ret
    if isinstance(type_, list):
        primitiveList = [
            primitive for type_element in type_
            for primitive in pset.primitives[type_element]
        ]
    else:
        primitiveList = pset.primitives[type_]
    primitives = [
        p for p in primitiveList
        if (p.arity == 2 and node1.ret is p.args[0] and node2.ret is p.args[1])
    ]
    if len(primitives) == 0:
        debug_pr(
            'No match for argument list ' + str(node1.ret) + ', ' +
            str(node2.ret), 3)
        return ind1, ind1
    new_node = choice(primitives)
    try:
        # Both offspring are built from the same joined expression string.
        new_string1 = new_node.format(ind1, ind2)
        Individual = type(ind1)
        new_individual1 = Individual(
            gp.PrimitiveTree.from_string(new_string1, pset))
        new_string2 = new_node.format(ind1, ind2)
        new_individual2 = Individual(
            gp.PrimitiveTree.from_string(new_string2, pset))
        return new_individual1, new_individual2
    except TypeError as inst:
        debug_pr("Exception creating gp.PrimitiveTree from String!", 1)
        debug_pr(inst, 1)
        return ind1, ind1
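A sketch of how this operator might be wired into a DEAP toolbox; `pset` and `toolbox` are assumed to be built elsewhere in the module.

# Assumed registration: pset and toolbox are defined elsewhere.
toolbox.register("mate", cxJoinWithRandomOp, pset=pset)
# During variation DEAP then calls: child1, child2 = toolbox.mate(ind1, ind2)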
Example #5
def evaluateIndividualDiversityScore(
        individual,
        afs_tool,
        comparison_set_str='bfs'):  # evaluation function for DEAP
    '''
    Finds a "diversity score" based on the difference
     (correlation with target) - (correlation with the most similar feature
     in the comparison set)
    '''

    if type(comparison_set_str) is str:
        if comparison_set_str == 'bfs':
            comparison_set = afs_tool.bfs
        elif comparison_set_str == 'hof':
            comparison_set = afs_tool.hof
        else:
            raise ValueError(
                'comparison_set_str value must be \'bfs\' or \'hof\'.')
    else:
        raise TypeError('comparison_set_str must be a string')
    comparison_set = list(comparison_set)
    debug_pr("Getting feature {}...".format(str(individual)), 4)

    # tic()

    corr_coef = get_feature_correlation(individual, afs_tool.target_name,
                                        afs_tool)

    # if individual in comparison_set or len(comparison_set)<1:
    if len(comparison_set) < 1:
        score = corr_coef
    else:
        most_similar_individual = None
        most_similar_comparison_index = 0
        highest_corr = -1
        for comparison_index, comparison_indiv in enumerate(comparison_set):
            if (equals(individual, comparison_indiv)):
                continue
            corr = get_feature_correlation(individual, comparison_indiv,
                                           afs_tool)
            if corr > highest_corr:
                highest_corr = corr
                most_similar_individual = comparison_indiv
                most_similar_comparison_index = comparison_index
        if most_similar_individual is None:
            score = corr_coef
        else:
            similar_feature_target_corr = get_feature_correlation(
                most_similar_individual, afs_tool.target_name, afs_tool)
            if comparison_set_str == 'hof':
                if (corr_coef > similar_feature_target_corr
                        and individual not in afs_tool.hof):
                    afs_tool.hof.remove(most_similar_comparison_index)
                    afs_tool.hof.insert(individual)
                    most_similar_individual.fitness.values = (
                        similar_feature_target_corr - highest_corr, )
                    score = corr_coef
                else:
                    score = corr_coef - highest_corr
            elif corr_coef > similar_feature_target_corr:
                score = corr_coef
            else:
                score = corr_coef - highest_corr
    # print(score)
    # toc()
    return (score, )
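A sketch of registering the diversity score as the DEAP evaluation function; `toolbox` and `afs_tool` are assumed to exist, and the extra keyword arguments are bound by DEAP's toolbox.register (which applies functools.partial).

# Assumed wiring: toolbox and afs_tool are created elsewhere.
toolbox.register("evaluate", evaluateIndividualDiversityScore,
                 afs_tool=afs_tool, comparison_set_str='hof')
# Each individual then receives a one-element fitness tuple:
# ind.fitness.values = toolbox.evaluate(ind)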