예제 #1
0
def symbolic_features(p_x, p_y, p_train_end='01-01-2019'):
    """
    Create non-linear regressors with a genetic-programming transformer.

    The SymbolicTransformer is fitted only on the rows selected by the label
    slice ``[:p_train_end]`` and is then applied to every row of ``p_x``.

    Parameters
    ----------
    p_x: pd.DataFrame
        with regressors or predictor variables. Its index must support label
        slicing with ``p_train_end`` (presumably a DatetimeIndex -- confirm
        against the caller).

    p_y: pd.DataFrame
        with variable to predict, indexed like ``p_x``

    p_train_end: str
        last index label included in the training slice
        (default '01-01-2019', the value previously hard-coded)

    Returns
    -------
    results: dict
        {'fit': np.hstack of p_x and the generated symbolic features,
         'params': output of model.get_params(),
         'model': the fitted SymbolicTransformer}

    """
    model = SymbolicTransformer(function_set=[
        "sub", "add", 'inv', 'mul', 'div', 'abs', 'log', "max", "min", "sin",
        "cos"
    ],
                                population_size=5000,
                                hall_of_fame=100,
                                n_components=20,
                                generations=20,
                                tournament_size=20,
                                stopping_criteria=.05,
                                const_range=None,
                                init_depth=(4, 12),
                                metric='pearson',
                                parsimony_coefficient=0.001,
                                p_crossover=0.4,
                                p_subtree_mutation=0.2,
                                p_hoist_mutation=0.1,
                                p_point_mutation=0.3,
                                p_point_replace=.05,
                                verbose=1,
                                random_state=None,
                                n_jobs=-1,
                                feature_names=p_x.columns,
                                warm_start=True)

    # Fit on the training window only. The transformed training rows were
    # never used, so plain fit() avoids computing and discarding them.
    model.fit(p_x[:p_train_end], p_y[:p_train_end])
    model_params = model.get_params()

    # Apply the evolved programs to the full sample (train and later rows).
    gp_features = model.transform(p_x)

    # Original regressors followed by the generated symbolic features.
    model_fit = np.hstack((p_x, gp_features))
    results = {'fit': model_fit, 'params': model_params, 'model': model}

    return results
예제 #2
0
def symbolic_features(p_x, p_y):
    """
    Create non-linear regressors with a genetic-programming transformer.

    Parameters
    ----------
    p_x: pd.DataFrame
        with regressors or predictor variables
        p_x = data_features.iloc[0:30, 3:]

    p_y: pd.DataFrame
        with variable to predict
        p_y = data_features.iloc[0:30, 1]

    Returns
    -------
    results: dict
        {'fit': array returned by model.fit_transform(p_x, p_y),
         'params': output of model.get_params(),
         'model': the fitted SymbolicTransformer,
         'data': 'fit' rounded to 6 decimals, as a pd.DataFrame,
         'best_programs': pd.DataFrame with one row per best program
                          (raw_fitness, reg_fitness, expression, depth,
                          length), de-duplicated by expression and sorted by
                          reg_fitness in descending order,
         'details': the fitted model's run_details_ attribute}

    """

    # symbolic variable generator
    model = SymbolicTransformer(
        function_set=["sub", "add", 'inv', 'mul', 'div', 'abs', 'log'],
        population_size=12000,
        hall_of_fame=300,
        n_components=30,
        generations=4,
        tournament_size=600,
        stopping_criteria=.75,
        const_range=None,
        init_method='half and half',
        init_depth=(4, 20),
        metric='pearson',
        parsimony_coefficient=0.001,
        p_crossover=0.4,
        p_subtree_mutation=0.3,
        p_hoist_mutation=0.1,
        p_point_mutation=0.2,
        p_point_replace=0.2,
        verbose=1,
        random_state=None,
        n_jobs=-1,
        feature_names=p_x.columns,
        warm_start=True)

    # SymbolicTransformer fit
    model_fit = model.fit_transform(p_x, p_y)

    # output data of the model
    data = pd.DataFrame(np.round(model_fit, 6))

    # parameters of the model
    model_params = model.get_params()

    # best programs, keyed by position; enumerate avoids the repeated
    # list.index() scan over the whole program list for every program
    best_programs = {
        'sym_' + str(position): {
            'raw_fitness': program.raw_fitness_,
            'reg_fitness': program.fitness_,
            'expression': str(program),
            'depth': program.depth_,
            'length': program.length_
        }
        for position, program in enumerate(model._best_programs)
    }

    # formatting, drop duplicates and sort by reg_fitness
    best_programs = pd.DataFrame(best_programs).T
    best_programs = best_programs.drop_duplicates(subset=['expression'])
    best_programs = best_programs.sort_values(by='reg_fitness',
                                              ascending=False)

    # results
    results = {
        'fit': model_fit,
        'params': model_params,
        'model': model,
        'data': data,
        'best_programs': best_programs,
        'details': model.run_details_
    }

    return results
예제 #3
0
def symbolic_features(p_x, p_y, p_params):
    """
    Feature engineering process with symbolic variables by using genetic programming.
    Parameters
    ----------
    p_x: pd.DataFrame / np.array / list
        with regressors or predictor variables
        p_x = data_features.iloc[:, 1:]
        When p_x has a ``columns`` attribute it is used for feature names;
        otherwise no feature names are passed to the transformer.
    p_y: pd.DataFrame / np.array / list
        with variable to predict
        p_y = data_features.iloc[:, 0]
    p_params: dict
        with parameters for the genetic programming function
        p_params = {'functions': ["sub", "add", 'inv', 'mul', 'div', 'abs', 'log'],
        'population': 5000, 'tournament':20, 'hof': 20, 'generations': 5, 'n_features':20,
        'init_depth': (4,8), 'init_method': 'half and half', 'parsimony': 0.1, 'constants': None,
        'metric': 'pearson', 'metric_goal': 0.65,
        'prob_cross': 0.4, 'prob_mutation_subtree': 0.3,
        'prob_mutation_hoist': 0.1, 'prob_mutation_point': 0.2,
        'max_samples': 1.0,
        'verbose': True, 'random_cv': None, 'parallelization': True, 'warm_start': True }
        'max_samples' is optional and falls back to 1.0 when absent.
        'random_cv' is not read by this function; random_state is fixed at 123.
    Returns
    -------
    results: dict
        With response information
        {'fit': model fitted, 'params': model parameters, 'model': model,
         'data': generated data with variables, 'best_programs': models best programs
         (sorted by raw_fitness, descending), 'details': model.run_details_}
    References
    ----------
    https://gplearn.readthedocs.io/en/stable/reference.html#gplearn.genetic.SymbolicTransformer


    **** NOTE ****
    simplified internal calculation for correlation (assuming w=1)

    y_pred_demean = y_pred - np.average(y_pred)
    y_demean = y - np.average(y)
                              np.sum(y_pred_demean * y_demean)
    pearson =  ---------------------------------------------------------------
                np.sqrt((np.sum(y_pred_demean ** 2) * np.sum(y_demean ** 2)))
    """

    # Function to produce Symbolic Features
    model = SymbolicTransformer(
        function_set=p_params['functions'],
        population_size=p_params['population'],
        tournament_size=p_params['tournament'],
        hall_of_fame=p_params['hof'],
        generations=p_params['generations'],
        n_components=p_params['n_features'],
        init_depth=p_params['init_depth'],
        init_method=p_params['init_method'],
        parsimony_coefficient=p_params['parsimony'],
        const_range=p_params['constants'],
        metric=p_params['metric'],
        stopping_criteria=p_params['metric_goal'],
        p_crossover=p_params['prob_cross'],
        p_subtree_mutation=p_params['prob_mutation_subtree'],
        p_hoist_mutation=p_params['prob_mutation_hoist'],
        p_point_mutation=p_params['prob_mutation_point'],
        # the documented parameter dict has no 'max_samples' entry, so fall
        # back to the transformer's own default instead of raising KeyError
        max_samples=p_params.get('max_samples', 1.0),
        verbose=p_params['verbose'],
        warm_start=p_params['warm_start'],
        random_state=123,
        n_jobs=-1 if p_params['parallelization'] else 1,
        # arrays and lists have no .columns; pass None in that case
        feature_names=getattr(p_x, 'columns', None))

    # SymbolicTransformer fit
    model_fit = model.fit_transform(p_x, p_y)

    # output data of the model
    data = pd.DataFrame(model_fit)

    # parameters of the model
    model_params = model.get_params()

    # best programs, keyed by position; enumerate avoids the repeated
    # list.index() scan over the whole program list for every program
    best_programs = {
        'sym' + str(position): {
            'raw_fitness': program.raw_fitness_,
            'reg_fitness': program.fitness_,
            'expression': str(program),
            'depth': program.depth_,
            'length': program.length_
        }
        for position, program in enumerate(model._best_programs)
    }

    # format and sorting
    best_programs = pd.DataFrame(best_programs).T
    best_programs = best_programs.sort_values(by='raw_fitness',
                                              ascending=False)

    # results
    results = {
        'fit': model_fit,
        'params': model_params,
        'model': model,
        'data': data,
        'best_programs': best_programs,
        'details': model.run_details_
    }

    return results
def symbolic_features(p_x, p_y):
    """
    Create non-linear regressors with a genetic-programming transformer.

    Parameters
    ----------
    p_x: pd.DataFrame
        with regressors or predictor variables
        p_x = data_features.iloc[0:30, 3:]

    p_y: pd.DataFrame
        with variable to predict
        p_y = data_features.iloc[0:30, 1]

    Returns
    -------
    results: dict
        {'fit': np.hstack of p_x and the generated symbolic features,
         'params': output of model.get_params(),
         'model': the fitted SymbolicTransformer,
         'features': the generated symbolic features alone}

    best_p_dict: pd.DataFrame
        one row per best program ('alpha_1', 'alpha_2', ...) with its
        fitness, expression, depth and length, sorted by fitness (ascending)

    """
    model = SymbolicTransformer(
        function_set=["sub", "add", 'inv', 'mul', 'div', 'abs', 'log'],
        population_size=5000,
        hall_of_fame=20,
        n_components=10,
        tournament_size=20,
        generations=5,
        init_depth=(4, 8),
        init_method='half and half',
        parsimony_coefficient=0.1,
        const_range=None,
        metric='pearson',
        stopping_criteria=0.65,
        p_crossover=0.4,
        p_subtree_mutation=0.3,
        p_hoist_mutation=0.1,
        p_point_mutation=0.2,
        verbose=True,
        warm_start=True,
        n_jobs=-1,
        feature_names=p_x.columns)

    # fit_transform already yields the features for p_x, so keep its output
    # instead of discarding it and calling transform(p_x) a second time
    gp_features = model.fit_transform(p_x, p_y)
    model_params = model.get_params()

    # original regressors followed by the generated symbolic features
    model_fit = np.hstack((p_x, gp_features))
    results = {
        'fit': model_fit,
        'params': model_params,
        'model': model,
        "features": gp_features
    }

    # best programs, numbered from 1; enumerate avoids the repeated
    # list.index() scan over the whole program list for every program
    best_p_dict = {
        'alpha_' + str(number): {
            'fitness': program.fitness_,
            "expression": str(program),
            'depth': program.depth_,
            "length": program.length_
        }
        for number, program in enumerate(model._best_programs, start=1)
    }

    best_p_dict = pd.DataFrame(best_p_dict).T
    best_p_dict = best_p_dict.sort_values(by="fitness")

    return results, best_p_dict