示例#1
0
def _random_sep_change(data, init_sep="-", after_sep=" ", percentage=0.5, seed=7):
    """
    Swap the separator on a randomly selected share of the SSN entries.

    ``data`` is a list; the selected entries are overwritten in place and the
    same list object is returned.

    Args:
        data (list): SSN entries to modify.
        init_sep (str): separator handed to ``sep_change`` as the one to replace.
        after_sep (str): separator handed to ``sep_change`` as the replacement.
        percentage (float): share of entries to rewrite; the number of
            rewritten entries is ``int(len(data) * percentage)``.
        seed: value passed to ``setseed`` before the indexes are drawn.

    Returns:
        list: ``data`` itself, after modification.
    """
    setseed(seed)

    total = len(data)
    n_to_change = int(total * percentage)
    # Draw the positions whose separator will be rewritten.
    chosen = choose(range(total), n_to_change)

    for position in chosen:
        entry = data[position]
        data[position] = sep_change(entry, init_sep, after_sep)

    return data
示例#2
0
def tran_sp(
    df,
    n=None,
    var=None,
    n_maxiter=500,
    tol=1e-3,
    seed=None,
    verbose=True,
    standardize=True,
):
    r"""Compact a dataset with support points

    Arguments:
        df (DataFrame): dataset to compact
        n (int): number of samples for compacted dataset
        var (list of str): list of variables to compact, must all be numeric;
            if None, all numeric columns of df are selected
        n_maxiter (int): maximum number of iterations for support point algorithm
        tol (float): convergence tolerance
        seed (int or None): random seed; if None the random state is left as-is
        verbose (bool): print messages to the console?
        standardize (bool): standardize columns before running sp? (Restores after sp)

    Returns:
        DataFrame: dataset compacted with support points

    References:
        Mak and Joseph, "Support Points" (2018) *The Annals of Statistics*

    Examples:
        >>> import grama as gr
        >>> # Compact an existing dataset
        >>> from grama.data import df_diamonds
        >>> df_sp = gr.tran_sp(df_diamonds, n=50, var=["price", "carat"])
        >>>
        >>> # Use support points to reduce model runtime
        >>> from grama.models import make_cantilever_beam
        >>> md_beam = make_cantilever_beam()
        >>> (
        >>>     md_beam
        >>>     ## Generate input sample but don't evaluate outputs
        >>>     >> gr.ev_sample(n=1e4, df_det="nom", skip=True)
        >>>     ## Reduce to a smaller---but representative---sample
        >>>     >> gr.tf_sp(n=50)
        >>>     ## Evaluate the outputs
        >>>     >> gr.tf_md(md_beam)
        >>> )

    """
    ## Setup
    # Only reseed when a seed is explicitly given, so that a call with
    # seed=None does not disturb the caller's random state
    if seed is not None:
        setseed(seed)
    # Handle input variables
    if var is None:
        # Select numeric columns only
        var = list(df.select_dtypes(include=[number]).columns)
        if verbose:
            print("tran_sp has selected var = {}".format(var))
    # Extract values
    Y = df[var].values
    if standardize:
        Y_mean = Y.mean(axis=0)
        Y_sd = Y.std(axis=0)
        # A constant column has zero spread; dividing by it would fill the
        # column with NaN. Use a unit scale there instead, which leaves the
        # centered column at zero and restores cleanly below.
        Y_sd[Y_sd == 0] = 1
        Y = (Y - Y_mean) / Y_sd
    # Generate initial proposal points
    X0 = _perturbed_choice(Y, n)

    ## Run sp.ccp algorithm
    X, d, iter_c = _sp_cpp(X0, Y, delta=tol, iter_max=n_maxiter)
    if verbose:
        print(
            "tran_sp finished in {0:} iterations with distance criterion {1:4.3e}"
            .format(iter_c, d))
        if d > tol:
            warn(
                "Convergence tolerance not met; d = {0:4.3e} > tol = {1:4.3e}".
                format(d, tol),
                RuntimeWarning,
            )

    # Undo the standardization so results are in the original units
    if standardize:
        X = X * Y_sd + Y_mean

    ## Package results
    return DataFrame(data=X, columns=var)
示例#3
0
def eval_min(
    model,
    out_min=None,
    out_geq=None,
    out_leq=None,
    out_eq=None,
    method="SLSQP",
    tol=1e-6,
    n_restart=1,
    n_maxiter=50,
    seed=None,
    df_start=None,
):
    r"""Constrained minimization using functions from a model

    Perform constrained minimization using functions from a model. Model must
    have deterministic variables only.

    Wrapper for scipy.optimize.minimize

    Args:
        model (gr.Model): Model to analyze. All model variables must be
            deterministic.
        out_min (str): Output to use as minimization objective.
        out_geq (None OR list of str): Outputs to use as geq constraints; out >= 0
        out_leq (None OR list of str): Outputs to use as leq constraints; out <= 0
        out_eq (None OR list of str): Outputs to use as equality constraints; out == 0

        method (str): Optimization method; see the documentation for
            scipy.optimize.minimize for options.
        tol (float): Optimization objective convergence tolerance
        n_restart (int): Number of restarts; beyond n_restart=1 random
            restarts are used.
        n_maxiter (int): Maximum iterations; passed to the optimizer as the
            "maxiter" option.
        seed (int OR None): Random seed for random restarts; only applied
            when n_restart > 1 and df_start is None.
        df_start (None or DataFrame): Specific starting values to use; overrides
            n_restart if non None provided.

    Returns:
        DataFrame: Results of optimization; the results for each start point
            are stacked, with added columns "success", "message", and
            "n_iter" taken from the optimizer result.

    Raises:
        ValueError: if the model has random variables, or if out_min or any
            requested constraint output is not in model.out.

    Examples:
        >>> import grama as gr
        >>> md = (
        >>>     gr.Model("Constrained Rosenbrock")
        >>>     >> gr.cp_function(
        >>>         fun=lambda x: (1 - x[0])**2 + 100*(x[1] - x[0]**2)**2,
        >>>         var=["x", "y"],
        >>>         out=["c"],
        >>>     )
        >>>     >> gr.cp_function(
        >>>         fun=lambda x: (x[0] - 1)**3 - x[1] + 1,
        >>>         var=["x", "y"],
        >>>         out=["g1"],
        >>>     )
        >>>     >> gr.cp_function(
        >>>         fun=lambda x: x[0] + x[1] - 2,
        >>>         var=["x", "y"],
        >>>         out=["g2"],
        >>>     )
        >>>     >> gr.cp_bounds(
        >>>         x=(-1.5, +1.5),
        >>>         y=(-0.5, +2.5),
        >>>     )
        >>> )
        >>> md >> gr.ev_min(
        >>>     out_min="c",
        >>>     out_leq=["g1", "g2"]
        >>> )

    """
    ## Check that model has only deterministic variables
    if model.n_var_rand > 0:
        raise ValueError("model must have no random variables")
    ## Check that objective is in model
    if out_min not in model.out:
        raise ValueError("model must contain out_min")
    ## Check that constraints are in model
    for label, outs in (
        ("out_geq", out_geq),
        ("out_leq", out_leq),
        ("out_eq", out_eq),
    ):
        if outs is None:
            continue
        out_diff = set(outs).difference(set(model.out))
        if len(out_diff) > 0:
            raise ValueError(
                "model must contain each {}; missing {}".format(label, out_diff))

    ## Formulate initial guess
    df_nom = eval_nominal(model, df_det="nom", skip=True)
    if df_start is None:
        # First start point is always the nominal point
        df_start = df_nom[model.var]

        if n_restart > 1:
            if seed is not None:
                setseed(seed)
            ## Collect sweep-able deterministic variables
            var_sweep = list(
                filter(
                    lambda v: isfinite(model.domain.get_width(v))
                    & (model.domain.get_width(v) > 0),
                    model.var_det,
                ))
            ## Generate pseudo-marginals
            dicts_var = {}
            for v in var_sweep:
                dicts_var[v] = {
                    "dist": "uniform",
                    "loc": model.domain.get_bound(v)[0],
                    "scale": model.domain.get_width(v),
                }
            ## Overwrite model
            md_sweep = comp_marginals(model, **dicts_var)
            md_sweep = comp_copula_independence(md_sweep)
            ## Generate random start points
            df_rand = eval_sample(
                md_sweep,
                n=n_restart - 1,
                df_det="nom",
                skip=True,
            )
            df_start = concat((df_start, df_rand[model.var]),
                              axis=0).reset_index(drop=True)
    else:
        # User-provided start points dictate the number of runs
        n_restart = df_start.shape[0]

    ## Factory for wrapping model's output
    def make_fun(out, sign=+1):
        # sign=-1 flips an `out <= 0` constraint into the `fun(x) >= 0`
        # form used for "ineq" constraints
        def fun(x):
            df = DataFrame([x], columns=model.var)
            df_res = eval_df(model, df)
            return sign * df_res[out]

        return fun

    ## Create helper functions for constraints
    constraints = []

    if out_geq is not None:
        for out in out_geq:
            constraints.append({
                "type": "ineq",
                "fun": make_fun(out),
            })

    if out_leq is not None:
        for out in out_leq:
            constraints.append({
                "type": "ineq",
                "fun": make_fun(out, sign=-1),
            })

    if out_eq is not None:
        for out in out_eq:
            constraints.append({
                "type": "eq",
                "fun": make_fun(out),
            })

    ## Parse the bounds for minimize
    bounds = list(map(lambda k: model.domain.bounds[k], model.var))

    ## Run optimization
    # Collect per-start results and concatenate once at the end, rather than
    # repeatedly concatenating onto a growing (initially empty) DataFrame
    results = []
    for i in range(n_restart):
        x0 = df_start[model.var].iloc[i].values
        res = minimize(
            make_fun(out_min),
            x0,
            args=(),
            method=method,
            jac=False,
            tol=tol,
            options={
                "maxiter": n_maxiter,
                "disp": False
            },
            constraints=constraints,
            bounds=bounds,
        )

        # Optimum under the variable names; start point under `<var>_0`
        df_opt = df_make(
            **dict(zip(model.var, res.x)),
            **dict(zip(map(lambda s: s + "_0", model.var), x0)),
        )
        df_tmp = eval_df(model, df=df_opt)
        df_tmp["success"] = [res.success]
        df_tmp["message"] = [res.message]
        df_tmp["n_iter"] = [res.nit]

        results.append(df_tmp)

    if not results:
        # No start points; mirror the empty result of the accumulate-by-concat form
        return DataFrame()
    return concat(results, axis=0).reset_index(drop=True)
示例#4
0
def tran_sp(
    df,
    n=None,
    var=None,
    n_maxiter=500,
    tol=1e-3,
    seed=None,
    verbose=True,
    standardize=True,
):
    r"""Compact a dataset with support points

    Arguments:
        df (DataFrame): dataset to compact
        n (int): number of samples for compacted dataset
        var (list of str): list of variables to compact, must all be numeric;
            if None, all numeric columns of df are selected
        n_maxiter (int): maximum number of iterations for support point algorithm
        tol (float): convergence tolerance
        seed (int or None): random seed; if None the random state is left as-is
        verbose (bool): print messages to the console?
        standardize (bool): standardize columns before running sp? (Restores after sp)

    Returns:
        DataFrame: dataset compacted with support points

    Examples:
        >>> import grama as gr
        >>> from grama.data import df_diamonds
        >>> df_sp = gr.tran_sp(df_diamonds, n=50, var=["price", "carat"])
    """
    ## Setup
    # Only reseed when a seed is explicitly given, so that a call with
    # seed=None does not disturb the caller's random state
    if seed is not None:
        setseed(seed)
    # Handle input variables
    if var is None:
        # Select numeric columns only
        var = list(df.select_dtypes(include=[number]).columns)
        if verbose:
            print("tran_sp has selected var = {}".format(var))
    # Extract values
    Y = df[var].values
    if standardize:
        Y_mean = Y.mean(axis=0)
        Y_sd = Y.std(axis=0)
        # A constant column has zero spread; dividing by it would fill the
        # column with NaN. Use a unit scale there instead, which leaves the
        # centered column at zero and restores cleanly below.
        Y_sd[Y_sd == 0] = 1
        Y = (Y - Y_mean) / Y_sd
    # Generate initial proposal points
    X0 = _perturbed_choice(Y, n)

    ## Run sp.ccp algorithm
    X, d, iter_c = _sp_cpp(X0, Y, delta=tol, iter_max=n_maxiter)
    if verbose:
        print(
            "tran_sp finished in {0:} iterations with distance criterion {1:4.3e}"
            .format(iter_c, d))
        if d > tol:
            warn(
                "Convergence tolerance not met; d = {0:4.3e} > tol = {1:4.3e}".
                format(d, tol),
                RuntimeWarning,
            )

    # Undo the standardization so results are in the original units
    if standardize:
        X = X * Y_sd + Y_mean

    ## Package results
    return DataFrame(data=X, columns=var)
示例#5
0
def eval_nls(
    model,
    df_data=None,
    out=None,
    var_fix=None,
    df_init=None,
    append=False,
    tol=1e-6,
    ftol=1e-9,
    gtol=1e-5,
    n_maxiter=100,
    n_restart=1,
    n_process=1,
    method="L-BFGS-B",
    seed=None,
    verbose=True,
):
    r"""Estimate with Nonlinear Least Squares (NLS)

    Estimate best-fit variable levels with nonlinear least squares (NLS).

    Args:
        model (gr.Model): Model to analyze. All model variables
            selected for fitting must be bounded or random. Deterministic
            variables may have semi-infinite bounds.
        df_data (DataFrame): Data for estimating parameters. Variables not
            found in df_data optimized in fitting. Required.
        out (list or None): Output contributions to consider in computing MSE.
            Assumed to be model.out if left as None.
        var_fix (list or None): Variables to fix to nominal levels. Note that
            variables with domain width zero will automatically be fixed.
        df_init (DataFrame): Initial guesses for parameters; overrides n_restart
        append (bool): Append metadata? (Initial guess, MSE, optimizer status)
        tol (float): Optimizer convergence tolerance
        ftol (float): Passed to the optimizer as the "ftol" option
        gtol (float): Passed to the optimizer as the "gtol" option
        n_maxiter (int): Optimizer maximum iterations
        n_restart (int): Number of restarts; beyond n_restart=1 random
            restarts are used.
        n_process (int): Currently unused by this function.
        method (str): Optimization method; passed to scipy.optimize.minimize
        seed (int OR None): Random seed for restarts
        verbose (bool): Print messages to console?

    Returns:
        DataFrame: Results of estimation

    Raises:
        ValueError: if df_data is None, if out is not a subset of
            df_data.columns, if no variables remain to be fitted, if a
            deterministic variable to be fitted has a non-finite nominal
            value, or if df_init lacks a column for a fitted variable.

    Examples:
        >>> import grama as gr
        >>> from grama.data import df_trajectory_full
        >>> from grama.models import make_trajectory_linear
        >>>
        >>> md_trajectory = make_trajectory_linear()
        >>>
        >>> df_fit = (
        >>>     md_trajectory
        >>>     >> gr.ev_nls(df_data=df_trajectory_full)
        >>> )
        >>>
        >>> print(df_fit)

    """
    ## Check `df_data` invariant
    # The default of None cannot be fitted against; fail with a clear message
    # rather than an AttributeError further down
    if df_data is None:
        raise ValueError("df_data must be provided; received None")

    ## Check `out` invariants
    if out is None:
        out = model.out
        if verbose:
            print("... eval_nls setting out = {}".format(out))
    set_diff = set(out).difference(set(df_data.columns))
    if len(set_diff) > 0:
        raise ValueError("out must be subset of df_data.columns\n" +
                         "difference = {}".format(set_diff))

    ## Determine variables to be fixed
    if var_fix is None:
        var_fix = set()
    else:
        var_fix = set(var_fix)
    for var in model.var_det:
        wid = model.domain.get_width(var)
        if wid == 0:
            var_fix.add(var)
    if verbose:
        print("... eval_nls setting var_fix = {}".format(list(var_fix)))
    var_fix = list(var_fix)

    ## Determine variables for evaluation
    var_feat = set(model.var).intersection(set(df_data.columns))
    if verbose:
        print("... eval_nls setting var_feat = {}".format(var_feat))
    var_feat = list(var_feat)

    ## Determine variables for fitting
    var_fit = set(model.var).difference(set(var_fix).union(set(var_feat)))
    if len(var_fit) == 0:
        raise ValueError("No var selected for fitting!\n" +
                         "Try checking model bounds and df_data.columns.")
    var_fit = list(var_fit)

    ## Separate var_fit into det and rand
    var_fit_det = list(set(model.var_det).intersection(var_fit))
    var_fit_rand = list(set(model.var_rand).intersection(var_fit))

    ## Construct bounds, fix var_fit order
    # `bounds` is built in the same det-then-rand order as var_fit, so
    # entries line up with the optimizer's variable vector
    var_fit = var_fit_det + var_fit_rand
    bounds = []
    var_prob = []
    for var in var_fit_det:
        if not isfinite(model.domain.get_nominal(var)):
            var_prob.append(var)
        bounds.append(model.domain.get_bound(var))
    if len(var_prob) > 0:
        raise ValueError(
            "all variables to be fitted must finite nominal value\n" +
            "offending var = {}".format(var_prob))

    for var in var_fit_rand:
        # Random variables are bounded by the 0 and 1 quantiles of their marginal
        bounds.append((
            model.density.marginals[var].q(0),
            model.density.marginals[var].q(1),
        ))

    ## Determine initial guess points
    df_nom = eval_nominal(model, df_det="nom", skip=True)

    ## Use specified initial guess(es)
    if df_init is not None:
        # Check invariants
        set_diff = list(set(var_fit).difference(set(df_init.columns)))
        if len(set_diff) > 0:
            raise ValueError("var_fit must be subset of df_init.columns\n" +
                             "difference = {}".format(set_diff))
        # Pull n_restart
        n_restart = df_init.shape[0]

    ## Generate initial guess(es)
    else:
        # First guess is always the nominal point
        df_init = df_nom[var_fit]
        if n_restart > 1:
            if seed is not None:
                setseed(seed)
            ## Collect sweep-able deterministic variables
            var_sweep = list(
                filter(
                    lambda v: isfinite(model.domain.get_width(v))
                    & (model.domain.get_width(v) > 0),
                    model.var_det,
                ))
            ## Generate pseudo-marginals
            dicts_var = {}
            for v in var_sweep:
                dicts_var[v] = {
                    "dist": "uniform",
                    "loc": model.domain.get_bound(v)[0],
                    "scale": model.domain.get_width(v),
                }
            ## Overwrite model
            md_sweep = comp_marginals(model, **dicts_var)
            md_sweep = comp_copula_independence(md_sweep)
            ## Generate random start points
            df_rand = eval_sample(
                md_sweep,
                n=n_restart - 1,
                df_det="nom",
                skip=True,
            )
            df_init = concat((df_init, df_rand[var_fit]),
                             axis=0).reset_index(drop=True)

    ## Iterate over initial guesses
    def fun_mp(i):
        """Run one optimization from the i-th row of df_init"""
        x0 = df_init[var_fit].iloc[i].values

        # These do not depend on the optimization variables; look them up
        # once per restart instead of on every objective evaluation
        df_fixed = df_nom[var_fix].iloc[[0]]
        y_data = df_data[out].values

        ## Build evaluator
        def objective(x):
            """x = [var_fit]"""
            ## Evaluate model
            df_var = tran_outer(
                df_data[var_feat],
                concat(
                    (df_fixed, df_make(**dict(zip(var_fit, x)))),
                    axis=1,
                ),
            )
            df_tmp = eval_df(model, df=df_var)

            ## Compute joint MSE
            return ((df_tmp[out].values - y_data)**2).mean()

        ## Run optimization
        res = minimize(
            objective,
            x0,
            args=(),
            method=method,
            jac=False,
            tol=tol,
            options={
                "maxiter": n_maxiter,
                "disp": False,
                "ftol": ftol,
                "gtol": gtol,
            },
            bounds=bounds,
        )

        # Fitted values under the variable names; initial guess under `<var>_0`
        df_tmp = df_make(
            **dict(zip(var_fit, res.x)),
            **dict(zip(map(lambda s: s + "_0", var_fit), x0)),
        )
        df_tmp["success"] = [res.success]
        df_tmp["message"] = [res.message]
        df_tmp["n_iter"] = [res.nit]
        df_tmp["mse"] = [res.fun]
        return df_tmp

    # Collect per-restart results and concatenate once, rather than
    # repeatedly concatenating onto a growing (initially empty) DataFrame
    results = [fun_mp(i) for i in range(n_restart)]
    if results:
        df_res = concat(results, axis=0).reset_index(drop=True)
    else:
        df_res = DataFrame()

    ## Post-process
    if append:
        return df_res
    return df_res[var_fit]