Example #1
def fun_sir(df, rtol=1e-4):
    r"""Fully-vectorized SIR solver

    SIR IVP solver, vectorized over parameter values **and** time. The routine identifies groups of rows sharing the same parameter values, runs a time-vectorized IVP solver over each group, and gathers all results into a single output DataFrame. Intended for use in a grama model.

    Args:
        df (pd.DataFrame): All input values; must contain columns for all SIR parameters
        rtol (float): Relative tolerance for the underlying IVP solver

    Preconditions:
        ["t", "S0", "I0", "R0", "beta", "gamma"] in df.columns

    Postconditions:
        Row-ordering of input data is reflected in the row-ordering of the output.

    Returns:
        pd.DataFrame: Solution results
    """
    ## Find all groups of non-t parameters
    df_grouped = (df >> gr.tf_mutate(_idx=DF.index,
                                     _code=gr.str_c(
                                         "S0",
                                         DF.S0,
                                         "I0",
                                         DF.I0,
                                         "R0",
                                         DF.R0,
                                         "beta",
                                         DF.beta,
                                         "gamma",
                                         DF.gamma,
                                     )))

    ## Run time-vectorized SIR solver over each group
    df_results = gr.df_grid()
    codes = set(df_grouped._code)
    for code in codes:
        df_param = (df_grouped >> gr.tf_filter(DF._code == code))

        # Use positional indexing; the filtered frame's index may not start at 0
        df_results = (df_results >> gr.tf_bind_rows(
            sir_vtime(
                df_param.t,
                df_param.S0.values[0],
                df_param.I0.values[0],
                df_param.R0.values[0],
                df_param.beta.values[0],
                df_param.gamma.values[0],
                rtol=rtol,
            ) >> gr.tf_mutate(_idx=df_param._idx.values)))

    ## Sort to match original ordering
    # NOTE: Without this, the output rows will be scrambled, relative
    # to the input rows, leading to very confusing output!
    return (df_results >> gr.tf_arrange(DF._idx) >> gr.tf_drop("_idx"))
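
A minimal usage sketch with hypothetical inputs: fun_sir only requires that the input frame carry the columns ["t", "S0", "I0", "R0", "beta", "gamma"], and assumes grama is imported as gr with the sir_vtime helper in scope, just as the function body does.

import pandas as pd

# Two parameter groups, three time points each (hypothetical values)
df_inputs = pd.DataFrame(dict(
    t=[0.0, 1.0, 2.0] * 2,
    S0=[990] * 3 + [950] * 3,    # initial susceptible counts
    I0=[10] * 3 + [50] * 3,      # initial infected counts
    R0=[0] * 6,                  # initial recovered counts
    beta=[0.3] * 3 + [0.4] * 3,  # transmission rates
    gamma=[0.1] * 6,             # recovery rates
))

# Output rows align with the rows of df_inputs
df_sol = fun_sir(df_inputs)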
Example #2
def plot_xbs(df, group, var, n_side=9, n_delta=6):
    r"""Construct Xbar and S chart

    Construct an Xbar and S chart to assess the state of statistical control of
    a dataset.

    Args:
        df (DataFrame): Data to analyze
        group (str): Variable for grouping
        var (str): Variable to study

    Keyword args:
        n_side (int): Number of consecutive points above/below the centerline to flag
        n_delta (int): Number of consecutive increasing/decreasing points to flag

    Returns:
        plotnine object: Xbar and S chart

    Examples::

        import grama as gr
        DF = gr.Intention()

        from grama.data import df_shewhart
        (
            df_shewhart
            >> gr.tf_mutate(idx=DF.index // 10)
            >> gr.pt_xbs("idx", "tensile_strength")
        )

    """
    ## Prepare the data
    DF = Intention()
    df_batched = (df >> tf_group_by(group) >> tf_summarize(
        X=mean(DF[var]),
        S=sd(DF[var]),
        n=nfcn(DF.index),
    ) >> tf_ungroup())

    df_stats = (df_batched >> tf_summarize(
        X_center=mean(DF.X),
        S_biased=mean(DF.S),
        n=mean(DF.n),
    ))
    n = df_stats.n[0]
    df_stats["S_center"] = df_stats.S_biased / c_sd(n)
    df_stats["X_LCL"] = df_stats.X_center - 3 * df_stats.S_center / sqrt(n)
    df_stats["X_UCL"] = df_stats.X_center + 3 * df_stats.S_center / sqrt(n)
    df_stats["S_LCL"] = B3(n) * df_stats.S_center
    df_stats["S_UCL"] = B4(n) * df_stats.S_center

    ## Reshape for plotting
    df_stats_long = (df_stats >> tf_pivot_longer(
        columns=["X_LCL", "X_center", "X_UCL", "S_LCL", "S_center", "S_UCL"],
        names_to=["_var", "_stat"],
        names_sep="_",
        values_to="_value",
    ))
    # Fake group value to avoid issue with discrete group variable
    df_stats_long[group] = [df_batched[group].values[0]] * df_stats_long.shape[0]

    df_batched_long = (
        df_batched >> tf_pivot_longer(
            columns=["X", "S"],
            names_to="_var",
            values_to="_value",
        )
        ## Flag patterns
        >> tf_left_join(
            df_stats >> tf_pivot_longer(
                columns=[
                    "X_LCL", "X_center", "X_UCL", "S_LCL", "S_center", "S_UCL"
                ],
                names_to=["_var", ".value"],
                names_sep="_",
            ),
            by="_var",
        ) >> tf_group_by("_var") >> tf_mutate(
            outlier_below=(DF._value < DF.LCL),  # Outside control limits
            outlier_above=(DF.UCL < DF._value),
            below=consec(DF._value < DF.center, i=n_side),  # Below mean
            above=consec(DF.center < DF._value, i=n_side),  # Above mean
        ) >> tf_mutate(
            # Flag increasing/decreasing runs via forward and backward differences
            decreasing=consec((lead(DF._value) - DF._value) < 0, i=n_delta - 1)
            | consec((DF._value - lag(DF._value)) < 0, i=n_delta - 1),
            increasing=consec(0 < (lead(DF._value) - DF._value), i=n_delta - 1)
            | consec(0 < (DF._value - lag(DF._value)), i=n_delta - 1),
        ) >> tf_mutate(
            sign=case_when(
                [DF.outlier_below, "-2"],
                [DF.outlier_above, "+2"],
                [DF.below | DF.decreasing, "-1"],
                [DF.above | DF.increasing, "+1"],
                [True, "0"],
            ),
            glyph=case_when(
                [DF.outlier_below, "Below Limit"],
                [DF.outlier_above, "Above Limit"],
                [DF.below, "Low Run"],
                [DF.above, "High Run"],
                [DF.increasing, "Increasing Run"],
                [DF.decreasing, "Decreasing Run"],
                [True, "None"],
            )) >> tf_ungroup())

    ## Visualize
    return (
        df_batched_long
        >> ggplot(aes(x=group))
        + geom_hline(
            data=df_stats_long,
            mapping=aes(yintercept="_value", linetype="_stat"),
        )
        + geom_line(aes(y="_value", group="_var"), size=0.2)
        + geom_point(
            aes(y="_value", color="sign", shape="glyph"),
            size=3,
        )
        + scale_color_manual(values={
            "-2": "blue",
            "-1": "darkturquoise",
            "0": "black",
            "+1": "salmon",
            "+2": "red",
        })
        + scale_shape_manual(
            name="Patterns",
            values={
                "Below Limit": "s",
                "Above Limit": "s",
                "Low Run": "X",
                "High Run": "X",
                "Increasing Run": "^",
                "Decreasing Run": "v",
                "None": ".",
            },
        )
        + scale_linetype_manual(
            name="Guideline",
            values=dict(LCL="dashed", UCL="dashed", center="solid"),
        )
        + guides(color=None)
        + facet_grid(
            "_var~.",
            scales="free_y",
            labeller=labeller(dict(X="Mean", S="Variability")),
        )
        + labs(
            x="Group variable ({})".format(group),
            y="Value ({})".format(var),
        )
    )
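
The helpers c_sd, B3, and B4 above are the standard S-chart constants; grama ships its own implementations, so the following is only a reference sketch of the usual formulas (the c4 bias correction and the B3/B4 control-limit factors).

from math import sqrt, exp
from scipy.special import gammaln

def c_sd(n):
    # Bias-correction factor c4: E[S] = c4 * sigma for samples of size n
    return sqrt(2 / (n - 1)) * exp(gammaln(n / 2) - gammaln((n - 1) / 2))

def B3(n):
    # Lower control-limit factor for the S chart (floored at zero)
    c4 = c_sd(n)
    return max(0, 1 - 3 * sqrt(1 - c4**2) / c4)

def B4(n):
    # Upper control-limit factor for the S chart
    c4 = c_sd(n)
    return 1 + 3 * sqrt(1 - c4**2) / c4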
Example #3
def tran_kfolds(
    df,
    k=None,
    ft=None,
    out=None,
    var_fold=None,
    suffix="_mean",
    summaries=None,
    tf=tf_summarize,
    shuffle=True,
    seed=None,
):
    r"""Perform k-fold CV

    Perform k-fold cross-validation (CV) using a given fitting procedure (ft).
    Optionally provide a fold identifier column, or (randomly) assign folds.

    Args:
        df (DataFrame): Data to pass to given fitting procedure
        ft (gr.ft_): Partially-evaluated grama fit function; defines model fitting
            procedure and outputs to aggregate
        tf (gr.tf_): Partially-evaluated grama transform function; evaluation of
            fitted model will be passed to tf and provided with keyword arguments
            from summaries
        out (list or None): Outputs for which to compute `summaries`; None uses ft.out
        var_fold (str or None): Column to treat as fold identifier; overrides `k`
        suffix (str): Suffix for predicted value; used to distinguish between predicted and actual
        summaries (dict of functions): Summary functions to pass to tf; will be evaluated
            for outputs of ft. Each summary must have signature summary(f_pred, f_meas).
            Grama includes builtin options: gr.mse, gr.rmse, gr.rel_mse, gr.rsq, gr.ndme
        k (int): Number of folds; k=5 to k=10 recommended [1]
        shuffle (bool): Shuffle the data before CV? True recommended [1]

    Notes:
        - Many grama functions support *partial evaluation*; this allows one to specify things like hyperparameters in fitting functions without providing data and executing the fit. You can take advantage of this functionality to easily carry out hyperparameter studies.

    Returns:
        DataFrame: Aggregated results within each of k-folds using given model and
            summary transform

    References:
        [1] James, Witten, Hastie, and Tibshirani, "An introduction to statistical learning" (2017), Chapter 5. Resampling Methods

    Examples:

        >>> import grama as gr
        >>> from grama.data import df_stang
        >>> from grama.fit import ft_rf
        >>> df_kfolds = (
        >>>     df_stang
        >>>     >> gr.tf_kfolds(
        >>>         k=5,
        >>>         ft=ft_rf(out=["thick"], var=["E", "mu"]),
        >>>     )
        >>> )

    """
    ## Check invariants
    if ft is None:
        raise ValueError("Must provide ft keyword argument")
    if (k is None) and (var_fold is None):
        print("... tran_kfolds is using default k=5")
        k = 5
    if summaries is None:
        print("... tran_kfolds is using default summaries mse and rsq")
        summaries = dict(mse=mse, rsq=rsq)

    n = df.shape[0]
    ## Handle custom folds
    if not (var_fold is None):
        ## Check for a valid var_fold
        if not (var_fold in df.columns):
            raise ValueError("var_fold must be in df.columns or None")
        ## Build folds
        levels = unique(df[var_fold])
        k = len(levels)
        print("... tran_kfolds found {} levels via var_folds".format(k))
        Is = []
        for level in levels:
            Is.append(list(arange(n)[df[var_fold] == level]))

    else:
        ## Shuffle data indices
        if shuffle:
            if seed:
                set_seed(seed)
            I = permutation(n)
        else:
            I = arange(n)
        ## Build folds
        di = int(ceil(n / k))
        Is = [I[i * di:min((i + 1) * di, n)] for i in range(k)]

    ## Iterate over folds
    df_res = DataFrame()
    for i in range(k):
        ## Train by out-of-fold data
        md_fit = df >> tf_filter(~var_in(X.index, Is[i])) >> ft

        ## Determine predicted and actual
        if out is None:
            out = str_replace(md_fit.out, suffix, "")
        else:
            out = str_replace(out, suffix, "")

        ## Test by in-fold data
        df_pred = md_fit >> ev_df(df=df >> tf_filter(var_in(X.index, Is[i])),
                                  append=False)

        ## Specialize summaries for output names
        summaries_all = ChainMap(*[{
            key + "_" + o: fun(X[o + suffix], X[o])
            for key, fun in summaries.items()
        } for o in out])

        ## Aggregate
        df_summary_tmp = (
            df_pred
            >> tf_bind_cols(df[out] >> tf_filter(var_in(X.index, Is[i])))
            >> tf(**summaries_all)
        )

        if var_fold is None:
            df_summary_tmp = df_summary_tmp >> tf_mutate(_kfold=i)
        else:
            df_summary_tmp[var_fold] = levels[i]

        df_res = concat((df_res, df_summary_tmp),
                        axis=0).reset_index(drop=True)

    return df_res
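
The fold construction above is just a shuffled, even partition of the row indices; a standalone sketch of that logic with hypothetical n and k:

import numpy as np

n, k = 23, 5
I = np.random.permutation(n)  # shuffled row indices
di = int(np.ceil(n / k))      # fold size; the last fold may be short
Is = [I[i * di:min((i + 1) * di, n)] for i in range(k)]

# Every row lands in exactly one fold
assert sum(len(fold) for fold in Is) == n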
Example #4
def get_buoyant_properties(df_hull_rot, df_mass, w_slope, w_intercept):
    r"""Compute buoyant properties for a rotated hull

    Args:
        df_hull_rot (DataFrame): Rotated hull points
        df_mass (DataFrame): Mass properties
        w_slope (float): Slope of the waterline
        w_intercept (float): y-intercept of the waterline

    Returns:
        DataFrame: Center of buoyancy (x, y), net force, and net moment
    """
    # x and y intervals
    dx = df_mass.dx[0]
    dy = df_mass.dy[0]

    # Define equation for the surface of the water using slope intercept form
    eq_water = lambda x: w_slope * x + w_intercept

    # Find points under the sloped waterline
    df_hull_under = (
        df_hull_rot
        >> gr.tf_mutate(under=df_hull_rot.y <= eq_water(df_hull_rot.x))
        >> gr.tf_filter(DF.under)
    )

    # x and y position of COB
    x_cob = average(df_hull_under.x)
    y_cob = average(df_hull_under.y)

    # Pull x and y of COM as well
    x_com = df_mass.x[0]
    y_com = df_mass.y[0]

    # Total mass of water by finding area under curve
    m_water = RHO_WATER * len(df_hull_under) * dx * dy

    # Net force results from the difference in masses between the boat and the water
    F_net = (m_water - df_mass.mass[0]) * G

    # Distance to determine torque is ORTHOGONAL TO WATERLINE
    # Equation from https://www.geeksforgeeks.org/perpendicular-distance-between-a-point-and-a-line-in-2-d/

    # Calculate righting moment for different cases of w_slope
    # Account for zero slope
    if w_slope == 0:
        M_net = G * m_water * x_cob
    else:
        # Line of action of buoyancy: passes through the COB, perpendicular
        # to the waterline (slope -1/w_slope), written as a*x + b*y + c = 0
        a = 1 / w_slope
        b = 1
        c = -((1 / w_slope) * x_cob + y_cob)

        # Signed perpendicular distance from the COM to that line
        norm_dist = (a * x_com + b * y_com + c) / np.sqrt(a**2 + b**2)

        if w_slope > 0:  # positive water slope creates a positive moment
            M_net = G * m_water * norm_dist
        elif w_slope < 0:  # negative water slope creates a negative moment
            M_net = -G * m_water * norm_dist

    return DataFrame(dict(
        x=[x_cob],
        y=[y_cob],
        F_net=[F_net],
        M_net=[M_net],
    ))
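
A minimal usage sketch with a hypothetical rectangular hull grid; RHO_WATER and G are assumed to be module-level constants (water density and gravitational acceleration), just as in the function body.

import numpy as np
import pandas as pd

dx = dy = 0.01
xs, ys = np.meshgrid(np.arange(-0.5, 0.5, dx), np.arange(0.0, 0.3, dy))
df_hull_rot = pd.DataFrame(dict(x=xs.ravel(), y=ys.ravel()))

df_mass = pd.DataFrame(dict(
    x=[0.0], y=[0.15],  # center of mass (hypothetical)
    mass=[20.0],        # boat mass
    dx=[dx], dy=[dy],   # grid spacing used to integrate the displaced area
))

df_props = get_buoyant_properties(df_hull_rot, df_mass, w_slope=0.1, w_intercept=0.1)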
Example #5
def tlmc_1f1m(md, N0, eps):
    r"""Two-level Monte Carlo estimator

    Evaluate a two-level Monte Carlo estimator, allocating samples across the
    two levels to minimize variance for a fixed cost.

    Args:
        md (gr.Model): Model with exactly one function; the function must take
            two inputs (level, point) and return two outputs (result, cost)
        N0 (int): Initial number of samples per level; must be > 0
        eps (float): Accuracy parameter used in the sample-allocation rule;
            must be > 0

    Returns:
        tuple: (P, Nlev, Vlev, its): estimate, samples taken per level,
            variance per level, and number of iterations
    """
    import numpy as np
    import grama as gr
    X = gr.Intention()

    md1f1m = gr.make_tlmc_model_1f1m()

    # Check that md has exactly 1 function
    if len(md.functions) != 1:
        raise ValueError('Input model must have 1 function.')

    # Check inputs: the function must accept (level, point)
    try:
        r = md.functions[0].func(0, 0)
    except TypeError:
        raise ValueError(
            'Input model must have 2 inputs: level and point at which to evaluate.'
        )

    # Check outputs: each level must return (result, cost)
    if len(r) != 2:
        raise ValueError(
            'Level 0 function must have 2 outputs: result and cost.')
    r = md.functions[0].func(1, 0)
    if len(r) != 2:
        raise ValueError(
            'Level 1 function must have 2 outputs: result and cost.')

    # Make sure N0 and eps are greater than 0
    if (N0 <= 0) or (eps <= 0):
        raise ValueError('N0 and eps must be > 0.')

    its = 0  # initialize iteration counter

    Nlev = np.zeros((1, 2))  # samples taken per level (initialize)
    dNlev = np.array([[N0, N0]])  # samples left to take per level (initialize)
    Vlev = np.zeros((1, 2))  # variance per level (initialize)
    sumlev = np.zeros((2, 2))  # sample results per level (initialize)
    costlev = np.zeros((1, 2))  # total cost per level (initialize)

    while np.sum(dNlev) > 0:  # check if there are samples left to be evaluated
        for lev in range(2):
            # Check if there are samples to be evaluated on level 'lev'
            if dNlev[0, lev] > 0:
                df_mc_lev = md1f1m >> gr.ev_monte_carlo(
                    n=int(dNlev[0, lev]), df_det=gr.df_make(level=lev))
                if lev > 0:
                    # Re-evaluate the same points on the next-lower level
                    df_prev = (
                        df_mc_lev
                        >> gr.tf_select(gr.columns_between("x", "level"))
                        >> gr.tf_mutate(level=X.level - 1)
                    )
                    df_mc_lev_prev = md1f1m >> gr.ev_df(df_prev)
                    Y = df_mc_lev.P - df_mc_lev_prev.P
                    C = sum(df_mc_lev.cost) + sum(df_mc_lev_prev.cost)
                else:
                    Y = df_mc_lev.P
                    C = sum(df_mc_lev.cost)

                cost = C
                sums = [sum(Y), sum(Y**2)]

                Nlev[0, lev] += dNlev[0, lev]  # samples taken on level 'lev'
                sumlev[0, lev] += sums[0]      # running sum of results
                sumlev[1, lev] += sums[1]      # running sum of squared results
                costlev[0, lev] += cost        # total cost on level 'lev'

        mlev = np.abs(sumlev[0, :] / Nlev)  # expected value per level
        Vlev = np.maximum(0, sumlev[1, :] / Nlev - mlev**2)  # variance per level
        Clev = costlev / Nlev  # cost per result per level

        # Lagrange multiplier to minimize variance for a fixed cost
        mu = eps**(-2) * sum(np.sqrt(Vlev * Clev))
        # Optimal number of samples per level
        Ns = np.ceil(mu * np.sqrt(Vlev / Clev))
        # Update samples left to take per level
        dNlev = np.maximum(0, Ns - Nlev)
        its += 1  # update iteration counter

    P = np.sum(sumlev[0, :] / Nlev)  # evaluate two-level estimator
    return P, Nlev, Vlev, its
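
A minimal usage sketch, reusing the same built-in two-level test model that the routine instantiates internally:

import grama as gr

md = gr.make_tlmc_model_1f1m()
P, Nlev, Vlev, its = tlmc_1f1m(md, N0=100, eps=0.05)
print(P, Nlev, its)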