Example #1
def fit(init_file, semipar=False):
    """This function estimates the MTE based on a parametric normal model
    or, alternatively, via the semiparametric method of
    local instrumental variables (LIV).

    Parameters
    ----------
    init_file: yaml
        Initialization file containing parameters for the estimation
        process.
    semipar: bool, default is False
        If True, the MTE is estimated semiparametrically via local
        instrumental variables (LIV); otherwise the parametric
        normal model is used.

    Returns
    -------
    rslt: dict
        Result dictionary containing
        - quantiles
        - mte
        - mte_x
        - mte_u
        - mte_min
        - mte_max
        - X
        - b1
        - b0
    """

    # Load the estimation file
    dict_ = read(init_file, semipar)

    # Perform some consistency checks given the user's request
    check_presence_estimation_dataset(dict_)
    check_est_init_dict(dict_)

    # Semiparametric LIV Model
    if semipar:
        # Distribute initialization information.
        data = read_data(dict_["ESTIMATION"]["file"])
        dict_, data = check_append_constant(init_file,
                                            dict_,
                                            data,
                                            semipar=True)

        rslt = semipar_fit(dict_, data)

    # Parametric Normal Model
    else:
        # Perform some extra checks
        check_par_init_file(dict_)

        # Distribute initialization information.
        data = read_data(dict_["ESTIMATION"]["file"])
        dict_, data = check_append_constant(init_file,
                                            dict_,
                                            data,
                                            semipar=False)

        rslt = par_fit(dict_, data)

    return rslt
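A minimal usage sketch; the initialization file name "tutorial.grmpy.yml" is a placeholder and not part of the example above:

import grmpy

# Parametric normal model (default)
rslt = grmpy.fit("tutorial.grmpy.yml")

# Semiparametric LIV estimation
rslt_semipar = grmpy.fit("tutorial.grmpy.yml", semipar=True)

# The result dictionary exposes the MTE and its components.
print(rslt["quantiles"], rslt["mte"])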
Example #2
def test12():
    """This test checks if our data import process is able to handle .txt, .dta and .pkl
     files.
     """

    pkl = TEST_RESOURCES_DIR + "/data.grmpy.pkl"
    dta = TEST_RESOURCES_DIR + "/data.grmpy.dta"
    txt = TEST_RESOURCES_DIR + "/data.grmpy.txt"

    real_sum = -3211.20122
    real_column_values = [
        "Y",
        "D",
        "X1",
        "X2",
        "X3",
        "X5",
        "X4",
        "Y1",
        "Y0",
        "U1",
        "U0",
        "V",
    ]

    for data in [pkl, dta, txt]:
        df = read_data(data)
        sum_ = np.sum(df.sum())
        columns = list(df)
        np.testing.assert_array_almost_equal(sum_, real_sum, decimal=5)
        np.testing.assert_equal(columns, real_column_values)
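The test above only exercises read_data through its return values. As an illustration (an assumption about how such a loader could look, not the grmpy source), a reader that dispatches on the file extension might be sketched like this:

import pandas as pd

def read_data_sketch(file_name):
    """Illustrative loader for .pkl, .dta and .txt estimation data sets."""
    if file_name.endswith(".pkl"):
        return pd.read_pickle(file_name)
    elif file_name.endswith(".dta"):
        return pd.read_stata(file_name)
    elif file_name.endswith(".txt"):
        # whitespace-separated text file with a header row
        return pd.read_csv(file_name, sep=r"\s+")
    raise NotImplementedError(f"Unsupported file format: {file_name}")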
Example #3
def par_fit(init_file):
    """The function estimates the coefficients of the simulated data set."""
    check_presence_init(init_file)

    dict_ = read(init_file)
    np.random.seed(dict_["SIMULATION"]["seed"])

    # We perform some basic consistency checks regarding the user's request.
    check_presence_estimation_dataset(dict_)
    #check_initialization_dict2(dict_)
    #check_init_file(dict_)

    # Distribute initialization information.
    data = read_data(dict_["ESTIMATION"]["file"])
    num_treated = dict_["AUX"]["num_covars_treated"]
    num_untreated = num_treated + dict_["AUX"]["num_covars_untreated"]

    _, X1, X0, Z1, Z0, Y1, Y0 = process_data(data, dict_)

    if dict_["ESTIMATION"]["maxiter"] == 0:
        option = "init"
    else:
        option = dict_["ESTIMATION"]["start"]

    # Define starting values
    x0 = start_values(dict_, data, option)
    opts, method = optimizer_options(dict_)
    dict_["AUX"]["criteria"] = calculate_criteria(dict_, X1, X0, Z1, Z0, Y1,
                                                  Y0, x0)
    dict_["AUX"]["starting_values"] = backward_transformation(x0)
    rslt_dict = bfgs_dict()
    if opts["maxiter"] == 0:
        rslt = adjust_output(None, dict_, x0, X1, X0, Z1, Z0, Y1, Y0,
                             rslt_dict)
    else:
        opt_rslt = minimize(
            minimizing_interface,
            x0,
            args=(dict_, X1, X0, Z1, Z0, Y1, Y0, num_treated, num_untreated,
                  rslt_dict),
            method=method,
            options=opts,
        )
        rslt = adjust_output(opt_rslt, dict_, opt_rslt["x"], X1, X0, Z1, Z0,
                             Y1, Y0, rslt_dict)
    # Print Output files
    print_logfile(dict_, rslt)

    if "comparison" in dict_["ESTIMATION"].keys():
        if dict_["ESTIMATION"]["comparison"] == 0:
            pass
        else:
            write_comparison(data, rslt)
    else:
        write_comparison(data, rslt)

    return rslt
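The numerical core of par_fit is the minimize call shown above. A self-contained sketch of that pattern using scipy.optimize.minimize, with a toy quadratic criterion standing in for grmpy's minimizing_interface:

import numpy as np
from scipy.optimize import minimize

def toy_criterion(x, target):
    """Stand-in for minimizing_interface: a simple quadratic criterion."""
    return np.sum((x - target) ** 2)

x0 = np.zeros(3)
opt_rslt = minimize(
    toy_criterion,
    x0,
    args=(np.array([1.0, -2.0, 0.5]),),
    method="BFGS",
    options={"maxiter": 100},
)
print(opt_rslt["x"], opt_rslt["success"])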
Example #4
File: estimate.py  Project: fagan2888/grmpy
def fit(init_file, semipar=False):
    """This function estimates the MTE based on a parametric normal model
    or, alternatively, via the semiparametric method of
    local instrumental variables (LIV).
    """

    # Load the estimation file
    check_presence_init(init_file)
    dict_ = read(init_file, semipar)

    # Perform some consistency checks given the user's request
    check_presence_estimation_dataset(dict_)
    check_est_init_dict(dict_)

    # Semiparametric LIV Model
    if semipar is True:
        # Distribute initialization information.
        data = read_data(dict_["ESTIMATION"]["file"])
        dict_, data = check_append_constant(init_file,
                                            dict_,
                                            data,
                                            semipar=True)

        rslt = semipar_fit(dict_, data)

    # Parametric Normal Model
    else:
        # Perform some extra checks
        check_par_init_file(dict_)

        # Distribute initialization information.
        data = read_data(dict_["ESTIMATION"]["file"])
        dict_, data = check_append_constant(init_file,
                                            dict_,
                                            data,
                                            semipar=False)

        rslt = par_fit(dict_, data)

    return rslt
Example #5
def plot_common_support(init_file, nbins, fs=24, output=False):
    """This function plots histograms of the treated and untreated population
    to assess the common support of the propensity score"""
    dict_ = read(init_file)

    # Distribute initialization information.
    data = read_data(dict_["ESTIMATION"]["file"])

    # Process data for the semiparametric estimation.
    indicator = dict_["ESTIMATION"]["indicator"]
    D = data[indicator].values
    Z = data[dict_["CHOICE"]["order"]]
    logit = dict_["ESTIMATION"]["logit"]

    # estimate propensity score
    ps = estimate_treatment_propensity(D, Z, logit, show_output=False)

    data["ps"] = ps

    treated = data[[indicator, "ps"]][data[indicator] == 1].values
    untreated = data[[indicator, "ps"]][data[indicator] == 0].values

    treated = treated[:, 1].tolist()
    untreated = untreated[:, 1].tolist()

    # Make the histogram using a list of lists
    fig = plt.figure(figsize=(17.5, 10))
    hist = plt.hist(
        [treated, untreated],
        bins=nbins,
        weights=[
            np.ones(len(treated)) / len(treated),
            np.ones(len(untreated)) / len(untreated),
        ],
        density=0,
        alpha=0.55,
        label=["Treated", "Untreated"],
    )

    # Plot formatting
    plt.tick_params(axis="both", labelsize=14)
    plt.legend(loc="upper right", prop={"size": 14})
    plt.xticks(np.arange(0, 1.1, step=0.1))
    plt.grid(axis="y", alpha=0.25)
    plt.xlabel("$P$", fontsize=fs)
    plt.ylabel("$f(P)$", fontsize=fs)
    # plt.title('Support of $P(\hat{Z})$ for $D=1$ and $D=0$', fontsize=fs)

    if output is not False:
        plt.savefig(output, dpi=300)

    fig.show()
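A hypothetical call; the init file name, bin count, and output path are placeholders:

plot_common_support("tutorial.grmpy.yml", nbins=25,
                    output="common_support.png")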
Example #6
def estimate(init_file):
    """The function estimates the coefficients of the simulated data set."""
    check_presence_init(init_file)

    dict_ = read(init_file)
    np.random.seed(dict_['SIMULATION']['seed'])

    # We perform some basic consistency checks regarding the user's request.
    check_presence_estimation_dataset(dict_)
    check_initialization_dict(dict_)
    check_init_file(dict_)

    # Distribute initialization information.
    data_file = dict_['ESTIMATION']['file']

    if dict_['ESTIMATION']['maxiter'] == 0:
        option = 'init'
    else:
        option = dict_['ESTIMATION']['start']

    # Read data frame
    data = read_data(data_file)

    # define starting values
    x0 = start_values(dict_, data, option)
    opts, method = optimizer_options(dict_)
    dict_['AUX']['criteria'] = calculate_criteria(dict_, data, x0)
    dict_['AUX']['starting_values'] = backward_transformation(x0)
    rslt_dict = bfgs_dict()
    if opts['maxiter'] == 0:
        rslt = adjust_output(None, dict_, x0, data, rslt_dict)
    else:
        opt_rslt = minimize(minimizing_interface,
                            x0,
                            args=(dict_, data, rslt_dict),
                            method=method,
                            options=opts)
        rslt = adjust_output(opt_rslt, dict_, opt_rslt['x'], data, rslt_dict)
    # Print Output files
    print_logfile(dict_, rslt)

    if 'comparison' in dict_['ESTIMATION'].keys():
        if dict_['ESTIMATION']['comparison'] == 0:
            pass
        else:
            write_comparison(dict_, data, rslt)
    else:
        write_comparison(dict_, data, rslt)

    return rslt
Example #7
def test13():
    """This test checks if our data import process is able to handle .txt, .dta and .pkl files."""

    pkl = TEST_RESOURCES_DIR + '/data.grmpy.pkl'
    dta = TEST_RESOURCES_DIR + '/data.grmpy.dta'
    txt = TEST_RESOURCES_DIR + '/data.grmpy.txt'

    real_sum = -3211.20122
    real_column_values = [
        'Y', 'D', 'X1', 'X2', 'X3', 'X5', 'X4', 'Y1', 'Y0', 'U1', 'U0', 'V'
    ]

    for data in [pkl, dta, txt]:
        df = read_data(data)
        sum_ = np.sum(df.sum())
        columns = list(df)
        np.testing.assert_array_almost_equal(sum_, real_sum, decimal=5)
        np.testing.assert_equal(columns, real_column_values)

    cleanup()
Example #8
File: plot.py  Project: fagan2888/grmpy
def plot_mte(
    rslt,
    init_file,
    college_years=4,
    font_size=22,
    label_size=16,
    color="blue",
    semipar=False,
    nboot=250,
    save_plot=False,
):
    """This function calculates the marginal treatment effect for
    different quantiles u_D of the unobservables.
    Depending on the model specification, either the parametric or
    semiparametric MTE is plotted along with the corresponding
    90 percent confidence bands.
    """
    # Read init dict and data
    dict_ = read(init_file, semipar)
    data = read_data(dict_["ESTIMATION"]["file"])

    dict_, data = check_append_constant(init_file, dict_, data, semipar)

    if semipar is True:
        quantiles, mte, con_u, con_d = mte_and_cof_int_semipar(
            rslt, init_file, college_years, nboot
        )

    else:
        quantiles, mte, con_u, con_d = mte_and_cof_int_par(
            rslt, dict_, data, college_years
        )

    # Add confidence intervals to rslt dictionary
    rslt.update({"con_u": con_u, "con_d": con_d})

    plot_curve(mte, quantiles, con_u, con_d, font_size, label_size, color, save_plot)
Example #9
def bootstrap(init_file, nboot):
    """
    This function generates bootstrapped standard errors
    given an init_file and the number of bootstraps to be drawn.

    Parameters
    ----------
    init_file: yaml
        Initialization file containing parameters for the estimation
        process.
    nboot: int
        Number of bootstrap iterations, i.e. number of times
        the MTE is computed via bootstrap.

    Returns
    -------
    mte_boot: np.ndarray
        Array containing *nboot* estimates of the MTE.
    """
    check_presence_init(init_file)
    dict_ = read(init_file, semipar=True)

    # Process the information specified in the initialization file
    bins, logit, bandwidth, gridsize, startgrid, endgrid = process_primary_inputs(
        dict_)
    trim, rbandwidth, reestimate_p, show_output = process_secondary_inputs(
        dict_)

    # Suppress output
    show_output = False

    # Prepare empty array to store output values
    mte_boot = np.zeros([gridsize, nboot])

    # Load the baseline data
    data = read_data(dict_["ESTIMATION"]["file"])

    counter = 0
    while counter < nboot:
        boot_data = resample(data,
                             replace=True,
                             n_samples=len(data),
                             random_state=None)

        # Estimate propensity score P(z)
        boot_data = estimate_treatment_propensity(dict_, boot_data, logit,
                                                  show_output)
        prop_score = boot_data["prop_score"]
        if isinstance(prop_score, pd.Series):
            # Define common support and trim the data (if trim=True)
            X, Y, prop_score = trim_support(dict_,
                                            boot_data,
                                            logit,
                                            bins,
                                            trim,
                                            reestimate_p,
                                            show_output=False)

            b0, b1_b0 = double_residual_reg(X, Y, prop_score)

            # Construct the MTE
            mte_x = mte_observed(X, b1_b0)
            mte_u = mte_unobserved_semipar(X, Y, b0, b1_b0, prop_score,
                                           bandwidth, gridsize, startgrid,
                                           endgrid)

            # Put the MTE together
            mte = mte_x.mean(axis=0) + mte_u
            mte_boot[:, counter] = mte

            counter += 1

        else:
            continue

    return mte_boot
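Given the (gridsize, nboot) array returned above, pointwise 90 percent confidence bands for the MTE follow from the bootstrap percentiles, e.g. (the init file name is a placeholder):

import numpy as np

mte_boot = bootstrap("tutorial.grmpy.yml", nboot=250)
con_d = np.percentile(mte_boot, 5, axis=1)   # lower band
con_u = np.percentile(mte_boot, 95, axis=1)  # upper band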
Example #10
def semipar_fit(init_file):
    """This functions estimates the MTE via Local Instrumental Variables"""
    check_presence_init(init_file)

    dict_ = read(init_file)
    # np.random.seed(dict_["SIMULATION"]["seed"]) # needed?

    check_presence_estimation_dataset(dict_)
    check_initialization_dict(dict_)

    # Distribute initialization information.
    data = read_data(dict_["ESTIMATION"]["file"])

    # Process data for the semiparametric estimation.
    indicator = dict_["ESTIMATION"]["indicator"]
    D = data[indicator].values
    Z = data[dict_["CHOICE"]["order"]]

    nbins = dict_["ESTIMATION"]["nbins"]
    trim = dict_["ESTIMATION"]["trim_support"]
    reestimate = dict_["ESTIMATION"]["reestimate_p"]
    rbandwidth = dict_["ESTIMATION"]["rbandwidth"]
    bandwidth = dict_["ESTIMATION"]["bandwidth"]
    gridsize = dict_["ESTIMATION"]["gridsize"]
    a = dict_["ESTIMATION"]["ps_range"][0]
    b = dict_["ESTIMATION"]["ps_range"][1]

    logit = dict_["ESTIMATION"]["logit"]
    show_output = dict_["ESTIMATION"]["show_output"]

    # The Local Instrumental Variables (LIV) approach

    # 1. Estimate propensity score P(z)
    ps = estimate_treatment_propensity(D, Z, logit, show_output)

    # 2a. Find common support
    treated, untreated, common_support = define_common_support(
        ps, indicator, data, nbins, show_output
    )

    # 2b. Trim the data
    if trim is True:
        data, ps = trim_data(ps, common_support, data)

    # 2c. Re-estimate baseline propensity score on the trimmed sample
    if reestimate is True:
        D = data[indicator].values
        Z = data[dict_["CHOICE"]["order"]]

        # Re-estimate propensity score P(z)
        ps = estimate_treatment_propensity(D, Z, logit, show_output)

    # 3. Double Residual Regression
    # Attach the propensity score and sort the data by it
    data["ps"] = ps
    data = data.sort_values(by="ps", ascending=True)
    ps = np.sort(ps)

    X = data[dict_["TREATED"]["order"]]
    Xp = construct_Xp(X, ps)
    Y = data[[dict_["ESTIMATION"]["dependent"]]]

    b0, b1_b0 = double_residual_reg(ps, X, Xp, Y, rbandwidth, show_output)

    # Turn the X, Xp, and Y DataFrames into np.ndarrays
    X_arr = np.array(X)
    Xp_arr = np.array(Xp)
    Y_arr = np.array(Y).ravel()

    # 4. Compute the unobserved part of Y
    Y_tilde = Y_arr - np.dot(X_arr, b0) - np.dot(Xp_arr, b1_b0)

    # 5. Estimate mte_u, the unobserved component of the MTE,
    # through a locally quadratic regression
    quantiles, mte_u = locpoly(ps, Y_tilde, 1, 2, bandwidth, gridsize, a, b)

    # 6. construct MTE
    # Calculate the MTE component that depends on X
    # mte_x = np.dot(X, b1_b0).mean(axis=0)

    # Put the MTE together
    # mte = mte_x + mte_u

    return quantiles, mte_u, X, b1_b0
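The observed component that this version leaves commented out can be added back from the returned objects, e.g. (the init file name is a placeholder):

import numpy as np

quantiles, mte_u, X, b1_b0 = semipar_fit("tutorial.grmpy.yml")
mte_x = np.dot(X, b1_b0).mean(axis=0)  # component of the MTE that depends on X
mte = mte_x + mte_u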
Example #11
def plot_mte(
    rslt,
    init_file,
    college_years=4,
    font_size=22,
    label_size=16,
    color="blue",
    semipar=False,
    nboot=250,
    save_plot=False,
):
    """
    This function calculates the marginal treatment effect for
    different quantiles u_D of the unobservables.
    Depending on the model specification, either the parametric or
    semiparametric MTE is plotted along with the corresponding
    90 percent confidence bands.

    Parameters
    ----------
    rslt: dict
        Result dictionary returned by grmpy.fit().
    init_file: yaml
        Initialization file containing parameters for the estimation
        process.
    college_years: int, default is 4
        Average duration of college degree. The MTE plotted will thus
        refer to the returns per one year of college education.
    font_size: int, default is 22
        Font size of the MTE graph.
    label_size: int, default is 16
        Label size of the MTE graph.
    color: str, default is "blue"
        Color of the MTE curve.
    semipar: bool, default is False
        Option to indicate the semiparametric estimation.
        If semipar is False, the parametric normal model is assumed and
        confidence intervals are computed analytically.
        Else (semipar is True), confidence bands are bootstrapped.
    nboot: int, default is 250
        Only relevant for semiparametric estimation (semipar=True).
        Number of bootstrap iterations used to compute
        confidence intervals.
    save_plot: bool or str or PathLike or file-like object, default is False
        If False, the resulting plot is shown but not saved.
        If True, the MTE plot is saved as 'MTE_plot.png'.
        Else, if a str, PathLike, or file-like object is specified,
        the plot is saved according to *save_plot*.
        The output format is inferred from the extension
        ('png', 'pdf', 'svg', etc.). By default, '.png' is assumed.
    """
    # Read init dict and data
    dict_ = read(init_file, semipar)
    data = read_data(dict_["ESTIMATION"]["file"])

    dict_, data = check_append_constant(init_file, dict_, data, semipar)

    if semipar is True:
        quantiles, mte, con_u, con_d = mte_and_cof_int_semipar(
            rslt, init_file, college_years, nboot)

    else:
        quantiles, mte, con_u, con_d = mte_and_cof_int_par(
            rslt, dict_, data, college_years)

    # Add confidence intervals to rslt dictionary
    rslt.update({"con_u": con_u, "con_d": con_d})

    plot_curve(mte, quantiles, con_u, con_d, font_size, label_size, color,
               save_plot)
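A hypothetical usage: fit the model first, then plot the MTE with bootstrapped confidence bands (the init file name is a placeholder):

import grmpy

rslt = grmpy.fit("tutorial.grmpy.yml", semipar=True)
plot_mte(rslt, "tutorial.grmpy.yml", semipar=True, nboot=250,
         save_plot="mte_plot.png")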
Example #12
def bootstrap(init_file, nbootstraps):
    """
    This function generates bootstrapped standard errors
    given an init_file and the number of bootstrap draws.
    """
    check_presence_init(init_file)
    dict_ = read(init_file, semipar=True)

    # Process the information specified in the initialization file
    nbins, logit, bandwidth, gridsize, a, b = process_user_input(dict_)
    trim, rbandwidth, reestimate_p = process_default_input(dict_)

    # Suppress output
    show_output = False

    # Prepare empty array to store output values
    mte_boot = np.zeros([gridsize, nbootstraps])

    # Load the baseline data
    data = read_data(dict_["ESTIMATION"]["file"])

    counter = 0
    while counter < nbootstraps:
        boot_data = resample(data,
                             replace=True,
                             n_samples=len(data),
                             random_state=None)

        # Process the inputs for the decision equation
        indicator, D, Z = process_choice_data(dict_, boot_data)

        # Estimate propensity score P(z)
        ps = estimate_treatment_propensity(D, Z, logit, show_output)

        if isinstance(ps, np.ndarray):
            # Define common support and trim the data, if trim=True
            boot_data, ps = trim_support(
                dict_,
                boot_data,
                logit,
                ps,
                indicator,
                nbins,
                trim,
                reestimate_p,
                show_output,
            )

            # Estimate the observed and unobserved component of the MTE
            X, b1_b0, b0, mte_u = mte_components(dict_, boot_data, ps,
                                                 rbandwidth, bandwidth,
                                                 gridsize, a, b, show_output)

            # Calculate the MTE component that depends on X
            mte_x = np.dot(X, b1_b0).mean(axis=0)

            # Put the MTE together
            mte = mte_x + mte_u
            mte_boot[:, counter] = mte

            counter += 1

        else:
            continue

    return mte_boot
Example #13
def bootstrap(init_file, nbootstraps, show_output=False):
    """
    This function generates bootstrapped standard errors
    given an init_file and the number of bootstrap draws.
    """
    check_presence_init(init_file)
    dict_ = read(init_file)

    nbins = dict_["ESTIMATION"]["nbins"]
    trim = dict_["ESTIMATION"]["trim_support"]
    rbandwidth = dict_["ESTIMATION"]["rbandwidth"]
    bandwidth = dict_["ESTIMATION"]["bandwidth"]
    gridsize = dict_["ESTIMATION"]["gridsize"]
    a = dict_["ESTIMATION"]["ps_range"][0]
    b = dict_["ESTIMATION"]["ps_range"][1]

    logit = dict_["ESTIMATION"]["logit"]

    # Distribute initialization information.
    data = read_data(dict_["ESTIMATION"]["file"])

    # Prepare empty arrays to store output values
    mte_boot = np.zeros([gridsize, nbootstraps])

    counter = 0
    while counter < nbootstraps:
        boot = resample(data, replace=True, n_samples=len(data), random_state=None)

        # Process data for the semiparametric estimation.
        indicator = dict_["ESTIMATION"]["indicator"]
        D = boot[indicator].values
        Z = boot[dict_["CHOICE"]["order"]]

        # The Local Instrumental Variables (LIV) approach

        # 1. Estimate propensity score P(z)
        ps = estimate_treatment_propensity(D, Z, logit, show_output)

        if isinstance(ps, np.ndarray):  # & (np.min(ps) <= 0.3) & (np.max(ps) >= 0.7):

            # 2a. Find common support
            treated, untreated, common_support = define_common_support(
                ps, indicator, boot, nbins, show_output
            )

            # 2b. Trim the data
            if trim is True:
                boot, ps = trim_data(ps, common_support, boot)

            # 3. Double Residual Regression
            # Attach the propensity score and sort the data by it
            boot["ps"] = ps
            boot = boot.sort_values(by="ps", ascending=True)
            ps = np.sort(ps)

            X = boot[dict_["TREATED"]["order"]]
            Xp = construct_Xp(X, ps)
            Y = boot[[dict_["ESTIMATION"]["dependent"]]]

            b0, b1_b0 = double_residual_reg(ps, X, Xp, Y, rbandwidth, show_output)

            # Turn the X, Xp, and Y DataFrames into np.ndarrays
            X_arr = np.array(X)
            Xp_arr = np.array(Xp)
            Y_arr = np.array(Y).ravel()

            # 4. Compute the unobserved part of Y
            Y_tilde = Y_arr - np.dot(X_arr, b0) - np.dot(Xp_arr, b1_b0)

            # 5. Estimate mte_u, the unobserved component of the MTE,
            # through a locally quadratic regression
            quantiles, mte_u = locpoly(ps, Y_tilde, 1, 2, bandwidth, gridsize, a, b)

            # 6. construct MTE
            # Calculate the MTE component that depends on X
            mte_x = np.dot(X, b1_b0).mean(axis=0)

            # Put the MTE together
            mte = mte_x + mte_u

            mte_boot[:, counter] = mte

            counter += 1

        else:
            continue

    return mte_boot