Пример #1
0
def test8():
    """Check that reading, printing and re-reading an initialization file is stable.

    In each of ten rounds a random initialization file is generated and read, the
    resulting dictionary is handed to ``print_dict`` and the file is read a second
    time. Both dictionaries are required to be equal.
    """
    init_file = 'test.grmpy.ini'
    for _ in range(10):
        generate_random_dict()
        before_print = read(init_file)
        print_dict(before_print)
        after_print = read(init_file)
        np.testing.assert_equal(before_print, after_print)
Пример #2
0
def test9():
    """Run the generate/read/simulate/estimate chain with state-specific covariates.

    The constraints request a non-deterministic model with 1000 agents, different
    covariates per treatment state ('STATE_DIFF') and overlapping covariates
    ('OVERLAP'). The test passes if none of the steps raises.
    """
    init_file = 'test.grmpy.ini'
    for _ in range(5):
        # A fresh constraints dictionary is built for every round.
        constr = {
            'DETERMINISTIC': False,
            'AGENT': 1000,
            'STATE_DIFF': True,
            'OVERLAP': True,
        }
        generate_random_dict(constr)
        read(init_file)
        simulate(init_file)
        estimate(init_file)

    cleanup()
Пример #3
0
def simulate(init_file):
    """Simulate a user-specified version of the generalized Roy model.

    The initialization file is read, the random seed it specifies is set, and the
    unobservables, covariates and outcomes are simulated in that order. The data is
    handed to ``write_output``; for a model whose 'DETERMINISTIC' flag is ``False``
    the criterion function value at the 'init' start values is stored under
    ``init_dict['AUX']['criteria_value']`` before ``print_info`` is called.

    Returns the object produced by ``write_output``.
    """
    init_dict = read(init_file)

    # Seeding comes first so that every draw below can be recomputed.
    np.random.seed(init_dict['SIMULATION']['seed'])

    # Unobservables are simulated before the covariates; the call order is kept
    # fixed because all draws share the global random state.
    unobs, choice_unobs = simulate_unobservables(init_dict)
    covars_outcome = simulate_covariates(init_dict, 'TREATED')
    covars_cost = simulate_covariates(init_dict, 'COST')

    # Endogenous variables of the model
    outcome, treatment, outcome_1, outcome_0 = simulate_outcomes(
        init_dict, covars_outcome, covars_cost, unobs)

    # Write output file
    df = write_output(init_dict, outcome, treatment, covars_outcome, covars_cost,
                      outcome_1, outcome_0, unobs, choice_unobs)

    # The criterion value is only evaluated if the flag is exactly False.
    deterministic = init_dict['DETERMINISTIC']
    if deterministic is False:
        start = start_values(init_dict, df, 'init')
        criteria_value = calculate_criteria(init_dict, df, start)
        init_dict['AUX']['criteria_value'] = criteria_value

    # Print log file
    print_info(init_dict, df)

    return df
Пример #4
0
def test5():
    """Check the simulation and the MTE when both covariances with V coincide.

    After imposing that the covariance between U1 and V equals the one between U0
    and V, the simulation has to run and ``mte_information`` has to return the same
    value for every quantile.
    """
    init_file = 'test.grmpy.ini'
    for _ in range(10):
        generate_random_dict()
        init_dict = read(init_file)

        # Entry 4 of the distribution parameters overwrites entry 2, which makes the
        # covariance of both potential-outcome shocks with the choice shock identical.
        dist_params = init_dict['DIST']['all']
        dist_params[2] = dist_params[4]

        # Distribute information
        beta_untreated = init_dict['UNTREATED']['all']
        beta_treated = init_dict['TREATED']['all']

        # Construct auxiliary information
        cov = construct_covariance_matrix(init_dict)

        df = simulate(init_file)
        covariates = df.filter(regex=r'^X\_', axis=1)

        quantiles = [0.01]
        quantiles.extend(np.arange(0.05, 1, 0.05))
        quantiles.append(0.99)

        mte = mte_information(beta_treated, beta_untreated, cov, quantiles, covariates)

        # A single distinct value means the MTE is constant across quantiles.
        np.testing.assert_equal(len(set(mte)), 1)
Пример #5
0
def test4():
    """Check that a generated initialization dictionary survives a print/read cycle.

    A random initialization dictionary is generated, printed to a file named after
    its 'source' entry and read back. Coefficients, variable types and the
    simulation settings of both dictionaries are compared.
    """
    for _ in range(10):
        gen_dict = generate_random_dict()
        init_file_name = gen_dict['SIMULATION']['source']
        print_dict(gen_dict, init_file_name)
        imp_dict = read(init_file_name + '.grmpy.ini')

        for key_ in ['TREATED', 'UNTREATED', 'COST', 'DIST']:
            np.testing.assert_array_almost_equal(
                gen_dict[key_]['coeff'], imp_dict[key_]['all'], decimal=4)

            # Variable types only exist for the three coefficient sections.
            if key_ == 'DIST':
                continue

            for idx, gen_type in enumerate(gen_dict[key_]['types']):
                if isinstance(gen_type, str):
                    if not gen_type == imp_dict[key_]['types'][idx]:
                        raise AssertionError()
                elif isinstance(gen_type, list):
                    # List entries hold a label first and a numeric value second.
                    if not gen_type[0] == imp_dict[key_]['types'][idx][0]:
                        raise AssertionError()
                    np.testing.assert_array_almost_equal(
                        gen_type[1], imp_dict[key_]['types'][idx][1], 4)

        simulation_gen, simulation_imp = gen_dict['SIMULATION'], imp_dict['SIMULATION']
        for key_ in ['source', 'agents', 'seed']:
            if not simulation_gen[key_] == simulation_imp[key_]:
                raise AssertionError()
Пример #6
0
def create_data():
    """Create a data set based on the results from "Caineiro 2011" (sic).

    The covariates are taken from the pickled mock data set, the unobservables are
    simulated from the specification in "reliability.grmpy.yml", and the potential
    outcomes, the treatment indicator and the observed outcome are derived from them.
    The extended data set overwrites the pickle file and is returned.
    """
    # Read in initialization file and the data set
    init_dict = read("reliability.grmpy.yml")
    df = pd.read_pickle("aer-simulation-mock.pkl")

    # Distribute information
    indicator = init_dict["ESTIMATION"]["indicator"]
    dep = init_dict["ESTIMATION"]["dependent"]
    label_out = init_dict["TREATED"]["order"]
    label_choice = init_dict["CHOICE"]["order"]

    # Set random seed to ensure recomputability
    np.random.seed(init_dict["SIMULATION"]["seed"])

    # Simulate unobservables
    unobs = simulate_unobservables(init_dict)
    df["U1"], df["U0"], df["V"] = unobs["U1"], unobs["U0"], unobs["V"]

    # Potential outcomes: linear index in the outcome covariates plus the shock
    index_treated = np.dot(df[label_out], init_dict["TREATED"]["params"])
    df[dep + "1"] = index_treated + df["U1"]
    index_untreated = np.dot(df[label_out], init_dict["UNTREATED"]["params"])
    df[dep + "0"] = index_untreated + df["U0"]

    # Treatment is chosen whenever the choice index exceeds V
    index_choice = np.dot(df[label_choice], init_dict["CHOICE"]["params"])
    df[indicator] = np.array(index_choice - df["V"] > 0).astype(int)

    # Observed outcome: the potential outcome of the chosen state
    treated_part = df[indicator] * df[dep + "1"]
    untreated_part = (1 - df[indicator]) * df[dep + "0"]
    df[dep] = treated_part + untreated_part

    # Save the data
    df.to_pickle("aer-simulation-mock.pkl")

    return df
Пример #7
0
def test1():
    """Check the relationships in the simulated data sets.

    For a deterministic and a non-deterministic setting, the potential outcomes have
    to equal the linear index of the state-specific covariates plus the respective
    unobservable, and the observed outcome has to equal the potential outcome of the
    chosen treatment state.
    """
    init_file = 'test.grmpy.ini'
    constr = dict()
    for is_deterministic in (True, False):
        constr['DETERMINISTIC'] = is_deterministic
        for _ in range(10):
            generate_random_dict(constr)
            df = simulate(init_file)
            dict_ = read(init_file)

            # 'order' holds one-based positions into the list of variable names.
            expected = {}
            for state, shock in (('TREATED', df.U1), ('UNTREATED', df.U0)):
                labels = [dict_['varnames'][pos - 1] for pos in dict_[state]['order']]
                weighted = dict_[state]['all'] * df[labels]
                expected[state] = weighted.sum(axis=1) + shock

            np.testing.assert_array_almost_equal(df.Y1, expected['TREATED'], decimal=5)
            np.testing.assert_array_almost_equal(df.Y0, expected['UNTREATED'], decimal=5)
            np.testing.assert_array_equal(df.Y[df.D == 1], df.Y1[df.D == 1])
            np.testing.assert_array_equal(df.Y[df.D == 0], df.Y0[df.D == 0])
Пример #8
0
def test5():
    """Check the simulation and the MTE when both covariances with V coincide.

    After imposing that the covariance between U1 and V equals the one between U0
    and V, the simulation has to run and ``mte_information`` has to return the same
    value for every quantile.
    """
    init_file = "test.grmpy.yml"
    for _ in range(10):
        generate_random_dict()
        init_dict = read(init_file)

        # Entry 4 of the distribution parameters overwrites entry 2, which makes the
        # covariance of both potential-outcome shocks with the choice shock identical.
        dist_params = init_dict["DIST"]["params"]
        dist_params[2] = dist_params[4]

        # Distribute information
        beta_untreated = init_dict["UNTREATED"]["params"]
        beta_treated = init_dict["TREATED"]["params"]

        # Construct auxiliary information
        cov = construct_covariance_matrix(init_dict)

        df = simulate(init_file)

        # Every covariate that enters either outcome equation, without duplicates
        labels = list(
            set(init_dict["TREATED"]["order"] + init_dict["UNTREATED"]["order"]))
        covariates = df[labels]

        quantiles = [0.01]
        quantiles.extend(np.arange(0.05, 1, 0.05))
        quantiles.append(0.99)

        mte = mte_information(beta_treated, beta_untreated, cov, quantiles,
                              covariates, init_dict)

        # A single distinct value means the MTE is constant across quantiles.
        np.testing.assert_equal(len(set(mte)), 1)
Пример #9
0
def check_vault(num_tests=100):
    """Check the regression vault that is distributed as part of the package.

    Each vault entry holds a summary statistic, an initialization dictionary and a
    criterion function value. For every selected entry the dictionary is printed to
    an initialization file, the model is simulated, and both the criterion value at
    the 'init' start values and the sum over the simulated data are compared with
    the stored numbers.

    Parameters
    ----------
    num_tests: int
        Number of vault entries to evaluate. If it exceeds the size of the vault,
        the complete vault is run. Otherwise ``num_tests`` entries are drawn with
        ``np.random.choice``, i.e. with replacement (numpy's default).
    """
    fname = (os.path.dirname(grmpy.__file__) +
             "/test/resources/old_regression_vault.grmpy.json")
    # The context manager closes the file handle once the vault is loaded.
    with open(fname) as vault_file:
        tests = json.load(vault_file)

    if num_tests > len(tests):
        print("The specified number of evaluations is larger than the number"
              " of entries in the regression_test vault.\n"
              "Therefore the test runs the complete test battery.")
    else:
        tests = [tests[i] for i in np.random.choice(len(tests), num_tests)]

    for test in tests:
        stat, dict_, criteria = test
        print_dict(dict_transformation(dict_))
        init_dict = read("test.grmpy.yml")
        df = simulate("test.grmpy.yml")
        _, X1, X0, Z1, Z0, Y1, Y0 = process_data(df, init_dict)
        x0 = start_values(init_dict, df, "init")
        criteria_ = calculate_criteria(init_dict, X1, X0, Z1, Z0, Y1, Y0, x0)
        np.testing.assert_almost_equal(criteria_, criteria)
        np.testing.assert_almost_equal(np.sum(df.sum()), stat)
        cleanup("regression")
Пример #10
0
def simulate(init_file):
    """Simulate a user-specified version of the generalized Roy model.

    The initialization file is read and checked for consistency, the random seed it
    specifies is set, and the unobservables, covariates and outcomes are simulated in
    that order. The data is handed to ``write_output``; unless the 'DETERMINISTIC'
    flag is truthy, the criterion function value at the 'init' start values is stored
    under ``init_dict['AUX']['criteria_value']`` before ``print_info`` is called.

    Returns the object produced by ``write_output``.
    """
    init_dict = read(init_file)

    # Basic consistency checks regarding the user's request
    check_initialization_dict(init_dict)

    # Seeding comes first so that every draw below can be recomputed.
    np.random.seed(init_dict['SIMULATION']['seed'])

    # Unobservables are simulated before the covariates; the call order is kept
    # fixed because all draws share the global random state.
    unobs, choice_unobs = simulate_unobservables(init_dict)
    covariates = simulate_covariates(init_dict)

    # Endogenous variables of the model
    outcome, treatment, outcome_1, outcome_0 = simulate_outcomes(
        init_dict, covariates, unobs, choice_unobs)

    # Write output file
    df = write_output(init_dict, outcome, treatment, covariates, outcome_1,
                      outcome_0, unobs, choice_unobs)

    # Criterion function value at the initialization values
    deterministic = init_dict['DETERMINISTIC']
    if not deterministic:
        start = start_values(init_dict, df, 'init')
        criteria_value = calculate_criteria(init_dict, df, start)
        init_dict['AUX']['criteria_value'] = criteria_value

    # Print log file
    print_info(init_dict, df)

    return df
Пример #11
0
def create_data():
    """Create a data set based on the results from "Caineiro 2011" (sic).

    The covariates are taken from the pickled mock data set, the unobservables are
    simulated from the specification in 'reliability.grmpy.ini', and the potential
    outcomes, the treatment indicator and the observed outcome are derived from them.
    The extended data set overwrites the pickle file and is returned.
    """
    # Read in initialization file and the data set
    init_dict = read('reliability.grmpy.ini')
    df = pd.read_pickle('aer-simulation-mock.pkl')

    def _labels(section):
        # 'order' holds one-based positions into the list of variable names.
        return [init_dict['varnames'][pos - 1] for pos in init_dict[section]['order']]

    # Distribute information
    indicator = init_dict['ESTIMATION']['indicator']
    dep = init_dict['ESTIMATION']['dependent']
    label_out = _labels('TREATED')
    label_choice = _labels('CHOICE')

    # Set random seed to ensure recomputability
    np.random.seed(init_dict['SIMULATION']['seed'])

    # Simulate unobservables; the first two columns of U are stored as U1 and U0
    unobs, choice_unobs = simulate_unobservables(init_dict)
    df['U1'], df['U0'], df['V'] = unobs[:, 0], unobs[:, 1], choice_unobs

    # Potential outcomes: linear index in the outcome covariates plus the shock
    index_treated = np.dot(df[label_out], init_dict['TREATED']['all'])
    df[dep + '1'] = index_treated + df['U1']
    index_untreated = np.dot(df[label_out], init_dict['UNTREATED']['all'])
    df[dep + '0'] = index_untreated + df['U0']

    # Treatment is chosen whenever the choice index exceeds V
    index_choice = np.dot(df[label_choice], init_dict['CHOICE']['all'])
    df[indicator] = np.array(index_choice - df['V'] > 0).astype(int)

    # Observed outcome: the potential outcome of the chosen state
    treated_part = df[indicator] * df[dep + '1']
    untreated_part = (1 - df[indicator]) * df[dep + '0']
    df[dep] = treated_part + untreated_part

    # Save the data
    df.to_pickle('aer-simulation-mock.pkl')

    return df
Пример #12
0
def test1():
    """Check the relationships in the simulated data sets.

    For a deterministic and a non-deterministic setting, the potential outcomes have
    to equal the linear index of the state-specific covariates plus the respective
    unobservable, and the observed outcome has to equal the potential outcome of the
    chosen treatment state.
    """
    constr = dict()
    # Both settings are exercised. The flag used to be set to True in either branch,
    # which silently skipped the non-deterministic case.
    for is_deterministic in (True, False):
        constr["DETERMINISTIC"] = is_deterministic
        for _ in range(10):
            generate_random_dict(constr)
            df = simulate("test.grmpy.yml")
            dict_ = read("test.grmpy.yml")
            x_treated = df[dict_["TREATED"]["order"]]
            y_treated = (pd.DataFrame.sum(
                dict_["TREATED"]["params"] * x_treated, axis=1) + df.U1)
            x_untreated = df[dict_["UNTREATED"]["order"]]
            y_untreated = (pd.DataFrame.sum(
                dict_["UNTREATED"]["params"] * x_untreated, axis=1) + df.U0)

            np.testing.assert_array_almost_equal(df.Y1, y_treated, decimal=5)
            np.testing.assert_array_almost_equal(df.Y0, y_untreated, decimal=5)
            np.testing.assert_array_equal(df.Y[df.D == 1], df.Y1[df.D == 1])
            np.testing.assert_array_equal(df.Y[df.D == 0], df.Y0[df.D == 0])
Пример #13
0
def fit(init_file, semipar=False):
    """Estimate the MTE parametrically or via local instrumental variables (LIV).

    Parameters
    ----------
    init_file: yaml
        Initialization file containing parameters for the estimation
        process.
    semipar: bool
        If truthy, the semiparametric LIV estimator (``semipar_fit``) is used;
        otherwise the parametric normal model (``par_fit``) is estimated.

    Returns
    -------
    rslt: dict
        Result dictionary containing
        - quantiles
        - mte
        - mte_x
        - mte_u
        - mte_min
        - mte_max
        - X
        - b1
        - b0
    """
    # Load the estimation file
    dict_ = read(init_file, semipar)

    # Consistency checks that apply to both estimators
    check_presence_estimation_dataset(dict_)
    check_est_init_dict(dict_)

    use_semipar = bool(semipar)

    # The parametric normal model requires some extra checks.
    if not use_semipar:
        check_par_init_file(dict_)

    # Distribute initialization information.
    data = read_data(dict_["ESTIMATION"]["file"])
    dict_, data = check_append_constant(init_file, dict_, data, semipar=use_semipar)

    estimator = semipar_fit if use_semipar else par_fit
    return estimator(dict_, data)
Пример #14
0
def par_fit(init_file):
    """Estimate the coefficients of the simulated data set.

    The initialization file is read, the start values are determined and the
    criterion function is minimized with the optimizer configured in the file. With
    a maximum of zero iterations no optimization takes place and the results are
    built from the start values. A log file is printed, and a comparison is written
    unless ``dict_["ESTIMATION"]["comparison"]`` is present and equal to 0.

    Returns the result object produced by ``adjust_output``.
    """
    check_presence_init(init_file)

    dict_ = read(init_file)
    np.random.seed(dict_["SIMULATION"]["seed"])

    # Basic consistency checks regarding the user's request.
    check_presence_estimation_dataset(dict_)
    # NOTE: check_initialization_dict2(dict_) and check_init_file(dict_) are
    # currently disabled.

    # Distribute initialization information.
    data = read_data(dict_["ESTIMATION"]["file"])
    num_treated = dict_["AUX"]["num_covars_treated"]
    # Cumulative count: treated covariates plus untreated covariates.
    num_untreated = num_treated + dict_["AUX"]["num_covars_untreated"]

    _, X1, X0, Z1, Z0, Y1, Y0 = process_data(data, dict_)
    design = (X1, X0, Z1, Z0, Y1, Y0)

    # Without any iterations the values of the initialization file are used.
    no_iterations = dict_["ESTIMATION"]["maxiter"] == 0
    option = "init" if no_iterations else dict_["ESTIMATION"]["start"]

    # Define starting values
    x0 = start_values(dict_, data, option)
    opts, method = optimizer_options(dict_)
    dict_["AUX"]["criteria"] = calculate_criteria(dict_, *design, x0)
    dict_["AUX"]["starting_values"] = backward_transformation(x0)
    rslt_dict = bfgs_dict()

    if opts["maxiter"] == 0:
        rslt = adjust_output(None, dict_, x0, *design, rslt_dict)
    else:
        optimizer_args = (dict_, *design, num_treated, num_untreated, rslt_dict)
        opt_rslt = minimize(
            minimizing_interface,
            x0,
            args=optimizer_args,
            method=method,
            options=opts,
        )
        rslt = adjust_output(opt_rslt, dict_, opt_rslt["x"], *design, rslt_dict)

    # Print output files
    print_logfile(dict_, rslt)

    # The comparison is only skipped if it is explicitly switched off with 0.
    comparison_disabled = (
        "comparison" in dict_["ESTIMATION"].keys()
        and dict_["ESTIMATION"]["comparison"] == 0
    )
    if not comparison_disabled:
        write_comparison(data, rslt)

    return rslt
Пример #15
0
def get_effect_grmpy(file):
    """Return the average treatment effect (ATE) of the data set.

    The ATE is computed as the product of the covariate means and the difference
    between the treated and untreated coefficients.

    Parameters
    ----------
    file:
        Not used. The initialization file "reliability.grmpy.yml" and the data set
        "aer-simulation-mock.pkl" are hard-coded.

    Returns
    -------
    ATE:
        Result of ``np.dot`` of the covariate means and the coefficient difference.
    """
    dict_ = read("reliability.grmpy.yml")
    df = pd.read_pickle("aer-simulation-mock.pkl")
    beta_diff = dict_["TREATED"]["params"] - dict_["UNTREATED"]["params"]
    covars = dict_["TREATED"]["order"]

    # Column-wise means are requested explicitly. np.mean(DataFrame) forwards
    # axis=None, which pandas >= 2.0 reduces over both axes to a single scalar.
    covar_means = df[covars].mean(axis=0)
    ATE = np.dot(covar_means, beta_diff)

    return ATE
Пример #16
0
def get_effect_grmpy(file):
    """Return the average treatment effect (ATE) of the data set.

    The ATE is computed as the product of the covariate means and the difference
    between the treated and untreated coefficients.

    Parameters
    ----------
    file:
        grmpy initialization file. The data set is read from the pickle file named
        after its ``["SIMULATION"]["source"]`` entry plus ".grmpy.pkl".

    Returns
    -------
    ATE:
        Result of ``np.dot`` of the covariate means and the coefficient difference.
    """
    dict_ = read(file)
    df = pd.read_pickle(dict_["SIMULATION"]["source"] + ".grmpy.pkl")
    beta_diff = dict_["TREATED"]["params"] - dict_["UNTREATED"]["params"]
    covars = dict_["TREATED"]["order"]

    # Column-wise means are requested explicitly. np.mean(DataFrame) forwards
    # axis=None, which pandas >= 2.0 reduces over both axes to a single scalar.
    covar_means = df[covars].mean(axis=0)
    ATE = np.dot(covar_means, beta_diff)

    return ATE
Пример #17
0
def get_effect_grmpy(file):
    """Return the average treatment effect (ATE) of the data set.

    The ATE is computed as the product of the covariate means and the difference
    between the treated and untreated coefficients.

    Parameters
    ----------
    file:
        Not used. The initialization file 'reliability.grmpy.ini' and the data set
        'aer-simulation-mock.pkl' are hard-coded.

    Returns
    -------
    ATE:
        Result of ``np.dot`` of the covariate means and the coefficient difference.
    """
    dict_ = read('reliability.grmpy.ini')
    df = pd.read_pickle('aer-simulation-mock.pkl')
    beta_diff = dict_['TREATED']['all'] - dict_['UNTREATED']['all']
    # 'order' holds one-based positions into the list of variable names.
    covars = [dict_['varnames'][j - 1] for j in dict_['TREATED']['order']]

    # Column-wise means are requested explicitly. np.mean(DataFrame) forwards
    # axis=None, which pandas >= 2.0 reduces over both axes to a single scalar.
    covar_means = df[covars].mean(axis=0)
    ATE = np.dot(covar_means, beta_diff)

    return ATE
Пример #18
0
def test9():
    """Run the generate/read/simulate/fit chain with state-specific covariates.

    The constraints request a non-deterministic model with 1000 agents, different
    covariates per treatment state ("STATE_DIFF") and overlapping covariates
    ("OVERLAP"). The test passes if none of the steps raises.
    """
    init_file = "test.grmpy.yml"
    for _ in range(5):
        # A fresh constraints dictionary is built for every round.
        constr = {
            "DETERMINISTIC": False,
            "AGENT": 1000,
            "STATE_DIFF": True,
            "OVERLAP": True,
        }
        generate_random_dict(constr)
        read(init_file)
        simulate(init_file)
        fit(init_file)

    cleanup()
Пример #19
0
def refactor_results(dict_, file):
    """Print an initialization file that carries the results held in ``dict_``.

    The initialization file ``file`` is read as a template. For the coefficient
    sections the 'all' entry is replaced by a 'coeff' entry holding the values of
    ``dict_``; the 'DIST' section receives the last six entries of
    ``dict_['AUX']['x_internal']``. The result is printed under the name 'test'.
    """
    pseudo = read(file)

    for section in ('TREATED', 'UNTREATED', 'COST'):
        target = pseudo[section]
        target['coeff'] = dict_[section]['all'].tolist()
        del target['all']

    pseudo['DIST']['coeff'] = dict_['AUX']['x_internal'][-6:]

    print_dict(pseudo, 'test')
Пример #20
0
def _create_data(file):
    """
    Create the data set used in the Monte Carlo simulation.

    The unobservables, the choice and the outcome are simulated for each
    individual based on the grmpy initialization file. The data is both
    returned as a pandas.DataFrame and saved locally in pickle format,
    overwriting the file the covariates were read from.

    Parameters
    ----------
    file: yaml
        grmpy initialization file.

    Returns
    -------
    df: pandas.DataFrame
        Data set extended by the unobservables, both potential outcomes,
        the treatment indicator and the observed outcome.
    """
    # Read in initialization file and the data set
    init_dict = read(file)
    df = pd.read_pickle(init_dict["SIMULATION"]["source"] + ".grmpy.pkl")

    # Distribute information
    indicator = init_dict["ESTIMATION"]["indicator"]
    dep = init_dict["ESTIMATION"]["dependent"]
    label_out = init_dict["TREATED"]["order"]
    label_choice = init_dict["CHOICE"]["order"]

    # Set random seed to ensure recomputability
    np.random.seed(init_dict["SIMULATION"]["seed"])

    # Simulate unobservables
    unobs = simulate_unobservables(init_dict)
    df["U1"], df["U0"], df["V"] = unobs["U1"], unobs["U0"], unobs["V"]

    # Potential outcomes: linear index in the outcome covariates plus the shock
    index_treated = np.dot(df[label_out], init_dict["TREATED"]["params"])
    df[dep + "1"] = index_treated + df["U1"]
    index_untreated = np.dot(df[label_out], init_dict["UNTREATED"]["params"])
    df[dep + "0"] = index_untreated + df["U0"]

    # Treatment is chosen whenever the choice index exceeds V
    index_choice = np.dot(df[label_choice], init_dict["CHOICE"]["params"])
    df[indicator] = np.array(index_choice - df["V"] > 0).astype(int)

    # Observed outcome: the potential outcome of the chosen state
    treated_part = df[indicator] * df[dep + "1"]
    untreated_part = (1 - df[indicator]) * df[dep + "0"]
    df[dep] = treated_part + untreated_part

    # Save the data
    df.to_pickle(init_dict["SIMULATION"]["source"] + ".grmpy.pkl")

    return df
Пример #21
0
def test4():
    """Check that a generated initialization dictionary survives a print/read cycle.

    A random initialization dictionary is generated, printed to a file named after
    its "source" entry and read back. Parameters, covariate orders, variable types
    and the simulation and estimation settings of both dictionaries are compared.
    """
    for _ in range(10):
        gen_dict = generate_random_dict()
        init_file_name = gen_dict["SIMULATION"]["source"]
        print_dict(gen_dict, init_file_name)
        imp_dict = read(init_file_name + ".grmpy.yml")

        for section in ["TREATED", "UNTREATED", "CHOICE", "DIST"]:
            np.testing.assert_array_almost_equal(gen_dict[section]["params"],
                                                 imp_dict[section]["params"],
                                                 decimal=4)
            if section in ["TREATED", "UNTREATED", "CHOICE"]:
                # The generated and the imported order have to agree. This check
                # used to compare each dictionary with itself and never failed.
                if not gen_dict[section]["order"] == imp_dict[section]["order"]:
                    raise AssertionError()

                for dict_ in (gen_dict, imp_dict):
                    order = dict_[section]["order"]
                    # No covariate may show up twice within a section.
                    if len(order) != len(set(order)):
                        raise AssertionError()
                    # "X1" has to lead every section.
                    if order[0] != "X1":
                        raise AssertionError()

        for variable in gen_dict["VARTYPES"].keys():
            if variable not in imp_dict["VARTYPES"].keys():
                raise AssertionError()

            if gen_dict["VARTYPES"][variable] != imp_dict["VARTYPES"][variable]:
                raise AssertionError()

        if gen_dict["VARTYPES"]["X1"] != "nonbinary":
            raise AssertionError()

        for subkey in ["source", "agents", "seed"]:
            if not gen_dict["SIMULATION"][subkey] == imp_dict["SIMULATION"][subkey]:
                raise AssertionError()

        for subkey in [
                "agents",
                "file",
                "optimizer",
                "start",
                "maxiter",
                "dependent",
                "indicator",
                "comparison",
                "output_file",
        ]:
            if not gen_dict["ESTIMATION"][subkey] == imp_dict["ESTIMATION"][subkey]:
                raise AssertionError()
Пример #22
0
def monte_carlo(file, grid_points):
    """Estimate the ATE for samples with different correlations between U1 and V.

    For each correlation on an evenly spaced grid between 0 and 0.99 a data set is
    created and the ATE is estimated via grmpy and via OLS. The true ATE of the
    simulated sample is recorded alongside.

    Parameters
    ----------
    file:
        Initialization file whose specification is read and passed to
        ``update_correlation_structure`` in every round.
    grid_points: int
        Number of correlation values on the grid.

    Returns
    -------
    effects: dict
        Lists of effects under the keys 'grmpy', 'ols' and 'true', one entry per
        grid point.
    """
    # One list of effects per estimation strategy
    effects = {key_: [] for key_ in ['grmpy', 'ols', 'true']}

    # Loop over different correlations between V and U_1
    for rho in np.linspace(0.00, 0.99, grid_points):

        # Readjust the initialization file values to add correlation
        model_spec = read(file)
        sim_spec = read('reliability.grmpy.ini')
        # 'order' holds one-based positions into the list of variable names.
        X = [sim_spec['varnames'][j - 1] for j in sim_spec['TREATED']['order']]
        update_correlation_structure(model_spec, rho)

        # Simulate a data set and specify exogenous and endogenous variables
        df_mc = create_data()
        endog, exog, exog_ols = df_mc['wage'], df_mc[X], df_mc[['state'] + X]

        # Calculate true average treatment effect
        ATE = np.mean(df_mc['wage1'] - df_mc['wage0'])
        effects['true'] += [ATE]

        # Estimate via grmpy
        rslt = estimate('reliability.grmpy.ini')
        beta_diff = rslt['TREATED']['all'] - rslt['UNTREATED']['all']
        # Column-wise means are requested explicitly. np.mean(DataFrame) forwards
        # axis=None, which pandas >= 2.0 reduces over both axes to a single scalar.
        stat = np.dot(exog.mean(axis=0), beta_diff)

        effects['grmpy'] += [stat]

        # Estimate via OLS; 'state' is the first regressor, so its coefficient is
        # the first parameter. It is accessed by position explicitly because the
        # parameters are indexed by the regressor names.
        ols = sm.OLS(endog, exog_ols).fit()
        stat = ols.params.iloc[0]

        effects['ols'] += [stat]

    return effects
Пример #23
0
def monte_carlo(file, grid_points):
    """Estimate the ATE for samples with different correlations between U1 and V.

    For each correlation on an evenly spaced grid between 0 and 0.99 a data set is
    created and the ATE is estimated via grmpy and via OLS. The true ATE of the
    simulated sample is recorded alongside.

    Parameters
    ----------
    file:
        Initialization file whose specification is read and passed to
        ``update_correlation_structure`` in every round.
    grid_points: int
        Number of correlation values on the grid.

    Returns
    -------
    effects: dict
        Lists of effects under the keys "grmpy", "ols" and "true", one entry per
        grid point.
    """
    # One list of effects per estimation strategy
    effects = {key_: [] for key_ in ["grmpy", "ols", "true"]}

    # Loop over different correlations between V and U_1
    for rho in np.linspace(0.00, 0.99, grid_points):

        # Readjust the initialization file values to add correlation
        model_spec = read(file)
        sim_spec = read("reliability.grmpy.yml")
        X = sim_spec["TREATED"]["order"]
        update_correlation_structure(model_spec, rho)

        # Simulate a data set and specify exogenous and endogenous variables
        df_mc = create_data()
        endog, exog, exog_ols = df_mc["wage"], df_mc[X], df_mc[["state"] + X]

        # Calculate true average treatment effect
        ATE = np.mean(df_mc["wage1"] - df_mc["wage0"])
        effects["true"] += [ATE]

        # Estimate via grmpy
        rslt = fit("reliability.grmpy.yml")
        beta_diff = rslt["TREATED"]["params"] - rslt["UNTREATED"]["params"]
        # Column-wise means are requested explicitly. np.mean(DataFrame) forwards
        # axis=None, which pandas >= 2.0 reduces over both axes to a single scalar.
        stat = np.dot(exog.mean(axis=0), beta_diff)

        effects["grmpy"] += [stat]

        # Estimate via OLS; "state" is the first regressor, so its coefficient is
        # the first parameter. It is accessed by position explicitly because the
        # parameters are indexed by the regressor names.
        ols = sm.OLS(endog, exog_ols).fit()
        stat = ols.params.iloc[0]

        effects["ols"] += [stat]

    return effects
Пример #24
0
def simulate_test_data():
    """
    Simulate the data of the tutorial initialization file and read its dictionary.

    Both objects are passed through ``check_append_constant`` with ``semipar=True``
    before they are returned as ``(dict_, data)``.
    """
    fname = TEST_RESOURCES_DIR + "/tutorial.grmpy.yml"

    # The simulation runs before the file is read, as in the original workflow.
    simulated = simulate(fname)
    init_dict = read(fname)

    dict_, data = check_append_constant(fname, init_dict, simulated, semipar=True)
    return dict_, data
Пример #25
0
def plot_est_mte(rslt, file):
    """Plot the estimated marginal treatment effect (MTE) against the original one.

    The MTE is calculated for different quantiles of the unobservable V based on the
    estimation results and plotted together with its confidence bands. The original
    results are read from 'data/mte_original.json' and added for comparison.

    Parameters
    ----------
    rslt:
        Estimation results passed on to ``calculate_mte`` and ``calculate_cof_int``.
    file:
        Initialization file; its ``['ESTIMATION']['file']`` entry names the pickled
        data set.

    Returns
    -------
    mte: list
        The calculated MTE, each value divided by four.
    """
    init_dict = read(file)
    data_frame = pd.read_pickle(init_dict['ESTIMATION']['file'])

    # Define the quantiles and read in the original results. The context manager
    # closes the file handle once the results are loaded.
    quantiles = [0.0001] + np.arange(0.01, 1., 0.01).tolist() + [0.9999]
    with open('data/mte_original.json', 'r') as mte_file:
        mte_ = json.load(mte_file)
    # The file stores three curves, with the point estimate in the middle.
    mte_original = mte_[1]
    mte_original_d = mte_[0]
    mte_original_u = mte_[2]

    # Calculate the MTE and confidence intervals
    mte = calculate_mte(rslt, init_dict, data_frame, quantiles)
    # NOTE(review): the reason for rescaling by 1/4 is not visible here - confirm.
    mte = [i / 4 for i in mte]
    mte_up, mte_d = calculate_cof_int(rslt, init_dict, data_frame, mte,
                                      quantiles)

    # Plot both curves: grmpy in blue, original in orange, bands dotted
    ax = plt.figure(figsize=(17.5, 10)).add_subplot(111)

    ax.set_ylabel(r"$B^{MTE}$", fontsize=24)
    ax.set_xlabel("$u_D$", fontsize=24)
    ax.tick_params(axis='both', which='major', labelsize=18)
    ax.plot(quantiles, mte, label='grmpy $B^{MTE}$', color='blue', linewidth=4)
    ax.plot(quantiles, mte_up, color='blue', linestyle=':', linewidth=3)
    ax.plot(quantiles, mte_d, color='blue', linestyle=':', linewidth=3)
    ax.plot(quantiles,
            mte_original,
            label='original$B^{MTE}$',
            color='orange',
            linewidth=4)
    ax.plot(quantiles,
            mte_original_d,
            color='orange',
            linestyle=':',
            linewidth=3)
    ax.plot(quantiles,
            mte_original_u,
            color='orange',
            linestyle=':',
            linewidth=3)
    ax.set_ylim([-0.41, 0.51])
    ax.set_xlim([-0.005, 1.005])

    # The legend patches have to follow the line colours above: the grmpy curve is
    # drawn in blue and the original one in orange.
    blue_patch = mpatches.Patch(color='blue', label='grmpy $B^{MTE}$')
    orange_patch = mpatches.Patch(color='orange', label='original $B^{MTE}$')
    plt.legend(handles=[blue_patch, orange_patch], prop={'size': 16})
    plt.show()

    return mte
Пример #26
0
def plot_common_support(init_file, nbins, fs=24, output=False):
    """Plot propensity score histograms of the treated and untreated population.

    The histograms allow assessing the common support of the propensity score.

    Parameters
    ----------
    init_file:
        Initialization file naming the estimation data set, the treatment
        indicator, the choice covariates and the "logit" setting.
    nbins:
        Passed to ``plt.hist`` as ``bins``.
    fs: int
        Font size of the axis labels.
    output:
        Target passed to ``plt.savefig``; nothing is saved if it is ``False``.
    """
    dict_ = read(init_file)

    # Distribute initialization information.
    data = read_data(dict_["ESTIMATION"]["file"])

    # Process data for the semiparametric estimation.
    indicator = dict_["ESTIMATION"]["indicator"]
    D = data[indicator].values
    Z = data[dict_["CHOICE"]["order"]]
    logit = dict_["ESTIMATION"]["logit"]

    # Estimate the propensity score and store it next to the data
    data["ps"] = estimate_treatment_propensity(D, Z, logit, show_output=False)

    def _scores(group):
        # Propensity scores (second column) of all rows with the given indicator.
        subset = data[[indicator, "ps"]][data[indicator] == group].values
        return subset[:, 1].tolist()

    treated = _scores(1)
    untreated = _scores(0)

    # Each group is weighted by its own size, so the bars of a group sum to one.
    weights = [
        np.ones(len(treated)) / len(treated),
        np.ones(len(untreated)) / len(untreated),
    ]

    # Make the histogram using a list of lists
    fig = plt.figure(figsize=(17.5, 10))
    plt.hist(
        [treated, untreated],
        bins=nbins,
        weights=weights,
        density=0,
        alpha=0.55,
        label=["Treated", "Untreated"],
    )

    # Plot formatting
    plt.tick_params(axis="both", labelsize=14)
    plt.legend(loc="upper right", prop={"size": 14})
    plt.xticks(np.arange(0, 1.1, step=0.1))
    plt.grid(axis="y", alpha=0.25)
    plt.xlabel("$P$", fontsize=fs)
    plt.ylabel("$f(P)$", fontsize=fs)
    # NOTE: the plot title is currently disabled.

    if output is not False:
        plt.savefig(output, dpi=300)

    fig.show()
def plot_rslts(rslt, file):
    """Plot the replicated marginal treatment effect (MTE) against the original one.

    The MTE and its confidence bands are calculated from the estimation results and
    plotted together with the original results read from
    "resources/mte_original.json". The figure is saved to OUTPUT_DIR.

    Parameters
    ----------
    rslt:
        Estimation results passed on to ``mte_and_cof_int_par``.
    file:
        Initialization file; its ``["ESTIMATION"]["file"]`` entry names the pickled
        data set.
    """
    init_dict = read(file)
    data_frame = pd.read_pickle(init_dict["ESTIMATION"]["file"])

    # Read in the original results. The context manager closes the file handle
    # once the results are loaded.
    with open("resources/mte_original.json") as mte_file:
        mte_ = json.load(mte_file)
    # The file stores three curves, with the point estimate in the middle.
    mte_original = mte_[1]
    mte_original_d = mte_[0]
    mte_original_u = mte_[2]

    # Calculate the MTE and confidence intervals
    # NOTE(review): the meaning of the third argument (4) is not visible here.
    quantiles, mte, mte_up, mte_d = mte_and_cof_int_par(rslt, data_frame, 4)

    # Plot both curves: replication in blue, original in orange, bands dotted
    ax = plt.figure().add_subplot(111)

    ax.set_ylabel(r"$MTE$")
    ax.set_xlabel("$u_D$")
    ax.tick_params(axis="both", which="major", labelsize=18)
    ax.plot(quantiles, mte, label="grmpy MTE", color="blue", linewidth=4)
    ax.plot(quantiles, mte_up, color="blue", linestyle=":", linewidth=3)
    ax.plot(quantiles, mte_d, color="blue", linestyle=":", linewidth=3)
    ax.plot(quantiles,
            mte_original,
            label="original${MTE}$",
            color="orange",
            linewidth=4)
    ax.plot(quantiles,
            mte_original_d,
            color="orange",
            linestyle=":",
            linewidth=3)
    ax.plot(quantiles,
            mte_original_u,
            color="orange",
            linestyle=":",
            linewidth=3)
    ax.xaxis.set_ticks(np.arange(0, 1.1, step=0.1))
    ax.yaxis.set_ticks(np.arange(-0.5, 0.5, step=0.1))

    ax.set_ylim([-0.37, 0.47])
    ax.set_xlim([0, 1])
    ax.margins(x=0.003)
    ax.margins(y=0.03)

    # The legend patches have to follow the line colours above: the replicated
    # curve is drawn in blue and the original one in orange.
    blue_patch = mpatches.Patch(color="blue", label="replicated $MTE$")
    orange_patch = mpatches.Patch(color="orange", label="original $MTE$")
    plt.legend(handles=[blue_patch, orange_patch], prop={"size": 16})
    plt.savefig(OUTPUT_DIR +
                "/fig-marginal-benefit-parametric-replication.png",
                dpi=300)
Пример #28
0
def estimate(init_file):
    """Run the estimation that is specified in the initialization file.

    The file is read, the random seed is set, and the request is validated. Starting values
    are then built from the estimation data set. If the optimizer options carry a ``maxiter``
    of zero the starting values are reported directly, otherwise they are handed to
    ``minimize``. A log file is printed and, unless the ESTIMATION section sets
    ``comparison`` to 0, ``write_comparison`` is called. The adjusted result is returned.
    """
    check_presence_init(init_file)

    init = read(init_file)
    np.random.seed(init['SIMULATION']['seed'])

    # Basic consistency checks regarding the user's request.
    check_presence_estimation_dataset(init)
    check_initialization_dict(init)
    check_init_file(init)

    # Distribute initialization information. Without any iterations the starting values
    # are always taken with the 'init' option.
    settings = init['ESTIMATION']
    data_file = settings['file']
    option = 'init' if settings['maxiter'] == 0 else settings['start']

    # Read data frame
    data = read_data(data_file)

    # Define starting values and record them together with their criterion value.
    x0 = start_values(init, data, option)
    opts, method = optimizer_options(init)
    init['AUX']['criteria'] = calculate_criteria(init, data, x0)
    init['AUX']['starting_values'] = backward_transformation(x0)
    rslt_dict = bfgs_dict()

    if opts['maxiter'] != 0:
        opt_rslt = minimize(
            minimizing_interface,
            x0,
            args=(init, data, rslt_dict),
            method=method,
            options=opts,
        )
        rslt = adjust_output(opt_rslt, init, opt_rslt['x'], data, rslt_dict)
    else:
        # No optimization requested, the output is built from the starting values.
        rslt = adjust_output(None, init, x0, data, rslt_dict)

    # Print output files
    print_logfile(init, rslt)

    # The comparison is written by default and only skipped on an explicit 0.
    estimation = init['ESTIMATION']
    skip_comparison = 'comparison' in estimation and estimation['comparison'] == 0
    if not skip_comparison:
        write_comparison(init, data, rslt)

    return rslt
Пример #29
0
def parametric_mte(rslt, file):
    """Compute the marginal treatment effect and its confidence bounds on a quantile grid.

    The estimation data set named in the initialization file is loaded from its pickle. The
    grid starts at 0.0001, continues in steps of 0.01 from 0.01 towards 1.0, and ends at
    0.9999. The grid is returned together with the MTE and its upper and lower bound.
    """
    init_dict = read(file)
    data_frame = pd.read_pickle(init_dict["ESTIMATION"]["file"])

    # Build the quantile grid, padded with points close to both ends of the unit interval.
    quantiles = [0.0001]
    quantiles.extend(np.arange(0.01, 1.0, 0.01).tolist())
    quantiles.append(0.9999)

    # Point estimates first, the confidence bounds are derived from them.
    mte = calculate_mte(rslt, data_frame, quantiles)
    bounds = calculate_cof_int(rslt, init_dict, data_frame, mte, quantiles)
    mte_up, mte_d = bounds

    return quantiles, mte, mte_up, mte_d
Пример #30
0
def test2():
    """Check the simulated outcomes when selected coefficient vectors are set to zero.

    For every setup the coefficients of the listed sections of a random initialization
    dictionary are replaced by zeros. The dictionary is handed to print_dict, the file
    'test.grmpy.ini' is read back and a data set is simulated from it. A potential outcome
    whose section was zeroed has to equal its unobservable exactly, otherwise it has to match
    the weighted sum of the X covariates plus the unobservable up to five decimals.
    """
    # Each setup pairs its label with the sections whose coefficients are zeroed.
    setups = (
        ('ALL', ('TREATED', 'UNTREATED', 'COST')),
        ('TREATED', ('TREATED',)),
        ('UNTREATED', ('UNTREATED',)),
        ('COST', ('COST',)),
        ('TREATED & UNTREATED', ('TREATED', 'UNTREATED')),
    )

    for _ in range(10):
        for label, zeroed in setups:
            init = generate_random_dict(constraints(probability=0.0))

            for section in zeroed:
                init[section]['coeff'] = np.zeros(len(init[section]['coeff']))

            print_dict(init)
            init = read('test.grmpy.ini')
            df = simulate('test.grmpy.ini')
            covariates = df.filter(regex=r'^X\_', axis=1)

            # Potential outcome in the treated state.
            if 'TREATED' in zeroed:
                np.testing.assert_array_equal(df.Y1, df.U1)
            else:
                y_treated = (init['TREATED']['all'] * covariates).sum(axis=1) + df.U1
                np.testing.assert_array_almost_equal(df.Y1, y_treated, decimal=5)

            # Potential outcome in the untreated state.
            if 'UNTREATED' in zeroed:
                np.testing.assert_array_equal(df.Y0, df.U0)
            else:
                y_untreated = (init['UNTREATED']['all'] * covariates).sum(axis=1) + df.U0
                np.testing.assert_array_almost_equal(df.Y0, y_untreated, decimal=5)

            # Only this setup additionally compares the observed outcome to the unobservables.
            if label == 'TREATED & UNTREATED':
                np.testing.assert_array_equal(df.Y[df.D == 1], df.U1[df.D == 1])
                np.testing.assert_array_equal(df.Y[df.D == 0], df.U0[df.D == 0])

            # Relationships that have to hold in every setup.
            np.testing.assert_array_equal(df.Y[df.D == 1], df.Y1[df.D == 1])
            np.testing.assert_array_equal(df.Y[df.D == 0], df.Y0[df.D == 0])
            np.testing.assert_array_almost_equal(df.V, (df.UC - df.U1 + df.U0))