Example #1
def random_t(y, x1, V, N_0, N_1, p_0, p_1, seed, D):
    # Set random number generator's seed
    np.random.seed(seed)

    # Calculate sample size
    N = y.shape[0]

    # Generate treatment indicator
    # Draw random normals
    W = np.random.normal(size=N)

    # Set the treatment indicator to 1 for a random share p_0 of the people in
    # small villages
    W[~V] = (W[~V].argsort() + 1 <= p_0 * N_0)

    # Set the treatment indicator to 1 for a random share p_1 of the people in
    # large villages
    W[V] = (W[V].argsort() + 1 <= p_1 * N_1)

    # Make the treatment indicator into a proper vector
    W = np.array(W, ndmin=2).transpose()

    # Generate RHS data
    X = np.concatenate((x1, W, D), axis=1)

    # Estimate regression, get coefficients and t-statistics
    b, _, t = ols(y, X, cov_est='hmsd', get_p=False)

    # Return t-statistic and coefficient for treatment dummy
    return t[1, 0], b[1, 0]
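
A minimal usage sketch for random_t(), assuming the function above and the project's ols() from linreg.py are in scope; the data, village split, and treatment probabilities below are made up purely for illustration:

import numpy as np

# Hypothetical setup: 100 units, the first 40 in large villages, an intercept
# as the only extra regressor, and no additional dummies (D has zero columns)
N = 100
V = np.zeros(N, dtype=bool)
V[:40] = True
N_1, N_0 = int(V.sum()), int((~V).sum())
y = np.random.normal(size=(N, 1))
x1 = np.ones(shape=(N, 1))
D = np.zeros(shape=(N, 0))

# Build a randomization distribution of the t-statistic by re-drawing the
# treatment assignment with different seeds
draws = [random_t(y, x1, V, N_0, N_1, .5, .5, seed=s, D=D)
         for s in range(1000)]
t_dist = np.array([t for t, b in draws])
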
Example #2
def random_t_cls(N, y, x1, V_1, Vid, Vid_0, Vid_1, J_0, J_1, N_0, N_1, p_0,
                 p_1, seed, D):
    # Set random number generator's seed
    np.random.seed(seed)

    # Calculate sample size
    N = y.shape[0]

    # Generate treatment indicator
    # Draw random normals
    W = np.random.normal(size=N)

    # Get the treatment indicators for people in small villages
    W_0 = W[~V_1]

    # Get the treatment indicators for people in large villages
    W_1 = W[V_1]

    # Go through all small villages
    for i in range(J_0):
        # Count how many people there are in the village
        nv = sum(Vid_0[:, 0] == i)

        # Set the treatment indicator to 1 for a random share p_0 of the
        # village
        W_0[Vid_0[:, 0] == i] = (
            W_0[Vid_0[:, 0] == i].argsort() + 1 <= nv * p_0)

    # Go through all large villages
    for i in range(J_1):
        # Count how many people there are in the village
        nv = sum(Vid_1[:, 0] == i)

        # Set the treatment indicator to 1 for a random share p_1 of the
        # village
        W_1[Vid_1[:, 0] == i] = (
            W_1[Vid_1[:, 0] == i].argsort() + 1 <= nv * p_1)

    # Get the old treatment indicator back
    W[~V_1] = W_0
    W[V_1] = W_1

    # Make the treatment indicator into a proper vector
    W = np.array(W, ndmin=2).transpose()

    # Generate RHS data
    X = np.concatenate((x1, W, D), axis=1)

    # Estimate regression, get coefficients and t-statistics
    b, _, t = ols(y, X, cov_est='cluster', clustvar=Vid, get_p=False)

    # Return t-statistic and coefficient for treatment dummy
    return t[1, 0], b[1, 0]
Example #3
def run_simulation(corr,
                   means,
                   vars,
                   T,
                   sampsis,
                   tprobs,
                   nparts,
                   nsimul,
                   nrdmax,
                   dfdef=1,
                   locdef=0,
                   scaledef=1,
                   cov_est='hmsd',
                   beta_Z=.2,
                   postau=1,
                   nest=4,
                   cnum=0,
                   prec=4,
                   sups=True,
                   mlw=110,
                   getresults=False,
                   tex=True,
                   fnamepref='results_'):
    # Inputs
    # corr: 2-element tuple, specified correlation between X and Y0, and X and
    #       tau
    # means: 3-element vector, specified means for X, Y0, and tau
    # vars: 3-element vector, specified variance for X, and variances for eps_Y0
    #       and eps_tau, see the note below
    # T: scalar, number of tuples in the simulated data
    # sampsis: vector, different sizes for random samples to draw
    # tprobs: vector, different treatment probabilities for each sample size
    # nparts: scalar, number of partitions on X
    # nsimul: scalar, number of simulations to run
    # nrdmax: scalar, maximum number of iterations to use for randomization
    #         distributions
    # dfdef: scalar, default degrees of freedom for chi2 distribution of Y0 if
    #        corr(X,Y0) = 0
    # locdef: scalar, default location parameter for Gumbel distribution of tau
    #         if corr(X,tau) = 0
    # scaledef: scalar, default scale parameter for Gumbel distribution of tau
    #         if corr(X,tau) = 0
    # cov_est: string, specifies the covariance estimator to use for the OLS
    #          estimation
    # beta_Z: scalar, used in the construction of Y0 and tau (see the note
    #         below)
    # postau: integer, position of the estimate of tau (the coefficient on the
    #         treatment dummy) in all models to be estimated
    # nest: integer, number of models to be estimated
    # cnum: integer, index of the correlation pair for the current simulation in
    #       the vector of correlation pairs (necessary to set the random number
    #       generator's seed to run this in parallel)
    # prec: integer, precision for floating point number printing in results
    # sups: boolean, if true, numbers which are too small to be displayed at
    #       the selected printing precision will be printed as zero
    # mlw: integer, maximum line width for printing results
    # getresults: boolean, if true, the function returns the results as a pandas
    #             DataFrame (usually unnecessary, since it also prints them and
    #             can provide tex tables, see below)
    # tex: boolean, if true, saves results as tex tables
    # fnamepref: string, prefix for file names for tex tables (only matters if
    #            tex is true)
    #
    # Outputs
    # results: DataFrame, contains the results
    #
    # Note
    # To generate the three variables I need, I start with X as an exponentially
    # distributed random variable. Then, I generate the other two variables
    # based on that. Let Z denote any of them. I want to achieve
    #
    # Corr(X,Z) = Cov(X,Z) / sqrt(Var(X) Var(Z)) = gamma                     (1)
    #
    # for some gamma. I can generate
    #
    # Z = alpha + beta_Z*X + Z_eps                                           (2)
    #
    # where Z_eps is an error term, if you will. Expanding Cov(X,Z) and
    # plugging in (2) yields Cov(X,Z) = beta_Z*Var(X). Also, taking the
    # variance of (2), I have Var(Z) = beta_Z^2*Var(X) + Var(Z_eps). Plugging
    # both of these into (1) gives
    #
    # beta_Z = sqrt( (Var(Z_eps) / Var(X)) * (gamma^2 / (1 - gamma^2)) )
    #
    # and since I get to choose beta_Z, I can thereby generate random
    # variables with arbitrary correlation structure. I can then use alpha to
    # adjust the mean of the generated variable.
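    #
    # As a quick, purely illustrative check of this formula (all numbers made
    # up): with Var(X) = 1, Var(Z_eps) = 1, and a target gamma = 0.5, it gives
    # beta_Z = sqrt((1/1) * (0.25/0.75)) ~= 0.577, so that Var(Z) = 0.577^2 + 1
    # = 4/3 and Corr(X,Z) = 0.577 * 1 / sqrt(1 * 4/3) = 0.5, as desired.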

    # Set seed (since this will be run in parallel, it's actually important to
    # set the seed within the function, rather than outside)
    np.random.seed(666 + cnum)

    # Get the scale for the exponential distribution of X (an exponential
    # distribution with scale b has mean b and variance b^2)
    scale_X = np.sqrt(vars[0])

    # Generate X as an exponentially distributed random variable (subtract the
    # mean of that distribution, which is scale_X, to make sure to hit the
    # specified mean for X)
    X = means[0] - scale_X + np.random.exponential(scale_X, size=(T, 1))

    # Let Y0_eps have a chi2 distribution
    if corr[0] != 0:
        # Calculate the necessary beta if there has to be a correlation
        beta_Y0 = (np.sqrt(
            (vars[1] / vars[0]) * ((corr[0]**2) / (1 - corr[0]**2))))
    else:
        # Otherwise, set it to zero
        beta_Y0 = 0

    # Calculate the degrees of freedom implied by this variance (this comes
    # from the fact that for a chi2(k) random variable, its variance is
    # equal to 2k)
    df_Y0 = .5 * vars[1]

    # Calculate Y0, where I need to make sure to correct for the mean of
    # the error term (the mean of a chi2(k) is simply k)
    Y0 = (means[1] - df_Y0 + beta_Y0 * X +
          np.random.chisquare(df_Y0, size=(T, 1)))

    # Let tau_eps have a Gumbel distribution
    if corr[1] != 0:
        # Calculate the necessary beta if there has to be a correlation
        beta_tau = (np.sqrt(
            (vars[2] / vars[0]) * ((corr[1]**2) / (1 - corr[1]**2))))
    else:
        # Otherwise, it's zero
        beta_tau = 0

    # Calculate the implied scale for the Gumbel distribution (a
    # Gumbel(0,b) random variable has variance b^2 (pi^2/6))
    scale_tau = np.sqrt((6 / (np.pi**2)) * vars[2])

    # Calculate tau, correcting for the fact that a Gumbel(0,b) random
    # variable has mean gb, where g is the Euler-Mascheroni constant)
    tau = (means[2] - np.euler_gamma * scale_tau + beta_tau * X +
           np.random.gumbel(0, scale_tau, size=(T, 1)))

    # Get the partition of X. First, X[:, 0].argsort().argsort() gets the rank
    # of each observation in the distribution of X. Then, multiplying by
    # nparts/T converts the ranks into fractions of the length of X. Taking
    # the ceil() makes sure that the groups are between 1 and nparts. The +1
    # is necessary because of Python's zero indexing, which leads to the
    # lowest rank being zero, and ceil(0) = 0 when it should be equal to 1.
    P = np.ceil((X[:, 0].argsort().argsort() + 1) * nparts / T)

    # Set up a set of dummies for each but one group in the partition. Since P
    # is a list, each of the checks creates a list of ones and zeros which
    # indicate whether an element of P is equal to the current i. When
    # np.array() is applied to this list of lists, it stacks them as rows of a
    # matrix. This creates an nparts - 1 by T matrix of indicator dummies. The
    # transpose converts it into a more conventional format. The last group in
    # the partition is omitted.
    D = np.array([P == i + 1 for i in range(nparts - 1)], ndmin=2).transpose()

    # Make an array to store the mean treatment effect estimates and mean
    # standard errors. This needs one row for each combination of sample size
    # and treatment probability, two columns for each estimation, two columns
    # for the true tau and its standard deviation, and an extra two columns
    # for the sample size and treatment probability. (That makes it easier to
    # print the results later.)
    tau_hats_avg = np.zeros(shape=(len(sampsis) * len(tprobs), 4 + nest * 2))

    # Go through all sample sizes
    for nsampsi, N in enumerate(sampsis):
        # Record sample size indicator in the mean estimate array
        tau_hats_avg[nsampsi * 2:nsampsi * 2 + 2, 0] = N

        # Draw random variables as the basis for a random sample of units
        I = np.random.normal(size=T)

        # Go through all groups in the partition
        for i in range(nparts):
            # Get the number of people in the group
            ngroup = sum(P == i + 1)

            # Figure out how many people to sample in this group (at least 2,
            # otherwise the treatment assignment by group will fail)
            nsamp = max(np.floor(ngroup * N / T), 2)

            # Get the sample inclusion indicator for the current group: the
            # comparison of the (zero-indexed, hence the +1) .argsort() values
            # of the random draws against nsamp picks a random subset of nsamp
            # units from the group for the sample.
            I[P == i + 1] = (I[P == i + 1].argsort() + 1) <= nsamp

        # The above mechanism could assign too few or too many units to
        # the sample. Calculate that discrepancy, as an integer.
        discrepancy = int(N - sum(I))

        # Check whether the discrepancy is positive
        if discrepancy >= 0:
            # If so, iterate over all 'missing' units
            for i in range(discrepancy):
                # Make a temporary vector containing all units not in the sample
                temp = I[I == 0]

                # Pick a random integer index in that vector, and assign that
                # unit to the sample
                temp[np.random.randint(0, len(temp))] = 1

                # Replace the sample assignment vector with the temporary one,
                # which means one more unit has now been added to the sample
                # at random.
                I[I == 0] = temp
        else:
            # If too many units were assigned, then the parameters for this
            # problem are badly set. Just print an error message.
            print(
                'Error: Between the number of tuples, the number of groups in ',
                'the partition, and the sample sizes, it is impossible to ',
                'assign at least two units from each group to the sample. ',
                'Please adjust the parameters. (This occurred at N = ',
                N,
                '.)',
                sep='')

        # Annoyingly, the data type of I will now be float. To be used as an
        # index, it has to be boolean or integer. I find it easiest to convert
        # it to boolean by just checking where it isn't zero.
        I = (I != 0)

        # Make an intercept for this sample size
        beta0 = np.ones(shape=(N, 1))

        # Go through all treatment probabilities
        for nprob, p in enumerate(tprobs):
            # Record treatment probability in the mean estimate array
            tau_hats_avg[nsampsi * 2 + nprob, 1] = p

            # I'll need to know how many draws of treatment vectors would be
            # needed to get the exact randomization distribution for this
            # treatment probability and sample size. For now, just set that up
            # as 1.
            nrdexact = 1

            # Set up an empty array to store the estimated tau_hat and its
            # standard error for each of the models, for each of the
            # simulations. (Each row is a given simulation, and each pair of
            # columns is for a given tau_hat and its standard error.)
            tau_hats = np.zeros(shape=(nsimul, nest * 2))

            # Go through all simulations for the current set of parameters
            for s in range(nsimul):
                # Draw random variables as basis for treatment indicator
                W = np.random.normal(size=(N, 1))

                # Go through all groups in the partition
                for i in range(nparts):
                    # Get number of people in the group n
                    ngroup = sum(P[I] == i + 1)

                    # Get number of treated units k
                    ntreat = max(np.floor(p * ngroup), 1)

                    # Get the treatment indicator for the current group: the
                    # comparison of the (zero-indexed, hence the +1) .argsort()
                    # values of the random draws against ntreat assigns a
                    # random subset of ntreat units in the group to treatment.
                    W[P[I] == i + 1, 0] = (
                        W[P[I] == i + 1, 0].argsort() + 1 <= ntreat)

                    # Check whether this is the first group and the first
                    # simulation. If so, do the calculations required for the
                    # number of draws in the randomization distribution. It's
                    # convenient to do this now, since it saves some loops later
                    # on.
                    if s == 0 and i == 0:
                        # If so, calculate n choose k for this group, and save
                        # the result. (I originally did this by hand using
                        # factorials, but using this has the nice side effect
                        # of being evaluated as np.inf (positive infinity) in
                        # case this runs into overflow issues, whereas my code
                        # would result in a NaN, which I would then manually
                        # have to change into an Inf.)
                        nrdexact = binomial(ngroup, ntreat)
                    elif s == 0:
                        # If it's the first simulation but not the first group,
                        # get n choose k, and multiply it by the number of
                        # possible assignments of all other groups calculated
                        # so far
                        nrdexact = nrdexact * binomial(ngroup, ntreat)

                # Generate observed outcome for the simulation regressions
                Yobs = Y0[I, :] + tau[I, :] * W

                # Generate RHS data sets for the simulation regressions
                # The first data set is just an intercept and a treatment dummy
                Z1 = np.concatenate((beta0, W), axis=1)

                # The second data set contains the covariate X
                Z2 = np.concatenate((beta0, W, X[I, :]), axis=1)

                # The third data set also includes the partition dummies
                Z3 = np.concatenate((beta0, W, D[I, :]), axis=1)

                # The fourth data set also includes an interaction between the
                # treatment dummy and the partition dummies
                Z4 = np.concatenate(
                    (beta0, W, D[I, :],
                     (W @ np.ones(shape=(1, nparts - 1))) * D[I, :]),
                    axis=1)

                # Estimate the first three regression models and store the
                # estimates in the tau_hats array, in row s
                for i, Z in enumerate([Z1, Z2, Z3]):
                    # Estimate the model
                    beta_hat, S_hat = ols(Yobs, Z, cov_est=cov_est)

                    # Store the estimates. The row index is easy. For the column
                    # index, it's important to remember Python's zero indexing,
                    # and how it assigns elements to indices. This maps counter
                    # i to index [j,k] as
                    #
                    # 0 -> [0,2], 1 -> [2,4], ...
                    #
                    # and for any given index [j,k], Python will try to assign
                    # contents to the elements j,j+1,...,k-1, but not to k
                    # itself. Therefore, this gets me the right indices for a
                    # two element assignment.
                    tau_hats[s, 2 * i:2 * i + 2] = (
                        beta_hat[postau, 0], np.sqrt(S_hat[postau, postau]))

                # For the saturated model, I need to get the average treatment
                # effect. First, estimate the model.
                beta_hat, S_hat = ols(Yobs, Z4, cov_est=cov_est)

                # Set up a vector of linear constraints on tau
                L = np.zeros(shape=(beta_hat.shape))

                # Replace the element corresponding to the base effect with
                # one,
                # since every group in the partition has this as part of their
                # estimated effect
                L[postau, 0] = 1

                # Go through all groups in the partition for which there are
                # dummies in D
                for i in range(nparts - 1):
                    # Get the number of treated units
                    ntreat = sum(W[:, 0])

                    # Get the number of treated units in this group
                    ntreatgroup = sum((P[I] == i + 1) * (W[:, 0] == 1))

                    # Replace the corresponding element of L with the
                    # probability of being in this group, conditional on being
                    # a treated unit. The position of that element is equal to
                    # the length of beta_hat minus the number of groups in the
                    # partition minus one plus the number of the group under
                    # consideration. That is
                    #
                    # beta_hat.shape[0]-(nparts-1)+i
                    # = beta_hat.shape[0]-nparts+i+1
                    #
                    # remembering that due to Python's zero indexing, the number
                    # of the group is i+1, not i.
                    L[beta_hat.shape[0] - nparts + i + 1, 0] = (
                        ntreatgroup / ntreat)

                # Calculate the average treatment effect for the saturated
                # model
                tau_hat_avg_satu = L.transpose() @ beta_hat

                # Calculate the estimated variance
                S_hat_satu = L.transpose() @ S_hat @ L

                # Store the estimate and its standard error
                tau_hats[s, 2 * (nest - 1):] = (tau_hat_avg_satu[0, 0],
                                                np.sqrt(S_hat_satu[0, 0]))

            # Store the average estimates and standard errors for all models,
            # for the current sample size and treatment probability
            tau_hats_avg[nsampsi * 2 + nprob, 4:] = np.mean(tau_hats, axis=0)

            # Set up an array to store the randomization distribution of
            # tau_hat (approximated with at most nrdmax simulation draws if
            # getting the exact distribution is not feasible)
            tau_true = np.zeros(shape=(int(np.minimum(nrdexact, nrdmax)), 1))

            # Check whether the number of iterations required to get the exact
            # randomization distribution exceeds the maximum allowable number
            # of iterations
            if nrdexact <= nrdmax:
                # If so, set up an empty list
                A = []

                # Go through all groups in the partition
                for i in range(nparts):
                    # Get number of people in the group n
                    ngroup = sum(P[I] == i + 1)

                    # Get number of treated units k
                    ntreat = int(max(np.floor(p * ngroup), 1))

                    # Get all assignment vectors for this group, and add them
                    # to the list
                    A.append(combinations(range(ngroup), ntreat))

                # Get the Cartesian product of the assignment vectors for all
                # groups. Note that the asterisk matters, because it unpacks
                # A, which is a list of lists, before getting the product.
                # (Otherwise, product() would just iterate over A itself,
                # which is a single list of lists, instead of combining the
                # assignment vectors across groups.)
                A = product(*A)

                # Go through all possible assignment vectors
                for s, a in enumerate(list(A)):

                    # Set up treatment assignment as a vector of zeros
                    W = np.zeros(shape=(N, 1))

                    # Go through all groups in the partition
                    for i in range(nparts):
                        # Get the assignment vector for that group
                        temp = W[P[I] == i + 1]

                        # Replace it with one as appropriate
                        temp[a[i], 0] = 1

                        # Replace the assignment vector
                        W[P[I] == i + 1] = temp

                    # Generate observed outcome for this assignment
                    Yobs = Y0[I, :] + tau[I, :] * W

                    # Put together the RHS variables
                    Z1 = np.concatenate((beta0, W), axis=1)

                    # Run the regression
                    beta_hat_simp = ols(Yobs, Z1, get_cov=False)

                    # Store the result
                    tau_true[s, 0] = beta_hat_simp[postau, 0]
            else:
                # If getting the exact randomization distribution is too
                # computationally intensive, go through the maximum number of
                # allowable iterations
                for s in range(nrdmax):
                    # Here, the treatment assignment is just as for the
                    # simulations above
                    # Draw random variables as basis for treatment indicator
                    W = np.random.normal(size=(N, 1))

                    # Go through all groups in the partition
                    for i in range(nparts):
                        # Get number of people in the group n
                        ngroup = sum(P[I] == i + 1)

                        # Get number of treated units k
                        ntreat = max(np.floor(p * ngroup), 1)

                        # Get the treatment indicator for the current group:
                        # as above, comparing the .argsort() values of the
                        # random draws against ntreat assigns a random subset
                        # of ntreat units in the group to treatment.
                        W[P[I] == i + 1, 0] = (
                            W[P[I] == i + 1, 0].argsort() + 1 <= ntreat)

                    # Generate observed outcome for this assignment
                    Yobs = Y0[I, :] + tau[I, :] * W

                    # Put together the RHS variables
                    Z1 = np.concatenate((beta0, W), axis=1)

                    # Run the regression
                    beta_hat_simp = ols(Yobs, Z1, get_cov=False)

                    # Store the result
                    tau_true[s, 0] = beta_hat_simp[postau, 0]

            # Store the expected value of tau
            tau_hats_avg[nsampsi * 2 + nprob, 2] = np.mean(tau_true, axis=0)

            # Store the standard deviation of tau
            tau_hats_avg[nsampsi * 2 + nprob, 3] = np.std(tau_true, axis=0)

    # Set display options (has to be done within each function if this runs in
    # parallel)
    pd.set_option('display.max_columns', tau_hats_avg.shape[1])
    pd.set_option('display.width', mlw)
    pd.set_option('display.precision', prec)

    # Make a header line for the results, starting with the basic parameters
    firstline = ['N', 'p', 'tau', 'SD']

    # Use Python's amazing list comprehension to make a list that goes,
    # [tau_hat 1, SE 1, tau_hat 2, SE 2, ...]
    firstline.extend(x for i in range(nest)
                     for x in ['tau_hat ' + str(i + 1), 'SE ' + str(i + 1)])

    # Put the results in a pandas DataFrame
    results = pd.DataFrame(data=tau_hats_avg, columns=firstline)

    # Make sure sample sizes are stored as integers
    results['N'] = results['N'].astype(int)

    # Get the variances for Y0 and tau to display them
    d_var_Y0 = (beta_Y0**2) * vars[0] + vars[1]
    d_var_tau = (beta_tau**2) * vars[0] + vars[2]

    # Print the results
    print('Correlations: corr(X,Y0) = ',
          corr[0],
          ', corr(X,tau) = ',
          corr[1],
          '\n',
          'Variances: V[Y0] = ',
          np.around(d_var_Y0, prec),
          ', V[tau] = ',
          np.around(d_var_tau, prec),
          '\n',
          results,
          '\n',
          sep='')

    # Check whether to export to latex
    if tex:
        # Save the results as a tex table
        results.to_latex(fnamepref + str(cnum) + '.tex', index=False)

    # If desired, return results DataFrame
    if getresults:
        return results
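
As a quick, self-contained check of the correlation construction described in the note inside run_simulation() above (the target correlation, variances, and sample size here are made up, and normal draws are used since the formula does not depend on the shape of the distributions):

import numpy as np

# Target correlation between X and Z, and the chosen variances (made up)
gamma, var_X, var_eps = .6, 2, 1

# beta_Z implied by the formula in the note
beta_Z = np.sqrt((var_eps / var_X) * (gamma**2 / (1 - gamma**2)))

# Simulate X and Z = beta_Z*X + Z_eps, and check the sample correlation
np.random.seed(0)
X = np.random.normal(scale=np.sqrt(var_X), size=10**6)
Z = beta_Z * X + np.random.normal(scale=np.sqrt(var_eps), size=10**6)
print(np.corrcoef(X, Z)[0, 1])  # should be close to .6
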
Example #4
def permute_p(Y,
              Isamp,
              ntreat,
              balvars,
              prank,
              X=None,
              Z=None,
              seed=1,
              Breg=10,
              breg_icept=True,
              cov_est='hmsd',
              order='F',
              shape=None):
    # Inputs
    # Y: [N,M] matrix, data for each of the M outcomes in the family
    # Isamp: [N,1] vector, estimation sample to use. (If treatment assignment is
    #        at a higher level than the estimation, e.g. because the estimation
    #        is for a follow-up sample, this function will reassign treatment at
    #        the higher level, but use only the subsample that survives to
    #        follow-up for any estimations.)
    # ntreat: scalar, number of treated units
    # balvars: [N,B] matrix, data for each of the B balancing variables
    # prank: [M,1] list-like (one dimensional), original order of the p-values,
    #        from smallest to largest
    # X: [N,D] matrix, data for covariates to include when estimating
    #    regressions, but without saving their p-values
    # Z: [N,E] matrix, data for covariates of interest, will be included in the
    #    estimations, and their p-values will be recorded
    # seed: scalar, random number generator's seed
    # Breg: scalar, number of balancing regressions to use
    # breg_icept: boolean, if true, balancing regressions will include an
    #             intercept
    # cov_est: string, covariance estimator to use (see ols() in linreg.py for
    #          available options)
    # order: string, one of 'C', 'F', or 'A', specifies how to flatten and
    #        reshape the input p-values (does not really matter)
    # shape: tuple, shape into which to reshape the p-values before outputting
    #
    # Outputs
    # pstar: vector or matrix, shape depends on whether Z is included and
    #        whether shape was specified; permutation p-values for one
    #        iteration of the free step-down randomization

    # Set random number generator's seed
    np.random.seed(seed)

    # Get total sample size N and number of outcome variables M
    N, M = Y.shape

    # Figure out the index of the parameter of interest
    if X is None and Z is None:
        # If no other RHS variables are being used, it's just the first element
        # of the estimates vector
        cidx = [0]

        # Make an [N,0] placeholder array for X, so that the missing-value
        # checks and concatenations below still work
        X = np.zeros(shape=(N, 0))
    elif X is not None and Z is None:
        # If RHS variables are being inserted before it, figure out how many,
        # and use the next index
        cidx = [X.shape[1]]
    else:
        # If RHS variables of interest are included after it, include their
        # indices as well
        if X is None:
            # If X wasn't specified, just make an [N,0] placeholder array for
            # it, so the missing-value checks and concatenations below work
            X = np.zeros(shape=(N, 0))

        # Get the coefficient indices
        cidx = [X.shape[1]] + [z + X.shape[1] + 1 for z in range(Z.shape[1])]

    # Get number of tests
    T = M * len(cidx)

    # Set up vector of treatment assignments for this iteration
    W = np.zeros(shape=(N, 1))

    # Set up a place to save the smallest maximum t-statistic recorded so far
    tmax = np.inf

    # Go through all balancing regressions
    for b in range(Breg):
        # Get new treatment assignment, by drawing randomly from a standard
        # normal distribution, getting the rank (adjusting by +1 to account for
        # Python's zero indexing), and assigning everyone with a rank equal to
        # or below the number of treated units to treatment
        Wb = np.random.normal(size=(N, 1))
        Wb = np.array((Wb[:, 0].argsort() + 1 <= ntreat), ndmin=2).transpose()

        # Set up vector of t-statistics for this treatment assignment
        t = np.zeros(shape=(balvars.shape[1], 1))

        # Go through all balancing variables
        for i in range(balvars.shape[1]):
            # Get indices of non-missing observations for that variable
            I = ~np.isnan(balvars[:, i])

            # Make an array of balancing variables
            Xbv = np.array(balvars[I, i], ndmin=2).transpose()

            # Check whether an intercept needs to be added
            if breg_icept:
                # If so, combine the balancing variable with an intercept
                Xbv = (np.concatenate((np.ones(shape=(np.sum(I), 1)), Xbv),
                                      axis=1))

            # Run OLS of treatment assignment on balancing variable, get the
            # t-statistic
            _, _, tb, _ = ols(Wb[I, :], Xbv, cov_est=cov_est)

            # Allocate correct t-statistic to the vector of t-statistics
            if breg_icept:
                # If there is an intercept, use the second one
                t[i, 0] = tb[1, 0]
            else:
                # Otherwise, there is only one to use
                t[i, 0] = tb[0, 0]

        # Check whether the largest absolute value of any t-statistic across
        # the balancing regressions is at most the smallest maximum recorded
        # so far. Note
        # that np.minimum() and np.maximum() take exactly two arrays as inputs,
        # and compute the element-wise min or max. Things get funky once they
        # compare inputs of two different shapes. To get the min or max of the
        # elements of a single array, I have to use np.amax() or np.amin()
        # instead.
        if np.amax(np.abs(t[:, 0])) <= tmax:
            # If so, save the new minmax t-statistic
            tmax = np.amax(np.abs(t[:, 0]))

            # Save the treatment assignment
            W = Wb

    # Set up a vector of p-values, one for each outcome variable and coefficient
    # of interest
    pstar = np.zeros(shape=(T, 1))

    # Go through all members in the family
    for i in range(T):
        # Make an index of where both the member and all parts of X are not NaN,
        # and only get units in the estimation sample, i.e. where Isamp == 1
        I = (~np.isnan(Y[:, i]) & ~np.isnan(X.sum(axis=1)) & Isamp)

        # Get the number of effective observations
        n = I.sum()

        # Get outcome variable, i.e. the current member of the family, for those
        # observations with non-missing data for both LHS and RHS variables
        y = np.array(Y[I, i], ndmin=2).transpose()

        # Make a matrix of RHS variables. If X was specified, add it to the
        # treatment assignment
        if X is not None:
            Xstar = np.concatenate((X[I], W[I]), axis=1)
        else:
            Xstar = W[I]

        # If Z was specified, add it in after the treatment assignment
        if Z is not None:
            Xstar = np.concatenate((Xstar, Z[I]), axis=1)

        # Run OLS
        _, _, _, p = ols(y, Xstar, cov_est=cov_est)

        # Save only p-values of interest
        pstar[i, :] = p[cidx]
    # Reorder p-values in the original order (lowest to highest)
    pstar_reord = pstar[prank]

    # Go through all p-values
    for i in range(T):
        # Replace each with the minimum over itself and all p-values that were
        # originally ranked above it (the step-down part of the adjustment)
        pstar_reord[i, :] = np.amin(pstar_reord[i:, :])

    # Put p-values back into the order of hypotheses being tested
    for sorti, origi in enumerate(prank):
        pstar[origi] = pstar_reord[sorti]

    # Reshape the result if desired
    if shape is not None:
        pstar = pstar.reshape(shape, order=order)

    # Return the adjusted p-values
    return pstar
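
A minimal sketch of how the per-iteration output of permute_p() might be aggregated into adjusted p-values. The objects Y, Isamp, ntreat, balvars, prank, and X are assumed to exist as documented above, p_unadj is assumed to hold the original unadjusted p-values in the same order as the rows of pstar, and the aggregation rule shown is just one common choice, not necessarily the one used in the original project:

import numpy as np

# Number of iterations of the free step-down randomization (made up)
niter = 1000
counts = np.zeros(shape=p_unadj.shape)

for s in range(niter):
    # One iteration of the free step-down randomization
    pstar = permute_p(Y, Isamp, ntreat, balvars, prank, X=X, seed=s)

    # Count how often the permutation p-value is at most the original one
    counts = counts + (pstar <= p_unadj)

# The adjusted p-values are the resulting rejection frequencies
p_adj = counts / niter
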
Example #5
    for i in range(M):
        # Make an index of where both the member and all parts of X are not NaN
        I = (~np.isnan(Y[:, i]) & ~np.isnan(Xresitt.sum(axis=1)))

        # Get the number of effective observations
        n = I.sum()

        # Save it for printing later
        N[i, 0] = n

        # Get outcome variable, i.e. the current member of the family, for those
        # observations with non-missing data for both LHS and RHS variables
        y = np.array(Y[I, i], ndmin=2).transpose()

        # Run OLS
        bhat, Vhat, _, p = ols(y, Xresitt[I], cov_est='hmsd')

        # Save p-values
        p_unadj[i, :] = p[cidx, 0]

        # Save point estimates for coefficients of interest
        b[i, :] = bhat[cidx, 0]

        # Save standard errors
        SE[i, :] = np.sqrt(np.diag(Vhat[cidx, cidx]))

    ############################################################################
    ### Part 5.1: Bonferroni, Holm-Bonferroni
    ############################################################################

    # Calculate Bonferroni-adjusted p-values
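
The fragment above breaks off before the Bonferroni and Holm-Bonferroni calculations it announces; the following is a generic sketch of those two adjustments applied to an [M,1] vector p_unadj of unadjusted p-values (illustrative only, not necessarily the original code):

import numpy as np

# Number of hypotheses in the family
M = p_unadj.shape[0]

# Bonferroni: scale every p-value by the number of tests and cap at one
p_bonf = np.minimum(p_unadj * M, 1)

# Holm-Bonferroni: scale the k-th smallest p-value by (M - k) for k = 0,...,
# M-1, enforce monotonicity with a running maximum, cap at one, and undo the
# sorting
order = np.argsort(p_unadj[:, 0])
p_holm = np.empty(shape=(M, 1))
p_holm[order, 0] = np.minimum(
    np.maximum.accumulate(p_unadj[order, 0] * (M - np.arange(M))), 1)
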
Example #6
                         index=['y', 'stat', 'Constant'] + list(Xvars_sw))

# Go through all switching measures
for i, measure in enumerate(switching_measures_reg):
    # Get the variable in question
    var = switching_measures_reg[measure]

    # Make the LHS variable into a column vector
    y = larry(insurance_data_red[var])

    # Figure out where y and X are both not missing
    I = ~np.isnan(y[:, 0]) & ~np.isnan(X_sw.sum(axis=1))

    # Run OLS
    bhat, _, _, p = ols(y[I, :],
                        X_sw[I, :],
                        cov_est='cluster',
                        clustvar=clusters[I, :])

    # Add outcome name to results DataFrame
    switchreg.iloc[0, 2 * i:2 * i + 2] = measure

    # Add column labels for beta_hat and p-value
    switchreg.iloc[1, 2 * i] = 'b'
    switchreg.iloc[1, 2 * i + 1] = 'p'

    # Add results
    switchreg.iloc[2:, 2 * i] = bhat[:, 0]
    switchreg.iloc[2:, 2 * i + 1] = p[:, 0]

# Set outcomes and beta_hat / p-values as headers for switching results
switchreg = switchreg.T.set_index(['y', 'stat']).T
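
The last line's transpose / set_index / transpose pattern turns the first two rows of the results frame into a two-level column header; here is a toy illustration with made-up numbers, independent of the objects above:

import pandas as pd

# Toy frame in the same layout: the first two rows hold the future column
# labels
toy = pd.DataFrame([['m1', 'm1', 'm2', 'm2'],
                    ['b', 'p', 'b', 'p'],
                    [.1, .04, -.2, .3]],
                   index=['y', 'stat', 'x'])

# Transpose, promote the 'y' and 'stat' rows to the index, and transpose back,
# so the columns become an (outcome, statistic) MultiIndex
toy = toy.T.set_index(['y', 'stat']).T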