if gender == "" or genderdata[userID]["gender"] == gender:
                        x.append(cnt)
                        y.append(weekdata[str(day)][userID]["RW"])
            cnt += 1
        if str(day) in weekdata and str(day) == str(changedate):
            change_cnt = cnt
            print()
            print(
                str(change_cnt) + ": " + str(day) +
                " (excluded week as cut point for treatment)")
            cnt += 1

# Pair every collected week index (x) with its outcome value (y) in one frame.
data = pd.DataFrame(dict(y=y, x=x))

# Data-driven bandwidth selection is left disabled; the `bandwidth` value
# already in scope is what the truncation below uses.
#bandwidth_opt = rdd.optimal_bandwidth(data['y'], data['x'], cut=change_cnt)
#logging.info("Optimal bandwidth:" + str(bandwidth_opt))

# Restrict the sample around the cut point (the week index stored in
# `change_cnt`) using the chosen bandwidth.
data_rdd = rdd.truncated_data(data, 'x', bandwidth, cut=change_cnt)

# Report how many observations each week contributes to the model.
print()
print("Number of observations per week in this model: ")
print(data_rdd['x'].value_counts())
print()
print()

# Build the regression discontinuity model, then fit it and show the summary.
model = rdd.rdd(data_rdd, 'x', 'y', cut=change_cnt)
print()
fit_result = model.fit()
print(fit_result.summary())
print()

# Log the wall-clock duration measured from `log_starttime`.
log_endtime = datetime.datetime.now()
log_runtime = log_endtime - log_starttime
logging.info("Total runtime: " + str(log_runtime))
The result of the multiple regression looks counterintuitive. How can the sign of the treatment variable change?

Let's look at the data from another perspective. The graph "Naive Comparison" is a scatterplot of all municipalities individually. It is hard to see any pattern or trend.

Let's plot the same graph, but with municipalities aggregated into 29 bins based on similarity of the outcome variable high school completion. These bins are the blue circles in the graph below. The size of each circle is proportional to the number of municipalities used to calculate the mean value of high school completion.

If you look carefully near the cut-off (vertical red line), where the variable Islamic win margin = 0, you will see a discontinuity or a jump in the level of high school completion. 

# Library for Regression Discontinuity
!pip install rdd

from rdd import rdd

# Aggregate the data in 29 bins
# `threshold` is the cut-off of the running variable 'iwm94'
# (Islamic win margin); it is reused for the vertical marker below.
threshold = 0
data_rdd = rdd.truncated_data(df, 'iwm94', 0.99, cut=threshold)
data_binned = rdd.bin_data(data_rdd, 'hischshr1520f', 'iwm94', 29)

# Labels
plt.title('Comparison using aggregate data (Bins)')
plt.xlabel('Islamic win margin')
plt.ylabel('Female aged 15-20 with high school')

# Scatterplot: one hollow blue marker per bin, sized by the bin's 'n_obs'
plt.scatter(data_binned['iwm94'], data_binned['hischshr1520f'],
    s = data_binned['n_obs'], facecolors='none', edgecolors='blue')

# Red Vertical Line at the cut-off. It is drawn from `threshold` rather than
# a literal 0 so the marker cannot drift from the cut used to truncate.
plt.axvline(x=threshold, color='red')

plt.show()
Exemplo n.º 3
0
# Gather the y, x, w1 and w2 series into a single frame for the checks below.
data = pd.DataFrame(dict(y=y, x=x, w1=w1, w2=w2))

# TEST optimal_bandwidth()
print("optimal_bandwidth() tests:")
# 0 means no failure was recorded; 1 means the bandwidth check failed.
flag_optimal_bandwidth = 0

h = rdd.optimal_bandwidth(data['y'], data['x'], cut=1)
# Compare against the expected bandwidth at five decimal places.
if np.round(h, 5) != .75117:
    flag_optimal_bandwidth = 1
    print("\tFAIL: value of h is wrong")
else:
    print("\tNo Failures")

# TEST truncated_data()

data_rdd = rdd.truncated_data(data, 'x', h, cut=1)

# TEST rdd()

model = rdd.rdd(data_rdd, 'x', 'y', cut=1)

fit_result = model.fit()
print(fit_result.summary())

# TEST bin_data()

data_binned = rdd.bin_data(data, 'y', 'x', 100)

plt.figure()
plt.scatter(data_binned['x'],
            data_binned['y'],
            s=data_binned['n_obs'],
Exemplo n.º 4
0
# Simulate N observations in which units with x at or above `threshold`
# receive a treatment that adds 0.5 to the outcome.
N = 10000
threshold = 1

# Random draws are made in the order x, epsilon, w1, w2.
x = np.random.normal(1, 1, N)
epsilon = np.random.normal(0, 1, N)
w1 = np.random.normal(0, 1, N)
w2 = np.random.normal(0, 4, N)

# 1 where the running variable reaches the threshold, 0 otherwise.
treatment = (x >= threshold).astype(int)
# w2 is generated but does not enter the outcome equation.
y = 0.5 * treatment + 2 * x - 0.2 * w1 + 1 + epsilon

data = pd.DataFrame(dict(y=y, x=x, w1=w1, w2=w2))
# The preview below is neither assigned nor printed.
data.head()

# Choose the bandwidth from the data, then keep the sample around the cut.
bandwidth_opt = rdd.optimal_bandwidth(data['y'], data['x'], cut=threshold)

print("Optimal bandwidth:", bandwidth_opt)
data_rdd = rdd.truncated_data(data, 'x', bandwidth_opt, cut=threshold)

#x = running variable
#y = outcome variables
model = rdd.rdd(data_rdd, 'x', 'y', cut=threshold)
fit_result = model.fit()
print(fit_result.summary())

# Read the Stata dataset and write a CSV copy of it.
# NOTE(review): both paths are absolute and machine-specific.
df_RD = pd.read_stata(
    r'/Users/gopaljuneja/Desktop/Microenterprise_Kenya/113714-V1/App2017-0042_data/datasets/RD_Dataset.dta'
)
df_RD.to_csv('/Users/gopaljuneja/Desktop/Reproduced_MEK/RD_Dataset.csv')

# Presumably the bandwidths and cut-off used by later analysis — TODO confirm.
bw100, bw150, bw200 = 100, 150, 200
threshold = 1