# NOTE(review): this is the tail of a longer script that has been collapsed onto a
# single line. The enclosing loop header(s) over `day` / `userID` are not visible
# here, so the nesting of the statements below must be confirmed against the
# original file before re-indenting.
#
# What the visible statements do, in order:
#   * When the `gender` filter is "" or equals genderdata[userID]["gender"], append
#     the running counter `cnt` to `x` and weekdata[str(day)][userID]["RW"] to `y`.
#   * Increment `cnt`.
#   * When str(day) is a key of `weekdata` and equals str(changedate), remember the
#     current `cnt` as `change_cnt`, print it together with the day, and increment
#     `cnt` again (the printed message calls this the excluded cut-point week).
#   * Build a DataFrame with columns 'y' and 'x', pass it to rdd.truncated_data with
#     the externally defined `bandwidth` and cut=change_cnt, print the value counts
#     of column "x", then build rdd.rdd(...) on the result and print the fitted
#     model summary.
#   * Log the elapsed time since `log_starttime` (defined outside this view).
# The optimal-bandwidth computation is present only as commented-out code.
if gender == "" or genderdata[userID]["gender"] == gender: x.append(cnt) y.append(weekdata[str(day)][userID]["RW"]) cnt += 1 if str(day) in weekdata and str(day) == str(changedate): change_cnt = cnt print() print( str(change_cnt) + ": " + str(day) + " (excluded week as cut point for treatment)") cnt += 1 data = pd.DataFrame({'y': y, 'x': x}) #bandwidth_opt = rdd.optimal_bandwidth(data['y'], data['x'], cut=change_cnt) #logging.info("Optimal bandwidth:" + str(bandwidth_opt)) data_rdd = rdd.truncated_data(data, 'x', bandwidth, cut=change_cnt) print() print("Number of observations per week in this model: ") print(data_rdd["x"].value_counts()) print() print() model = rdd.rdd(data_rdd, 'x', 'y', cut=change_cnt) print() print(model.fit().summary()) print() log_endtime = datetime.datetime.now() log_runtime = (log_endtime - log_starttime) logging.info("Total runtime: " + str(log_runtime))
The result of the multiple regression looks counterintuitive. How can the sign of the treatment variable change? Let's look at the data from another perspective. The graph "Naive Comparison" is a scatterplot of all municipalities individually. It is hard to see any patterns or trends. Let's plot the same graphic, but with municipalities aggregated in 29 bins based on similarity of the outcome variable, high school completion. These bins are the blue circles in the graphic below. The size of each circle is proportional to the number of municipalities used to calculate the mean value of high school completion. If you look carefully near the cut-off (vertical red line), where the variable Islamic win margin = 0, you will see a discontinuity, or jump, in the level of high school completion. # Library for Regression Discontinuity !pip install rdd from rdd import rdd # Aggregate the data in 29 bins threshold = 0 data_rdd = rdd.truncated_data(df, 'iwm94', 0.99, cut=threshold) data_binned = rdd.bin_data(data_rdd, 'hischshr1520f', 'iwm94', 29) # Labels plt.title('Comparison using aggregate data (Bins)') plt.xlabel('Islamic win margin') plt.ylabel('Female aged 15-20 with high school') # Scatterplot plt.scatter(data_binned['iwm94'], data_binned['hischshr1520f'], s = data_binned['n_obs'], facecolors='none', edgecolors='blue') # Red Vertical Line plt.axvline(x=0, color='red') plt.show()
# NOTE(review): fragment of a test script for the `rdd` package, collapsed onto a
# single line and cut off mid-statement (the final plt.scatter( call continues past
# this view). `y`, `x`, `w1` and `w2` are defined outside the visible text.
#
# Visible steps, in order:
#   * Assemble `data` from y, x, w1, w2.
#   * optimal_bandwidth test: compute h = rdd.optimal_bandwidth(data['y'], data['x'], 1)
#     and print a FAIL line when round(h, 5) != .75117, otherwise "No Failures".
#     Presumably the inputs are generated with a fixed random seed so that this
#     exact value is reproducible -- TODO confirm against the unseen setup code.
#   * truncated_data / rdd tests: pass `data` to rdd.truncated_data with bandwidth h
#     and cut=1, build rdd.rdd on the result and print the fitted summary (these two
#     steps are only exercised, not asserted).
#   * bin_data test: call rdd.bin_data(data, 'y', 'x', 100) and start a scatter
#     plot of the binned 'x' vs 'y' with marker size taken from 'n_obs'.
data = pd.DataFrame({'y': y, 'x': x, 'w1': w1, 'w2': w2}) # TEST optimal_bandwidth() print("optimal_bandwidth() tests:") flag_optimal_bandwidth = 0 h = rdd.optimal_bandwidth(data['y'], data['x'], 1) if np.round(h, 5) != .75117: print("\tFAIL: value of h is wrong") flag_optimal_bandwidth = 1 if flag_optimal_bandwidth == 0: print("\tNo Failures") # TEST truncated_data() data_rdd = rdd.truncated_data(data, 'x', h, cut=1) # TEST rdd() model = rdd.rdd(data_rdd, 'x', 'y', cut=1) print(model.fit().summary()) # TEST bin_data() data_binned = rdd.bin_data(data, 'y', 'x', 100) plt.figure() plt.scatter(data_binned['x'], data_binned['y'], s=data_binned['n_obs'],
# --- Simulated sharp regression-discontinuity example -----------------------
# Draw order of the random variates (x, epsilon, w1, w2) is kept fixed so that
# a seeded run produces the same data.
N = 10000
x = np.random.normal(1, 1, N)          # running variable
epsilon = np.random.normal(0, 1, N)    # noise term added to the outcome
threshold = 1
treatment = np.where(x >= threshold, 1, 0)  # 1 at or above the cut, else 0
w1 = np.random.normal(0, 1, N)
w2 = np.random.normal(0, 4, N)         # generated but not part of y

# Outcome: treatment effect of 0.5 on top of a linear trend in x and w1.
y = 0.5 * treatment + 2 * x - 0.2 * w1 + 1 + epsilon

data = pd.DataFrame(dict(y=y, x=x, w1=w1, w2=w2))
data.head()

# Pick the bandwidth from the data, then keep the frame returned by
# rdd.truncated_data for that bandwidth around the threshold.
bandwidth_opt = rdd.optimal_bandwidth(data['y'], data['x'], cut=threshold)
print("Optimal bandwidth:", bandwidth_opt)
data_rdd = rdd.truncated_data(data, 'x', bandwidth_opt, cut=threshold)

# 'x' is the running variable, 'y' the outcome variable.
model = rdd.rdd(data_rdd, 'x', 'y', cut=threshold)
print(model.fit().summary())

# --- Convert the RD Stata dataset to CSV ------------------------------------
df_RD = pd.io.stata.read_stata(
    r'/Users/gopaljuneja/Desktop/Microenterprise_Kenya/113714-V1/App2017-0042_data/datasets/RD_Dataset.dta'
)
df_RD.to_csv('/Users/gopaljuneja/Desktop/Reproduced_MEK/RD_Dataset.csv')

# Candidate bandwidths and the cut-off for the code that follows.
bw100, bw150, bw200 = 100, 150, 200
threshold = 1