def do_km(name, time, censor, split, outdir):
    """Fit a survival curve per group with R and save the plot as a PNG.

    The three input sequences are converted to R integer vectors, combined
    into an R data frame and passed to ``survival::survfit`` with the formula
    ``Surv(time, censor) ~ split``. The plot is written to
    ``<outdir>/<name>_km.png`` (512 x 512 pixels).

    Args:
        name: Prefix of the output file name.
        time: Clean (pre-processed) sequence of times; coerced to integers.
        censor: Clean sequence of censoring/event indicators; coerced to
            integers.
        split: Clean sequence of group indicators; coerced to integers. The
            legend labels ('<= Mean', '> Mean') suggest a two-level split
            around a mean -- presumably computed by the caller; verify.
        outdir: Directory the PNG is written to.

    Returns:
        None. The only effect is the PNG file.
    """
    data = {
        'time': robjects.IntVector(np.array(time)),
        'censor': robjects.IntVector(np.array(censor)),
        'split': robjects.IntVector(np.array(split)),
    }
    df = robjects.DataFrame(data)

    surv = importr('survival')
    grdevices = importr('grDevices')
    km = surv.survfit(robjects.Formula('Surv(time, censor) ~ split'), data=df)

    line_colors = robjects.StrVector(['Red', 'Blue'])
    grdevices.png(file=os.path.join(outdir, name + '_km.png'),
                  width=512, height=512)
    try:
        # NOTE(review): the y axis is labelled 'Cumulative Hazard' although no
        # `fun` argument is passed to plot -- confirm the label matches what
        # is actually drawn.
        r.plot(km, xlab='Time', ylab='Cumulative Hazard', col=line_colors)
        # NOTE(review): the legend is anchored at the fixed coordinates
        # (1000, 1); confirm this suits the time range of the data.
        r.legend(1000, 1, robjects.StrVector(['<= Mean', '> Mean']),
                 lty=robjects.IntVector([1, 1]), col=line_colors)
    finally:
        # Always close the device, otherwise a failed plot leaves it open and
        # later drawing calls would go to this file.
        grdevices.dev_off()
def val(self):
    """Estimate value functions with b-splines and compare the two policies.

    Both policies' value data (``self.policy_a['value']`` and
    ``self.policy_b['value']``) are passed to ``self.spline_est`` together
    with a grid of 1000 ranks between 1 and 194. Three PDFs are written to
    the parent directory of ``self.out_dir``:

    * ``value.pdf`` -- both fitted curves plus the underlying points,
    * ``value_diff.pdf`` -- ``fit_b - fit_a``,
    * ``value_percent_diff.pdf`` -- ``(fit_b - fit_a) / fit_a``.

    The fits are then re-evaluated at the ``OverallRank`` values of the
    year-2013 rows of ``data/lawData.csv`` (located two directory levels
    above this file) and the summed difference is printed and returned.

    Returns:
        The sum of ``fit_b - fit_a`` over the 2013 ranks (printed as
        "Change in Producer Surplus").
    """
    plot_dir = os.path.dirname(self.out_dir)

    new_data = pd.DataFrame({'OverallRank': np.linspace(1, 194, 1000)})
    grid = new_data['OverallRank']
    fit_a = self.spline_est(self.policy_a['value'], new_data)
    fit_b = self.spline_est(self.policy_b['value'], new_data)
    arr_a = np.array(fit_a)
    arr_b = np.array(fit_b)

    # Every device is closed in a `finally` so that a failing plot call does
    # not leave a PDF device open for subsequent drawing.
    r.pdf(os.path.join(plot_dir, 'value.pdf'))
    try:
        r.plot(grid, fit_a, type='l', xlab='Rank_M', ylab='V(Rank)')
        r.lines(grid, fit_b, col='red')
        r.points(self.policy_a['value']['OverallRank'],
                 self.policy_a['value']['val'], col='black')
        r.points(self.policy_b['value']['OverallRank'],
                 self.policy_b['value']['val'], col='red')
        r.legend('topright', np.array(['No Info', 'Info']),
                 lty=np.array([1, 1]), col=np.array(['black', 'red']))
    finally:
        r('dev.off()')

    diff = arr_b - arr_a
    r.pdf(os.path.join(plot_dir, 'value_diff.pdf'))
    try:
        r.plot(grid, diff, type='l', xlab='Rank',
               ylab='V(Rank|info=1) - V(Rank|info=0)')
        r.abline(h=0, lty=2)
    finally:
        r('dev.off()')

    diff = (arr_b - arr_a) / arr_a
    r.pdf(os.path.join(plot_dir, 'value_percent_diff.pdf'))
    try:
        r.plot(grid, diff, type='l', xlab='Rank',
               ylab='(V(Rank|info=1) - V(Rank|info=0)) / V(Rank|info=0)')
        r.abline(h=0, lty=2)
    finally:
        r('dev.off()')

    # Re-evaluate both fits at the ranks observed in 2013.
    data_path = dirname(dirname(__file__))
    data_path = join(data_path, 'data', 'lawData.csv')
    data = pd.read_csv(data_path)
    new_data = deepcopy(data.loc[data['year'] == 2013, 'OverallRank'])
    # NOTE: an earlier version padded the ranks with zeros up to
    # lc.N_SCHOOLS entries; that padding is disabled.
    new_data = pd.DataFrame({'OverallRank': np.array(new_data)})
    fit_a = self.spline_est(self.policy_a['value'], new_data)
    fit_b = self.spline_est(self.policy_b['value'], new_data)

    diff = np.sum(np.array(fit_b) - np.array(fit_a))
    pdiff = diff / np.sum(fit_a)
    print(" - Change in Producer Surplus: {0}".format(diff))
    print(" - Percent change in Producer Surplus: {0}".format(pdiff))
    return diff
def draw_survival_curves(feature, surv, assignment=None, filename='tmp.png',
                         show=False, title=True, labels=None,
                         colors=['blue', 'red'], ann=None, show_legend=True,
                         q=.25, std=None):
    """Draw survival curves for ``feature`` into a PNG, one panel per group.

    For every distinct value of ``assignment`` (a single panel when it is
    None) the matching slice of ``feature`` is plotted with
    ``survival.survfit`` using the formula ``Surv(days, event) ~ feature``.

    Args:
        feature: pandas Series to stratify by. Boolean features are mapped to
            the strings 'True'/'False'. When ``get_vec_type`` reports 'real'
            and there are more than 10 distinct values, each slice is passed
            through ``to_quants(..., q=q, std=std)`` before plotting
            (presumably binning into quantile groups -- verify against
            ``to_quants``).
        surv: Survival data exposing 'days' and 'event' via ``.ix[:, ...]``.
            The x axis is labelled in years and legend positions divide the
            maximum of 'days' by 365.
        assignment: Optional Series that splits the observations into panels.
        filename: Path of the PNG to write.
        show: When truthy, return ``Show(filename)``.
        title: Accepted for compatibility; not used by this function.
        labels: Legend labels. When None and there are on average more than
            10 observations per distinct value, the R-sorted unique feature
            values are used and ``colors`` is replaced by a 7-colour palette.
        colors: Line colours. The default list is never mutated here, only
            rebound.
        ann: 'p' to annotate each panel with the log-rank p-value, any other
            non-None value to draw that value as text, None for nothing.
        show_legend: True for a legend inside the plot, 'out' for a legend to
            the right of it, anything else for no legend.
        q: Quantile passed to ``to_quants``; also used to build the default
            labels for real-valued features.
        std: Passed through to ``to_quants``.

    Returns:
        ``Show(filename)`` when ``show`` is truthy, otherwise None.
    """
    if assignment is None:
        num_panels = 1
        assignment = feature.map(lambda s: 1)
        name = lambda v: str(feature.name) if feature.name is not None else ''
    else:
        num_panels = len(assignment.unique())
        name = lambda v: str(assignment.name) + ' = ' + str(v)

    if (labels is None) and ((len(feature) / feature.nunique()) > 10):
        labels = r.sort(r.c(*feature.unique()))  # R sorts bad
        colors = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black']
    if feature.dtype == 'bool':
        feature = feature.map({True: 'True', False: 'False'})

    r.png(filename=filename, width=200 * (num_panels + 1), height=300, res=75)
    try:
        fmla = robjects.Formula('Surv(days, event) ~ feature')
        r.par(mfrow=r.c(1, num_panels))
        r.par(mar=r.c(4, 5, 4, 1))
        r.par(xpd=True)

        if (get_vec_type(feature) == 'real') and (len(feature.unique()) > 10):
            colors = ['blue', 'orange', 'red']
            if q == .5:
                labels = ['Bottom 50%', 'Top 50%']
            else:
                labels = ['Bottom {}%'.format(int(q * 100)), 'Normal',
                          'Top {}%'.format(int(q * 100))]

        # Built once, after `colors` has its final value; shared by the
        # curves and the legend.
        ls = r.c(*colors)

        def plot_me(sub_f, label):
            """Plot one panel for the slice ``sub_f`` titled ``label``."""
            if (get_vec_type(sub_f) == 'real') and (len(sub_f.unique()) > 10):
                sub_f = to_quants(sub_f, q=q, std=std)
            m = get_cox_ph(surv, sub_f, formula=fmla)
            # Third element of the model's stored call is handed to survfit
            # as its data argument -- presumably the data frame the model was
            # fitted on; verify against get_cox_ph.
            r_data = m.rx2('call')[2]
            p = log_rank(sub_f, surv)['p']
            r.plot(survival.survfit(fmla, r_data), lty=1, col=ls, lwd=4,
                   cex=1.25, xlab='Years to Event', ylab='Survival')
            r.title(label, cex=3.)
            if ann == 'p':
                r.text(.2, 0, labels='logrank p = {0:.1e}'.format(p), pos=4)
            elif ann is not None:
                r.text(0, labels=ann, pos=4)

        if show_legend == 'out':
            # Widen the right margin to make room for the outside legend.
            r.par(xpd=True, mar=r.c(4, 5, 5, 8))

        # NOTE(review): `.ix` was removed in pandas 1.0; it is kept here to
        # preserve the exact selection semantics -- confirm the pandas
        # version this runs on.
        for value in sorted(assignment.ix[feature.index].dropna().unique()):
            f = feature.ix[assignment[assignment == value].index]
            if len(f.unique()) > 1:
                plot_me(f, name(value))

        # `== True` (not `is True`) on purpose: `show_legend` may also be the
        # string 'out'.
        # NOTE(review): the inside legend is positioned from the event rate
        # of the last group iterated (`value` left over from the loop, and
        # unbound if there were no groups); confirm this is intended.
        if show_legend == True:
            mean_s = surv.ix[:, 'event'].ix[
                assignment[assignment == value].index].mean()
            if mean_s < .5:
                r.legend(surv.ix[:, 'days'].max() * .05 / 365., .45, labels,
                         lty=1, col=ls, lwd=3, bty='o')
            else:
                r.legend(surv.ix[:, 'days'].max() * .4 / 365, .9, labels,
                         lty=1, col=ls, lwd=3, bty='o')
        elif show_legend == 'out':
            r.legend(surv.ix[:, 'days'].max() * 1.1 / 365, .9, labels,
                     lty=1, col=ls, lwd=3, bty='o')
    finally:
        # Close the PNG device even when plotting fails, so it does not stay
        # open and capture later drawing calls.
        r('dev.off()')

    if show:
        return Show(filename)
def draw_survival_curves(feature, surv, assignment=None, filename='tmp.png',
                         show=False, title=True, labels=None,
                         colors=['blue', 'red'], ann=None, show_legend=True,
                         q=.25, std=None):
    """Draw survival curves for ``feature`` into a PNG, one panel per group.

    For every distinct value of ``assignment`` (a single panel when it is
    None) the matching slice of ``feature`` is plotted with
    ``survival.survfit`` using the formula ``Surv(days, event) ~ feature``.

    Args:
        feature: pandas Series to stratify by. Boolean features are mapped to
            the strings 'True'/'False'. When ``get_vec_type`` reports 'real'
            and there are more than 10 distinct values, each slice is passed
            through ``to_quants(..., q=q, std=std)`` before plotting
            (presumably binning into quantile groups -- verify against
            ``to_quants``).
        surv: Survival data exposing 'days' and 'event' via ``.ix[:, ...]``.
            The x axis is labelled in years and legend positions divide the
            maximum of 'days' by 365.
        assignment: Optional Series that splits the observations into panels.
        filename: Path of the PNG to write.
        show: When truthy, return ``Show(filename)``.
        title: Accepted for compatibility; not used by this function.
        labels: Legend labels. When None and there are on average more than
            10 observations per distinct value, the R-sorted unique feature
            values are used and ``colors`` is replaced by a 7-colour palette.
        colors: Line colours. The default list is never mutated here, only
            rebound.
        ann: 'p' to annotate each panel with the log-rank p-value, any other
            non-None value to draw that value as text, None for nothing.
        show_legend: True for a legend inside the plot, 'out' for a legend to
            the right of it, anything else for no legend.
        q: Quantile passed to ``to_quants``; also used to build the default
            labels for real-valued features.
        std: Passed through to ``to_quants``.

    Returns:
        ``Show(filename)`` when ``show`` is truthy, otherwise None.
    """
    if assignment is None:
        num_panels = 1
        assignment = feature.map(lambda s: 1)
        name = lambda v: str(feature.name) if feature.name is not None else ''
    else:
        num_panels = len(assignment.unique())
        name = lambda v: str(assignment.name) + ' = ' + str(v)

    if (labels is None) and ((len(feature) / feature.nunique()) > 10):
        labels = r.sort(r.c(*feature.unique()))  # R sorts bad
        colors = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black']
    if feature.dtype == 'bool':
        feature = feature.map({True: 'True', False: 'False'})

    r.png(filename=filename, width=200 * (num_panels + 1), height=300, res=75)
    try:
        fmla = robjects.Formula('Surv(days, event) ~ feature')
        r.par(mfrow=r.c(1, num_panels))
        r.par(mar=r.c(4, 5, 4, 1))
        r.par(xpd=True)

        if (get_vec_type(feature) == 'real') and (len(feature.unique()) > 10):
            colors = ['blue', 'orange', 'red']
            if q == .5:
                labels = ['Bottom 50%', 'Top 50%']
            else:
                labels = ['Bottom {}%'.format(int(q * 100)), 'Normal',
                          'Top {}%'.format(int(q * 100))]

        # Built once, after `colors` has its final value; shared by the
        # curves and the legend.
        ls = r.c(*colors)

        def plot_me(sub_f, label):
            """Plot one panel for the slice ``sub_f`` titled ``label``."""
            if (get_vec_type(sub_f) == 'real') and (len(sub_f.unique()) > 10):
                sub_f = to_quants(sub_f, q=q, std=std)
            m = get_cox_ph(surv, sub_f, formula=fmla)
            # Third element of the model's stored call is handed to survfit
            # as its data argument -- presumably the data frame the model was
            # fitted on; verify against get_cox_ph.
            r_data = m.rx2('call')[2]
            p = log_rank(sub_f, surv)['p']
            r.plot(survival.survfit(fmla, r_data), lty=1, col=ls, lwd=4,
                   cex=1.25, xlab='Years to Event', ylab='Survival')
            r.title(label, cex=3.)
            if ann == 'p':
                r.text(.2, 0, labels='logrank p = {0:.1e}'.format(p), pos=4)
            elif ann is not None:
                r.text(0, labels=ann, pos=4)

        if show_legend == 'out':
            # Widen the right margin to make room for the outside legend.
            r.par(xpd=True, mar=r.c(4, 5, 5, 8))

        # NOTE(review): `.ix` was removed in pandas 1.0; it is kept here to
        # preserve the exact selection semantics -- confirm the pandas
        # version this runs on.
        for value in sorted(assignment.ix[feature.index].dropna().unique()):
            f = feature.ix[assignment[assignment == value].index]
            if len(f.unique()) > 1:
                plot_me(f, name(value))

        # `== True` (not `is True`) on purpose: `show_legend` may also be the
        # string 'out'.
        # NOTE(review): the inside legend is positioned from the event rate
        # of the last group iterated (`value` left over from the loop, and
        # unbound if there were no groups); confirm this is intended.
        if show_legend == True:
            mean_s = surv.ix[:, 'event'].ix[
                assignment[assignment == value].index].mean()
            if mean_s < .5:
                r.legend(surv.ix[:, 'days'].max() * .05 / 365., .45, labels,
                         lty=1, col=ls, lwd=3, bty='o')
            else:
                r.legend(surv.ix[:, 'days'].max() * .4 / 365, .9, labels,
                         lty=1, col=ls, lwd=3, bty='o')
        elif show_legend == 'out':
            r.legend(surv.ix[:, 'days'].max() * 1.1 / 365, .9, labels,
                     lty=1, col=ls, lwd=3, bty='o')
    finally:
        # Close the PNG device even when plotting fails, so it does not stay
        # open and capture later drawing calls.
        r('dev.off()')

    if show:
        return Show(filename)