def roll_out(hps, Env, model, sess, s, e, seed):
    ''' Performs a simple rollout, starting in state `s`.

    Steps the environment until the rollout buffer reports a terminal state or
    `hps.max_roll_len` steps have been stored.

    Args:
        hps: hyperparameters; reads `hps.policy` ('thompson', 'egreedy' or
            'ucb') and `hps.max_roll_len`.
        Env: environment whose `step(action)` returns `(s1, r, terminal, info)`.
        model: reads `model.action_discrete` and, for 'egreedy', `model.epsilon`.
        sess: session used to evaluate `model.epsilon`; also forwarded to the
            policy functions.
        s: start state. It is indexed as `s[None, :]`, so presumably an array
            with at least one axis (confirm against callers).
        e: kept for interface compatibility. The value passed in is never read:
            the 'egreedy' branch overwrites it with `sess.run(model.epsilon)`.
        seed: seed reused for every step, or None to draw a fresh pair of
            random integers per step.

    Returns:
        Tuple `(rollout_data, s)`: the filled PartialRollout and the last
        state that was reached.

    Raises:
        ValueError: if `hps.policy` is not one of the supported policies.
    '''
    rollout_data = PartialRollout()
    if seed is not None:
        rollout_data.seed = seed

    while (not rollout_data.terminal) and (rollout_data.t < hps.max_roll_len):
        # Reuse the caller's seed when given, otherwise sample a local seed.
        if seed is not None:
            local_seed = seed
        else:
            local_seed = [np.random.randint(1e15), np.random.randint(1e15)]

        # Policy
        if hps.policy == 'thompson':
            a = thompson_policy(s[None, :], model, sess, hps, seed=local_seed)
        elif hps.policy == 'egreedy':
            e = sess.run(model.epsilon)
            a = egreedy_policy(s[None, :], model, sess, hps, e=e,
                               seed=local_seed)
        elif hps.policy == 'ucb':
            a = ucb_policy(s[None, :], model, sess, hps, seed=local_seed)
        else:
            # Previously this fell through and failed on the unbound `a` below.
            raise ValueError(
                'Unsupported hps.policy: {!r}'.format(hps.policy))
        a = a[0]  # the policy was queried with a single state

        # Steps
        s1, r, terminal, info = Env.step(
            correct_action_dim(a, model.action_discrete))
        s1 = correct_dim(s1)
        rollout_data.add(s, a, [r], terminal)
        s = s1

    rollout_data.add_last_state(s)  # add the last state
    return rollout_data, s
def offpolicy_argmax(sess, model, sb, action_dim, seed, hps, off_policy_on_mean):
    ''' Returns the action index for the off-policy decision.

    Args:
        sess: session forwarded to the policy function.
        model: model forwarded to the policy function.
        sb: state batch forwarded unchanged to the policy function.
        action_dim: not used by this function; kept for interface compatibility.
        seed: seed forwarded to the policy function.
        hps: hyperparameters; `hps.policy` selects 'thompson', 'egreedy' or 'ucb'.
        off_policy_on_mean: important parameter. If True, the mean of the
            output is considered; it is passed as `eval_on_mean_output` to the
            'thompson' and 'ucb' policies. The 'egreedy' branch ignores it and
            always acts with epsilon 0.0.

    Returns:
        Whatever the selected policy function returns for `sb`.

    Raises:
        ValueError: if `hps.policy` is not one of the supported policies.
    '''
    policy = hps.policy
    if policy == 'thompson':
        return thompson_policy(sb, model, sess, hps, seed,
                               eval_on_mean_output=off_policy_on_mean,
                               eval_on_mean_params=False)
    if policy == 'egreedy':
        return egreedy_policy(sb, model, sess, hps, e=0.0, seed=seed)
    if policy == 'ucb':
        return ucb_policy(sb, model, sess, hps, seed,
                          eval_on_mean_output=off_policy_on_mean,
                          eval_on_mean_params=False)
    # Previously an unknown policy ended in an UnboundLocalError on `return a`.
    raise ValueError('Unsupported hps.policy: {!r}'.format(policy))
def evaluate(Env, hps, model, sess):
    ''' Runs `hps.n_eval` evaluation episodes and averages the rewards.

    Args:
        Env: environment providing `reset()`, `step(action)` returning
            `(s1, r, terminal, info)` and, when `hps.visualize` is set,
            `render()`.
        hps: hyperparameters; reads `n_eval`, `max_ep_len`, `policy`,
            `fix_seed_per_roll_out`, `visualize`, `eval_on_mean_output` and
            `eval_on_mean_params`.
        model: model forwarded to the policy functions; `model.action_dim` is
            passed to `correct_action_dim`.
        sess: session forwarded to the policy functions.

    Returns:
        Tuple `(np.mean(R_ep), np.mean(R_av))`: the mean over episodes of the
        summed reward, and the mean over episodes of the per-step average
        reward. Both are NaN when `hps.n_eval` is 0.

    Raises:
        ValueError: if `hps.policy` is not one of the supported policies.
    '''
    R_ep = []
    R_av = []
    for _ in range(hps.n_eval):
        s = correct_dim(Env.reset())
        R = 0
        n_steps = 0
        if hps.fix_seed_per_roll_out:
            seed = [np.random.randint(1e15), np.random.randint(1e15)]
        else:
            seed = None

        for _step in range(hps.max_ep_len):
            # Reuse the episode seed when fixed, otherwise sample a local seed.
            if seed is not None:
                local_seed = seed
            else:
                local_seed = [np.random.randint(1e15),
                              np.random.randint(1e15)]

            if hps.policy == 'thompson':
                a = thompson_policy(s[None, :], model, sess, hps, local_seed,
                                    hps.eval_on_mean_output,
                                    hps.eval_on_mean_params)
            elif hps.policy == 'egreedy':
                # Evaluation is greedy: epsilon is forced to zero.
                a = egreedy_policy(s[None, :], model, sess, hps, e=0.0,
                                   seed=local_seed)
            elif hps.policy == 'ucb':
                # Unlike the thompson branch, the eval_on_mean_* flags are not
                # forwarded here.
                a = ucb_policy(s[None, :], model, sess, hps, local_seed)
            else:
                raise ValueError(
                    'Unsupported hps.policy: {!r}'.format(hps.policy))
            a = a[0]  # the policy was queried with a single state

            # NOTE(review): the other rollout helpers in this file pass
            # `model.action_discrete` here; confirm `model.action_dim` is
            # intended.
            s1, r, terminal, info = Env.step(
                correct_action_dim(a, model.action_dim))
            if hps.visualize:
                Env.render()
            s = correct_dim(s1)
            R += r
            n_steps += 1
            if terminal:
                break

        R_ep.append(R)
        # Count steps explicitly: the loop variable is unbound when
        # `hps.max_ep_len` is 0, which used to raise a NameError here.
        R_av.append(R / max(n_steps, 1))
    return np.mean(R_ep), np.mean(R_av)
def update(self, sess, model, hps, ep):
    ''' Redraws all panels with sampled predictive densities and saves the figure.

    For each of `hps.n_rep_visualize` repetitions a new seed is drawn, the
    network parameters and means are queried for the stored state/action
    batches (`self.sb`, `self.ab`), and one density curve (or histogram, or
    vertical line for deterministic output) is drawn per panel. Afterwards the
    mean over repetitions and the value from `analytic_sd` are annotated, each
    axis in `self.ax` gets a green (matches `self.truth`) or red frame, and
    the figure is written to `hps.result_dir + 'episode_<ep>'`.

    Args:
        sess: session forwarded to the network / policy helpers.
        model: model forwarded to the helpers; `model.transformer.plot_edges`
            and `model.transformer.means` are read for categorical output.
        hps: hyperparameters; reads `n_rep_visualize`, `p_dropout`, `output`
            ('gaussian', 'mog', 'deterministic' or 'categorical'), `n_mix` and
            `result_dir`.
        ep: episode number used in the saved file name.
    '''
    # clear plots
    for ax in self.pl:
        ax.clear()

    overall_means = np.zeros([hps.n_rep_visualize, self.n])
    overall_max_dens = np.ones([self.n]) * -np.inf
    for k in range(hps.n_rep_visualize):
        # get prediction parameters
        seed = [np.random.randint(1e15), np.random.randint(1e15)]  # new seed
        params = get_net_params(sess, model, self.sb, self.ab, seed,
                                hps.p_dropout)
        means = get_net_mean(sess, model, self.sb, self.ab, seed,
                             hps.p_dropout, output=hps.output)
        overall_means[k, :] = means[:, 0]

        # need to determine range
        if hps.output != 'categorical':
            if hps.output == 'gaussian':
                mu = params[:, 0]
                sigma = params[:, 1]
            elif hps.output == 'mog':
                mu = params[:, hps.n_mix:(hps.n_mix * 2)]
                sigma = params[:, (2 * hps.n_mix):(3 * hps.n_mix)]
            elif hps.output == 'deterministic':
                mu = params[:, 0]
                sigma = 1.0
            max_sd = np.max(sigma)
            lower, upper = np.min(mu) - 3 * max_sd, np.max(mu) + 3 * max_sd
        else:
            lower = model.transformer.plot_edges[0]
            upper = model.transformer.plot_edges[-1]

        # update all plots
        x = np.linspace(lower, upper, 100)
        for i in range(self.n):
            param = params[i, :]
            if hps.output == 'deterministic':
                # A point prediction is drawn as a dotted vertical line.
                max_dens = 1.0
                overall_max_dens[i] = 1.0
                mean = means[i]
                self.pl[i].plot([mean, mean], [0, max_dens], ':')
            else:
                if hps.output == 'gaussian' or hps.output == 'mog':
                    if hps.output == 'gaussian':
                        dens = norm.pdf(x, param[0], param[1])
                    elif hps.output == 'mog':
                        # Weighted sum of the mixture component densities.
                        dens = [
                            param[j] * norm.pdf(x, param[hps.n_mix + j],
                                                param[2 * hps.n_mix + j])
                            for j in range(hps.n_mix)
                        ]
                        dens = np.sum(np.array(dens), axis=0)
                    self.pl[i].plot(x, dens, color='cornflowerblue')
                elif hps.output == 'categorical':
                    dens = param
                    edges = model.transformer.plot_edges
                    self.pl[i].hist(model.transformer.means, bins=edges,
                                    weights=dens, color='cornflowerblue')
                # Track the tallest density seen so far for this panel.
                overall_max_dens[i] = np.max(
                    [overall_max_dens[i], np.max(dens)])

    # add the mean
    grand_means = np.mean(np.array(overall_means), axis=0)
    # new seed for parametric uncertainty
    seed = [np.random.randint(1e15), np.random.randint(1e15)]
    grand_sds = analytic_sd(sess, model, self.sb, self.ab, seed,
                            hps.p_dropout, hps.output)

    # get policy estimates
    # NOTE(review): thompson_probs / ucb_probs are computed but not drawn in
    # this method. The calls are kept because the policy helpers are opaque
    # from here; drop them if they are confirmed to be side-effect free.
    s = np.arange(0, int(self.n / 2), 1)[:, None]
    a_thompson = np.array([
        thompson_policy(s, model, sess, hps, seed,
                        eval_on_mean_output=False,
                        eval_on_mean_params=False) for i in range(100)
    ])
    a_ucb = np.array([
        ucb_policy(s, model, sess, hps, seed,
                   eval_on_mean_output=False,
                   eval_on_mean_params=False) for i in range(100)
    ])
    thompson_probs = np.zeros(self.n)
    ucb_probs = np.zeros(self.n)
    for j, (state, action) in enumerate(zip(self.sb, self.ab)):
        thompson_probs[j] = np.mean(a_thompson[:, state, :] == action)
        ucb_probs[j] = np.mean(a_ucb[:, state, :] == action)

    for i in range(self.n):
        grand_mean = grand_means[i]
        max_dens = overall_max_dens[i]
        self.pl[i].plot([grand_mean, grand_mean], [0, max_dens], '--',
                        color='orange')
        # Raw strings: '\m' and '\s' are invalid escape sequences in a normal
        # literal; the rendered text is unchanged.
        self.pl[i].text(0.1, 0.75, r'$\mu$={:0.2f}'.format(grand_mean),
                        transform=self.pl[i].transAxes)
        self.pl[i].text(0.55, 0.75,
                        r'$\sigma$={:0.2f}'.format(grand_sds[i][0]),
                        transform=self.pl[i].transAxes)

    for j in range(int(self.n / 2)):
        for l in range(2):
            # Green frame where `l` equals self.truth[j], red frame otherwise.
            col = 'g' if self.truth[j] == l else 'r'
            self.ax[l, j].add_patch(
                patches.Rectangle((0.01, 0.01), 0.98, 0.98,
                                  linewidth=10,
                                  edgecolor=col,
                                  facecolor='none',
                                  transform=self.ax[l, j].transAxes))
            if j > 0:
                plt.setp(self.ax[l, j].get_yticklabels(), visible=False)
            if l == 0:
                plt.setp(self.ax[l, j].get_xticklabels(), visible=False)
            self.ax[l, j].set_ylim([0, 1.0])
            self.ax[l, j].set_xlim([-2.5, 2.5])

    self.fig.canvas.draw()
    self.fig.savefig(hps.result_dir + 'episode_{}'.format(ep), dpi=300)
    self.fig.canvas.flush_events()
def collect_data(hps, model, sess, Env, e):
    ''' Collects data by running `hps.n_ep_collect` episodes with the current policy.

    Args:
        hps: hyperparameters; reads `n_ep_collect`, `max_ep_len`, `policy`
            and `fix_seed_per_roll_out`.
        model: reads `model.action_discrete` and, for 'egreedy', `model.epsilon`.
        sess: session used to evaluate `model.epsilon`; also forwarded to the
            policy functions.
        Env: environment providing `reset()` and `step(action)` returning
            `(s1, r, terminal, info)`.
        e: kept for interface compatibility. The value passed in is never read:
            the 'egreedy' branch overwrites it with `sess.run(model.epsilon)`.

    Returns:
        Tuple `(t, t_, R_mean, R_sum, data)`: total number of timesteps, and
        per-episode lists of episode length, reward per step, summed reward
        and the PartialRollout objects.

    Raises:
        ValueError: if `hps.policy` is not one of the supported policies.
        Exception: any error raised while selecting an action is printed and
            re-raised.
    '''
    ep = 0  # episode counter
    t = 0  # timestep counter
    t_, R_mean, R_sum, data = [], [], [], []  # data per episode

    while ep < hps.n_ep_collect:
        s = correct_dim(Env.reset())
        if hps.fix_seed_per_roll_out:
            seed = [np.random.randint(1e15), np.random.randint(1e15)]
        else:
            seed = None

        rollout_data = PartialRollout()
        if seed is not None:
            rollout_data.seed = seed

        terminal = False
        while (not terminal) and (rollout_data.t < hps.max_ep_len):
            # Reuse the episode seed when fixed, otherwise sample a local seed.
            if seed is not None:
                local_seed = seed
            else:
                local_seed = [np.random.randint(1e15),
                              np.random.randint(1e15)]

            # Policy
            try:
                if hps.policy == 'thompson':
                    a = thompson_policy(s[None, :], model, sess, hps,
                                        seed=local_seed)
                elif hps.policy == 'egreedy':
                    e = sess.run(model.epsilon)
                    a = egreedy_policy(s[None, :], model, sess, hps, e=e,
                                       seed=local_seed)
                elif hps.policy == 'ucb':
                    a = ucb_policy(s[None, :], model, sess, hps,
                                   seed=local_seed)
                else:
                    raise ValueError(
                        'Unsupported hps.policy: {!r}'.format(hps.policy))
                a = a[0]  # the policy was queried with a single state
            except Exception as err:
                print('hps.policy = {}, s = {} exception = {}'.format(
                    hps.policy, s, err))
                # Re-raise: continuing would step the environment with an
                # unbound or stale action. The handler variable is also no
                # longer named `e`, which shadowed the parameter.
                raise

            # Steps
            s1, r, terminal, info = Env.step(
                correct_action_dim(a, model.action_discrete))
            s1 = correct_dim(s1)
            rollout_data.add(s, a, [r])
            s = s1

        rollout_data.add_last_state(s, terminal)  # add the last state
        data.append(rollout_data)
        ep += 1
        t += rollout_data.t
        R_sum.append(rollout_data.r_sum)
        R_mean.append(rollout_data.r_sum / rollout_data.t)
        t_.append(rollout_data.t)
    return t, t_, R_mean, R_sum, data
def update(self, sess, model, hps, ep):
    ''' Redraws the six state-action panels and saves the figure.

    For each of `hps.n_rep_visualize` repetitions a new seed is drawn, the
    network parameters and means are queried for the stored state/action
    batches (`self.sb`, `self.ab`), and one density curve (or histogram, or
    vertical line for deterministic output) is drawn per panel. Afterwards
    each panel is annotated with the mean over repetitions, the value from
    `analytic_sd`, and the fraction of 100 Thompson / UCB policy calls that
    selected the panel's action. The figure is written to
    `hps.base_result_dir + 'episode_<ep>'`.

    Args:
        sess: session forwarded to the network / policy helpers.
        model: model forwarded to the helpers; `model.transformer.plot_edges`
            and `model.transformer.means` are read for categorical output.
        hps: hyperparameters; reads `n_rep_visualize`, `p_dropout`, `output`
            ('gaussian', 'mog', 'deterministic' or 'categorical'), `n_mix` and
            `base_result_dir`.
        ep: episode number used in the saved file name.
    '''
    # clear plots
    names = [
        '$s_0,a_0$', '$s_0,a_1$', '$s_1,a_0$', '$s_1,a_1$', '$s_2,a_0$',
        '$s_2,a_1$'
    ]
    for i in range(6):
        self.pl[i].clear()
        self.pl[i].set_title(names[i], fontsize=22)

    overall_means = np.zeros([hps.n_rep_visualize, 6])
    overall_max_dens = np.ones([6]) * -np.inf
    for k in range(hps.n_rep_visualize):
        # get prediction parameters
        seed = [np.random.randint(1e15), np.random.randint(1e15)]  # new seed
        params = get_net_params(sess, model, self.sb, self.ab, seed,
                                hps.p_dropout)
        means = get_net_mean(sess, model, self.sb, self.ab, seed,
                             hps.p_dropout, output=hps.output)
        overall_means[k, :] = means[:, 0]

        # need to determine range
        if hps.output != 'categorical':
            if hps.output == 'gaussian':
                mu = params[:, 0]
                sigma = params[:, 1]
            elif hps.output == 'mog':
                mu = params[:, hps.n_mix:(hps.n_mix * 2)]
                sigma = params[:, (2 * hps.n_mix):(3 * hps.n_mix)]
            elif hps.output == 'deterministic':
                mu = params[:, 0]
                sigma = 1.0
            max_sd = np.max(sigma)
            lower, upper = np.min(mu) - 3 * max_sd, np.max(mu) + 3 * max_sd
        else:
            lower = model.transformer.plot_edges[0]
            upper = model.transformer.plot_edges[-1]

        # update all plots
        x = np.linspace(lower, upper, 100)
        for i in range(6):
            param = params[i, :]
            if hps.output == 'deterministic':
                # A point prediction is drawn as a dotted vertical line.
                max_dens = 1.0
                overall_max_dens[i] = 1.0
                mean = means[i]
                self.pl[i].plot([mean, mean], [0, max_dens], ':')
            else:
                if hps.output == 'gaussian' or hps.output == 'mog':
                    if hps.output == 'gaussian':
                        dens = norm.pdf(x, param[0], param[1])
                    elif hps.output == 'mog':
                        # Weighted sum of the mixture component densities.
                        dens = [
                            param[j] * norm.pdf(x, param[hps.n_mix + j],
                                                param[2 * hps.n_mix + j])
                            for j in range(hps.n_mix)
                        ]
                        dens = np.sum(np.array(dens), axis=0)
                    self.pl[i].plot(x, dens, color='cornflowerblue')
                elif hps.output == 'categorical':
                    dens = param
                    edges = model.transformer.plot_edges
                    self.pl[i].hist(model.transformer.means, bins=edges,
                                    weights=dens, color='cornflowerblue')
                # Track the tallest density seen so far for this panel.
                overall_max_dens[i] = np.max(
                    [overall_max_dens[i], np.max(dens)])

    # add the mean
    grand_means = np.mean(np.array(overall_means), axis=0)
    # new seed for parametric uncertainty
    seed = [np.random.randint(1e15), np.random.randint(1e15)]
    grand_sds = analytic_sd(sess, model, self.sb, self.ab, seed,
                            hps.p_dropout, hps.output)

    # get policy estimates
    s = np.array([[0], [1], [2]])
    a_thompson = np.array([
        thompson_policy(s, model, sess, hps, seed,
                        eval_on_mean_output=False,
                        eval_on_mean_params=False) for i in range(100)
    ])
    a_ucb = np.array([
        ucb_policy(s, model, sess, hps, seed,
                   eval_on_mean_output=False,
                   eval_on_mean_params=False) for i in range(100)
    ])
    thompson_probs = np.zeros(6)
    ucb_probs = np.zeros(6)
    for j, (state, action) in enumerate(zip(self.sb, self.ab)):
        thompson_probs[j] = np.mean(a_thompson[:, state, :] == action)
        ucb_probs[j] = np.mean(a_ucb[:, state, :] == action)

    for i in range(6):
        grand_mean = grand_means[i]
        max_dens = overall_max_dens[i]
        self.pl[i].plot([grand_mean, grand_mean], [0, max_dens], '--',
                        color='orange')
        # Backslashes are escaped explicitly: '\m' and '\s' are invalid escape
        # sequences, while '\n' must stay a real newline. The rendered text is
        # unchanged.
        self.pl[i].text(0.05, 0.75,
                        '$\\mu=${:0.2f}\n$\\sigma=${:0.2f}'.format(
                            grand_mean, grand_sds[i][0]),
                        transform=self.pl[i].transAxes,
                        fontsize=19)
        self.pl[i].text(0.56, 0.75,
                        'tho={:0.2f}\nucb={:0.2f}'.format(
                            thompson_probs[i], ucb_probs[i]),
                        transform=self.pl[i].transAxes,
                        fontsize=19)
        self.pl[i].set_ylim([0, 3])
        # Fixed x-range; the earlier set_xlim([lower, upper]) call was always
        # overridden by this one and has been removed.
        self.pl[i].set_xlim([-2, 7])
        for spine in self.pl[i].spines.values():
            spine.set_edgecolor('lightgrey')
            spine.set_linewidth(5)
        self.pl[i].grid(False)

    # plt.xticks / plt.yticks act on the current axes only.
    plt.xticks(fontsize=8)
    plt.yticks(fontsize=8)
    self.fig.canvas.draw()
    self.fig.savefig(hps.base_result_dir + 'episode_{}'.format(ep), dpi=300)
    self.fig.canvas.flush_events()