Python ucb_policy示例，rl.policies.ucb_policy Python示例

示例#1

0

显示文件

文件： duvn_agent.py 项目： tmoer/return_distribution_exploration

def roll_out(hps, Env, model, sess, s, e, seed):
    """Roll the environment forward from state ``s`` under the configured policy.

    Args:
        hps: Hyperparameters; ``policy`` (one of ``'thompson'``, ``'egreedy'``,
            ``'ucb'``) and ``max_roll_len`` are read here.
        Env: Environment whose ``step(action)`` returns
            ``(next_state, reward, terminal, info)``.
        model: Network wrapper; ``epsilon`` and ``action_discrete`` are read.
        sess: Session used to evaluate the model.
        s: Start state; indexed as ``s[None, :]`` to add a leading batch axis.
        e: Not read. Kept for interface compatibility; for ``'egreedy'`` the
            exploration rate is always fetched from ``model.epsilon``.
        seed: Seed passed to the policy on every step, or ``None`` to draw a
            fresh random seed pair per step.

    Returns:
        Tuple ``(rollout_data, s)``: the filled ``PartialRollout`` and the
        last state reached.

    Raises:
        ValueError: If ``hps.policy`` is not a supported policy name.
    """
    # Fail fast: previously an unknown policy left the action unbound and
    # surfaced as an UnboundLocalError inside the loop.
    if hps.policy not in ('thompson', 'egreedy', 'ucb'):
        raise ValueError(
            "Unknown policy {!r}; expected 'thompson', 'egreedy' or 'ucb'".
            format(hps.policy))

    rollout_data = PartialRollout()
    if seed is not None:
        rollout_data.seed = seed

    # NOTE(review): the loop relies on rollout_data.add() updating
    # rollout_data.terminal and rollout_data.t -- confirm in PartialRollout.
    while (not rollout_data.terminal) and (rollout_data.t < hps.max_roll_len):
        local_seed = seed if seed is not None else [
            np.random.randint(1e15),
            np.random.randint(1e15)
        ]  # sample the local seed

        # Policy
        if hps.policy == 'thompson':
            a = thompson_policy(s[None, :], model, sess, hps, seed=local_seed)
        elif hps.policy == 'egreedy':
            epsilon = sess.run(model.epsilon)
            a = egreedy_policy(s[None, :],
                               model,
                               sess,
                               hps,
                               e=epsilon,
                               seed=local_seed)
        else:  # 'ucb' (validated above)
            a = ucb_policy(s[None, :], model, sess, hps, seed=local_seed)
        a = a[0]  # drop the batch axis added by s[None, :]

        # Steps
        s1, r, terminal, _ = Env.step(
            correct_action_dim(a, model.action_discrete))
        s1 = correct_dim(s1)
        rollout_data.add(s, a, [r], terminal)
        s = s1
    rollout_data.add_last_state(s)  # add the last state
    return rollout_data, s

示例#2

0

显示文件

def offpolicy_argmax(sess,model,sb,action_dim,seed,hps,off_policy_on_mean):
    """Return the action chosen for the off-policy decision on states ``sb``.

    Args:
        sess: Session used to evaluate the model.
        model: Network wrapper handed to the policy function.
        sb: Batch of states handed to the policy function.
        action_dim: Not read by this function; kept for interface compatibility.
        seed: Seed forwarded to the policy function.
        hps: Hyperparameters; ``policy`` selects ``'thompson'``, ``'egreedy'``
            or ``'ucb'``.
        off_policy_on_mean: Forwarded as ``eval_on_mean_output`` to the
            Thompson and UCB policies (if True, the mean of the output is
            considered). Not used for ``'egreedy'``, which is always run
            with ``e=0.0``.

    Returns:
        Whatever the selected policy function returns for ``sb``.

    Raises:
        ValueError: If ``hps.policy`` is not a supported policy name.
    """
    if hps.policy == 'thompson':
        return thompson_policy(sb,model,sess,hps,seed,eval_on_mean_output=off_policy_on_mean,eval_on_mean_params=False)
    if hps.policy == 'egreedy':
        # e=0.0 disables exploration for the off-policy target
        return egreedy_policy(sb,model,sess,hps,e=0.0,seed=seed)
    if hps.policy == 'ucb':
        return ucb_policy(sb,model,sess,hps,seed,eval_on_mean_output=off_policy_on_mean,eval_on_mean_params=False)
    # Previously fell through to `return a` with `a` unbound (UnboundLocalError).
    raise ValueError(
        "Unknown policy {!r}; expected 'thompson', 'egreedy' or 'ucb'".format(
            hps.policy))

示例#3

0

显示文件

文件： duvn_agent.py 项目： tmoer/return_distribution_exploration

def evaluate(Env, hps, model, sess):
    """Run ``hps.n_eval`` evaluation episodes and summarise the rewards.

    Args:
        Env: Environment with ``reset()``, ``step(action)`` returning
            ``(next_state, reward, terminal, info)`` and ``render()``.
        hps: Hyperparameters; reads ``policy``, ``n_eval``, ``max_ep_len``,
            ``fix_seed_per_roll_out``, ``visualize`` and, for the Thompson
            policy, ``eval_on_mean_output`` / ``eval_on_mean_params``.
        model: Network wrapper handed to the policy function.
        sess: Session used to evaluate the model.

    Returns:
        Tuple ``(mean_return, mean_avg_reward)``: the mean over episodes of
        the summed reward, and the mean over episodes of the reward per step.

    Raises:
        ValueError: If ``hps.policy`` is not a supported policy name.
    """
    # Fail fast: previously an unknown policy left the action unbound and
    # surfaced as an UnboundLocalError inside the episode loop.
    if hps.policy not in ('thompson', 'egreedy', 'ucb'):
        raise ValueError(
            "Unknown policy {!r}; expected 'thompson', 'egreedy' or 'ucb'".
            format(hps.policy))

    R_ep = []  # summed reward per episode
    R_av = []  # reward per step, per episode
    for i in range(hps.n_eval):
        s = Env.reset()
        s = correct_dim(s)
        R = 0
        # One seed for the whole episode, or None to resample at every step
        seed = [np.random.randint(1e15),
                np.random.randint(1e15)] if hps.fix_seed_per_roll_out else None
        for j in range(hps.max_ep_len):
            local_seed = seed if seed is not None else [
                np.random.randint(1e15),
                np.random.randint(1e15)
            ]  # sample the local seed
            if hps.policy == 'thompson':
                a = thompson_policy(s[None, :], model, sess, hps, local_seed,
                                    hps.eval_on_mean_output,
                                    hps.eval_on_mean_params)
            elif hps.policy == 'egreedy':
                # e=0.0: no exploration during evaluation
                a = egreedy_policy(s[None, :],
                                   model,
                                   sess,
                                   hps,
                                   e=0.0,
                                   seed=local_seed)
            else:  # 'ucb' (validated above)
                a = ucb_policy(s[None, :], model, sess, hps, local_seed)
            a = a[0]  # drop the batch axis added by s[None, :]

            # NOTE(review): other rollout code in this project passes
            # model.action_discrete to correct_action_dim; model.action_dim
            # is used here -- confirm which one the helper expects.
            s1, r, terminal, _ = Env.step(
                correct_action_dim(a, model.action_dim))
            if hps.visualize:
                Env.render()
            s = correct_dim(s1)
            R += r
            if terminal:
                break
        R_ep.append(R)
        R_av.append(R / (j + 1))  # j + 1 == number of steps taken
    return np.mean(R_ep), np.mean(R_av)

示例#4

0

显示文件

    def update(self, sess, model, hps, ep):
        """Redraw all ``self.n`` density panels and save the figure for ``ep``.

        For each of ``hps.n_rep_visualize`` random seeds the network
        parameters and means are fetched for the stored (state, action)
        batch ``self.sb`` / ``self.ab`` and the predicted return density is
        drawn on the matching axis in ``self.pl``. Afterwards the mean over
        repetitions and the value returned by ``analytic_sd`` are annotated
        on every panel, each axis in ``self.ax`` gets a green (matches
        ``self.truth``) or red frame, and the figure is written to
        ``hps.result_dir + 'episode_<ep>'``.

        Args:
            sess: Session used to evaluate the model.
            model: Network wrapper; ``model.transformer`` is read for
                categorical outputs.
            hps: Hyperparameters; reads ``n_rep_visualize``, ``p_dropout``,
                ``output`` ('gaussian', 'mog', 'categorical' or
                'deterministic'), ``n_mix`` and ``result_dir``.
            ep: Episode index used in the output file name.
        """
        # clear plots
        for ax in self.pl:
            ax.clear()
        overall_means = np.zeros([hps.n_rep_visualize, self.n])
        # running maximum density per panel, used as height of the mean marker
        overall_max_dens = np.ones([self.n]) * -np.inf
        for k in range(hps.n_rep_visualize):
            # get prediction parameters
            seed = [np.random.randint(1e15),
                    np.random.randint(1e15)]  # new seed
            params = get_net_params(sess, model, self.sb, self.ab, seed,
                                    hps.p_dropout)
            means = get_net_mean(sess,
                                 model,
                                 self.sb,
                                 self.ab,
                                 seed,
                                 hps.p_dropout,
                                 output=hps.output)
            overall_means[k, :] = means[:, 0]

            # need to determine range
            if hps.output != 'categorical':
                if hps.output == 'gaussian':
                    mu = params[:, 0]
                    sigma = params[:, 1]
                elif hps.output == 'mog':
                    # column layout: [weights | means | sds], n_mix each
                    mu = params[:, hps.n_mix:(hps.n_mix * 2)]
                    sigma = params[:, (2 * hps.n_mix):(3 * hps.n_mix)]
                elif hps.output == 'deterministic':
                    mu = params[:, 0]
                    sigma = 1.0

                max_sd = np.max(sigma)
                lower, upper = np.min(mu) - 3 * max_sd, np.max(mu) + 3 * max_sd
            else:
                lower, upper = model.transformer.plot_edges[
                    0], model.transformer.plot_edges[-1]

            # update all plots
            x = np.linspace(lower, upper, 100)
            for i in range(self.n):
                param = params[i, :]
                if hps.output == 'deterministic':
                    # a point estimate: draw a unit-height marker at the mean
                    max_dens = 1.0
                    overall_max_dens[i] = 1.0
                    mean = means[i]
                    self.pl[i].plot([mean, mean], [0, max_dens], ':')
                else:
                    if hps.output == 'gaussian' or hps.output == 'mog':
                        if hps.output == 'gaussian':
                            dens = norm.pdf(x, param[0], param[1])
                        elif hps.output == 'mog':
                            # weighted sum of the component densities
                            dens = [
                                param[j] * norm.pdf(x, param[hps.n_mix + j],
                                                    param[2 * hps.n_mix + j])
                                for j in range(hps.n_mix)
                            ]
                            dens = np.sum(np.array(dens), axis=0)
                        self.pl[i].plot(x, dens, color='cornflowerblue')
                    elif hps.output == 'categorical':
                        dens = param
                        edges = model.transformer.plot_edges
                        self.pl[i].hist(model.transformer.means,
                                        bins=edges,
                                        weights=dens,
                                        color='cornflowerblue')
                    overall_max_dens[i] = np.max(
                        [overall_max_dens[i],
                         np.max(dens)])
        # add the mean
        grand_means = np.mean(np.array(overall_means), axis=0)
        seed = [np.random.randint(1e15),
                np.random.randint(1e15)]  # new seed for parametric uncertainty
        grand_sds = analytic_sd(sess, model, self.sb, self.ab, seed,
                                hps.p_dropout, hps.output)

        # get policy estimates
        # NOTE(review): thompson_probs / ucb_probs are computed but not drawn
        # (the annotation that used them is disabled). The policy calls are
        # kept because their side effects are not visible from here.
        s = np.arange(0, int(self.n / 2), 1)[:, None]
        a_thompson = np.array([
            thompson_policy(s,
                            model,
                            sess,
                            hps,
                            seed,
                            eval_on_mean_output=False,
                            eval_on_mean_params=False) for i in range(100)
        ])
        a_ucb = np.array([
            ucb_policy(s,
                       model,
                       sess,
                       hps,
                       seed,
                       eval_on_mean_output=False,
                       eval_on_mean_params=False) for i in range(100)
        ])

        thompson_probs = np.zeros(self.n)
        ucb_probs = np.zeros(self.n)

        for j, (state, action) in enumerate(zip(self.sb, self.ab)):
            # fraction of the 100 draws that picked `action` in `state`
            thompson_probs[j] = np.mean(a_thompson[:, state, :] == action)
            ucb_probs[j] = np.mean(a_ucb[:, state, :] == action)

        for i in range(self.n):
            grand_mean = grand_means[i]
            max_dens = overall_max_dens[i]
            self.pl[i].plot([grand_mean, grand_mean], [0, max_dens],
                            '--',
                            color='orange')
            # Raw strings: '\m' and '\s' are invalid escape sequences in a
            # normal string literal; the rendered mathtext is unchanged.
            self.pl[i].text(0.1,
                            0.75,
                            r'$\mu$={:0.2f}'.format(grand_mean),
                            transform=self.pl[i].transAxes)
            self.pl[i].text(0.55,
                            0.75,
                            r'$\sigma$={:0.2f}'.format(grand_sds[i][0]),
                            transform=self.pl[i].transAxes)

        for j in range(int(self.n / 2)):
            for l in range(2):
                # green frame where action l matches self.truth[j], red otherwise
                col = 'g' if self.truth[j] == l else 'r'
                self.ax[l, j].add_patch(
                    patches.Rectangle((0.01, 0.01),
                                      0.98,
                                      0.98,
                                      linewidth=10,
                                      edgecolor=col,
                                      facecolor='none',
                                      transform=self.ax[l, j].transAxes))
                # only the first column keeps y tick labels, only row 1 keeps x
                if j > 0:
                    plt.setp(self.ax[l, j].get_yticklabels(), visible=False)
                if l == 0:
                    plt.setp(self.ax[l, j].get_xticklabels(), visible=False)
                self.ax[l, j].set_ylim([0, 1.0])
                self.ax[l, j].set_xlim([-2.5, 2.5])

        self.fig.canvas.draw()
        self.fig.savefig(hps.result_dir + 'episode_{}'.format(ep), dpi=300)
        self.fig.canvas.flush_events()

示例#5

0

显示文件

文件： duvn_agent.py 项目： tmoer/return_distribution_exploration

def collect_data(hps, model, sess, Env, e):
    """Collect ``hps.n_ep_collect`` episodes of experience under ``hps.policy``.

    Args:
        hps: Hyperparameters; reads ``policy`` (one of ``'thompson'``,
            ``'egreedy'``, ``'ucb'``), ``n_ep_collect``, ``max_ep_len`` and
            ``fix_seed_per_roll_out``.
        model: Network wrapper; ``epsilon`` and ``action_discrete`` are read.
        sess: Session used to evaluate the model.
        Env: Environment with ``reset()`` and ``step(action)`` returning
            ``(next_state, reward, terminal, info)``.
        e: Not read. Kept for interface compatibility; for ``'egreedy'`` the
            exploration rate is always fetched from ``model.epsilon``.

    Returns:
        Tuple ``(t, t_, R_mean, R_sum, data)``: total number of steps, list of
        steps per episode, list of reward-per-step per episode, list of summed
        reward per episode, and the list of ``PartialRollout`` objects.

    Raises:
        ValueError: If ``hps.policy`` is not a supported policy name.
        Exception: Any error raised while selecting an action is printed
            with its context and then re-raised.
    """
    # Fail fast on a misconfigured policy before touching the environment.
    if hps.policy not in ('thompson', 'egreedy', 'ucb'):
        raise ValueError(
            "Unknown policy {!r}; expected 'thompson', 'egreedy' or 'ucb'".
            format(hps.policy))

    ep = 0  # episode counter
    t = 0  # timestep counter
    t_, R_mean, R_sum, data = [], [], [], []  # data per episode

    while ep < hps.n_ep_collect:
        s = Env.reset()
        s = correct_dim(s)
        # One seed for the whole episode, or None to resample at every step
        seed = [np.random.randint(1e15),
                np.random.randint(1e15)] if hps.fix_seed_per_roll_out else None
        rollout_data = PartialRollout()
        if seed is not None:
            rollout_data.seed = seed
        terminal = False

        while (not terminal) and (rollout_data.t < hps.max_ep_len):
            local_seed = seed if seed is not None else [
                np.random.randint(1e15),
                np.random.randint(1e15)
            ]  # sample the local seed
            # Policy
            try:
                if hps.policy == 'thompson':
                    a = thompson_policy(s[None, :],
                                        model,
                                        sess,
                                        hps,
                                        seed=local_seed)
                elif hps.policy == 'egreedy':
                    epsilon = sess.run(model.epsilon)
                    a = egreedy_policy(s[None, :],
                                       model,
                                       sess,
                                       hps,
                                       e=epsilon,
                                       seed=local_seed)
                else:  # 'ucb' (validated above)
                    a = ucb_policy(s[None, :],
                                   model,
                                   sess,
                                   hps,
                                   seed=local_seed)
                a = a[0]  # drop the batch axis added by s[None, :]
            except Exception as exc:
                print('hps.policy = {}, s = {} exception = {}'.format(
                    hps.policy, s, exc))
                # Re-raise: continuing would step the environment with a stale
                # (or unbound) action and record a transition the policy never
                # chose.
                raise

            # Steps
            s1, r, terminal, _ = Env.step(
                correct_action_dim(a, model.action_discrete))
            s1 = correct_dim(s1)
            rollout_data.add(s, a, [r])
            s = s1
        rollout_data.add_last_state(s, terminal)  # add the last state

        data.append(rollout_data)
        ep += 1
        t += rollout_data.t

        R_sum.append(rollout_data.r_sum)
        R_mean.append(rollout_data.r_sum / rollout_data.t)
        t_.append(rollout_data.t)

    return t, t_, R_mean, R_sum, data

示例#6

0

显示文件

    def update(self, sess, model, hps, ep):
        """Redraw the six (state, action) density panels and save the figure.

        For each of ``hps.n_rep_visualize`` random seeds the network
        parameters and means are fetched for the stored batch ``self.sb`` /
        ``self.ab`` and the predicted return density is drawn on the
        matching axis in ``self.pl``. Every panel is then annotated with the
        mean over repetitions, the value returned by ``analytic_sd``, and the
        fraction of 100 Thompson / UCB policy calls that selected that
        action. The figure is written to
        ``hps.base_result_dir + 'episode_<ep>'``.

        Args:
            sess: Session used to evaluate the model.
            model: Network wrapper; ``model.transformer`` is read for
                categorical outputs.
            hps: Hyperparameters; reads ``n_rep_visualize``, ``p_dropout``,
                ``output`` ('gaussian', 'mog', 'categorical' or
                'deterministic'), ``n_mix`` and ``base_result_dir``.
            ep: Episode index used in the output file name.
        """
        # clear plots
        names = [
            '$s_0,a_0$', '$s_0,a_1$', '$s_1,a_0$', '$s_1,a_1$', '$s_2,a_0$',
            '$s_2,a_1$'
        ]
        for i in range(6):
            self.pl[i].clear()
            self.pl[i].set_title(names[i], fontsize=22)

        overall_means = np.zeros([hps.n_rep_visualize, 6])
        # running maximum density per panel, used as height of the mean marker
        overall_max_dens = np.ones([6]) * -np.inf
        for k in range(hps.n_rep_visualize):
            # get prediction parameters
            seed = [np.random.randint(1e15),
                    np.random.randint(1e15)]  # new seed
            params = get_net_params(sess, model, self.sb, self.ab, seed,
                                    hps.p_dropout)
            means = get_net_mean(sess,
                                 model,
                                 self.sb,
                                 self.ab,
                                 seed,
                                 hps.p_dropout,
                                 output=hps.output)
            overall_means[k, :] = means[:, 0]

            # need to determine range
            if hps.output != 'categorical':
                if hps.output == 'gaussian':
                    mu = params[:, 0]
                    sigma = params[:, 1]
                elif hps.output == 'mog':
                    # column layout: [weights | means | sds], n_mix each
                    mu = params[:, hps.n_mix:(hps.n_mix * 2)]
                    sigma = params[:, (2 * hps.n_mix):(3 * hps.n_mix)]
                elif hps.output == 'deterministic':
                    mu = params[:, 0]
                    sigma = 1.0

                max_sd = np.max(sigma)
                lower, upper = np.min(mu) - 3 * max_sd, np.max(mu) + 3 * max_sd
            else:
                lower, upper = model.transformer.plot_edges[
                    0], model.transformer.plot_edges[-1]

            # update all plots
            x = np.linspace(lower, upper, 100)
            for i in range(6):
                param = params[i, :]
                if hps.output == 'deterministic':
                    # a point estimate: draw a unit-height marker at the mean
                    max_dens = 1.0
                    overall_max_dens[i] = 1.0
                    mean = means[i]
                    self.pl[i].plot([mean, mean], [0, max_dens], ':')
                else:
                    if hps.output == 'gaussian' or hps.output == 'mog':
                        if hps.output == 'gaussian':
                            dens = norm.pdf(x, param[0], param[1])
                        elif hps.output == 'mog':
                            # weighted sum of the component densities
                            dens = [
                                param[j] * norm.pdf(x, param[hps.n_mix + j],
                                                    param[2 * hps.n_mix + j])
                                for j in range(hps.n_mix)
                            ]
                            dens = np.sum(np.array(dens), axis=0)
                        self.pl[i].plot(x, dens, color='cornflowerblue')
                    elif hps.output == 'categorical':
                        dens = param
                        edges = model.transformer.plot_edges
                        self.pl[i].hist(model.transformer.means,
                                        bins=edges,
                                        weights=dens,
                                        color='cornflowerblue')
                    overall_max_dens[i] = np.max(
                        [overall_max_dens[i],
                         np.max(dens)])
        # add the mean
        grand_means = np.mean(np.array(overall_means), axis=0)
        seed = [np.random.randint(1e15),
                np.random.randint(1e15)]  # new seed for parametric uncertainty
        grand_sds = analytic_sd(sess, model, self.sb, self.ab, seed,
                                hps.p_dropout, hps.output)

        # get policy estimates
        s = np.array([[0], [1], [2]])
        a_thompson = np.array([
            thompson_policy(s,
                            model,
                            sess,
                            hps,
                            seed,
                            eval_on_mean_output=False,
                            eval_on_mean_params=False) for i in range(100)
        ])
        a_ucb = np.array([
            ucb_policy(s,
                       model,
                       sess,
                       hps,
                       seed,
                       eval_on_mean_output=False,
                       eval_on_mean_params=False) for i in range(100)
        ])

        thompson_probs = np.zeros(6)
        ucb_probs = np.zeros(6)

        for j, (state, action) in enumerate(zip(self.sb, self.ab)):
            # fraction of the 100 draws that picked `action` in `state`
            thompson_probs[j] = np.mean(a_thompson[:, state, :] == action)
            ucb_probs[j] = np.mean(a_ucb[:, state, :] == action)

        for i in range(6):
            grand_mean = grand_means[i]
            max_dens = overall_max_dens[i]
            self.pl[i].plot([grand_mean, grand_mean], [0, max_dens],
                            '--',
                            color='orange')
            # Backslashes are doubled because '\m' and '\s' are invalid escape
            # sequences; '\n' must stay a real newline, so a raw string is not
            # an option. The rendered text is unchanged.
            self.pl[i].text(0.05,
                            0.75,
                            '$\\mu=${:0.2f}\n$\\sigma=${:0.2f}'.format(
                                grand_mean, grand_sds[i][0]),
                            transform=self.pl[i].transAxes,
                            fontsize=19)
            self.pl[i].text(0.56,
                            0.75,
                            'tho={:0.2f}\nucb={:0.2f}'.format(
                                thompson_probs[i], ucb_probs[i]),
                            transform=self.pl[i].transAxes,
                            fontsize=19)
            self.pl[i].set_ylim([0, 3])
            # Fixed x-range for every panel; the data-driven [lower, upper]
            # limits that used to be set first were immediately overridden.
            self.pl[i].set_xlim([-2, 7])
            for spine in self.pl[i].spines.values():
                spine.set_edgecolor('lightgrey')
                spine.set_linewidth(5)
            self.pl[i].grid(False)

        plt.xticks(fontsize=8)
        plt.yticks(fontsize=8)
        self.fig.canvas.draw()
        self.fig.savefig(hps.base_result_dir + 'episode_{}'.format(ep),
                         dpi=300)
        self.fig.canvas.flush_events()