Example #1
def save(args):

    mdp, obs_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(
        args)

    agent = DiaynAgent(sess=None,
                       obs_dim=obs_dim,
                       num_actions=num_actions,
                       num_options=args.noptions,
                       action_dim=action_dim,
                       action_bound=action_bound,
                       batch_size=32,
                       update_freq=32,
                       alpha=1.0)

    agent.set_diversity(True)

    run_agents_on_mdp([agent],
                      mdp,
                      episodes=args.snepisodes,
                      steps=args.snsteps,
                      instances=1,
                      cumulative_plot=True)

    if args.trajdir == '__default':
        prefix = '.'
    else:
        prefix = args.trajdir

    agent.save(directory=prefix + '/vis' + '/' + str(args.task) + 'option' +
               str(args.noptions) + 'diayn',
               name='diayn-pretrain')
Example #2
def main(open_plot=True):
    # TODO: Refactor and combine visualize_visitation, visualize_option, visualize_option_trajectory?

    # Plot the visitation statistics

    args = arguments()

    # Random seeds
    np.random.seed(1234)
    tf.set_random_seed(args.rseed)

    print('task=', args.task)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(
        args)

    if args.restoretraj:
        # bfr = ExperienceBuffer()
        # bfr.restore('./vis/' + args.task + 'option' + str(args.noptions) + '_' + str(args.rseed) + '/' + 'traj')
        low_bfr = ExperienceBuffer()
        if args.reverse:
            low_bfr.restore(args.basedir + '/vis/' + args.task + 'option' +
                            str(args.noptions) + 'rev_' +
                            str(args.ffuncnunit) + '_' + str(args.rseed) +
                            '/' + 'low_traj')
        else:
            low_bfr.restore(args.basedir + '/vis/' + args.task + 'option' +
                            str(args.noptions) + '_' + str(args.ffuncnunit) +
                            '_' + str(args.rseed) + '/' + 'low_traj')
    else:
        _, low_bfr = sample_option_trajectories(mdp,
                                                args,
                                                noptions=args.noptions)

    # TODO: Print a list of states

    samples = low_bfr.buffer

    size = low_bfr.size()

    cur_o = None
    traj = [samples[i][0] for i in range(size)]

    if args.reverse:
        plot_visitation(traj,
                        mdp,
                        args,
                        filename=args.basedir + '/vis/' + args.task +
                        'option' + str(args.noptions) + 'rev_' +
                        str(args.ffuncnunit) + '_' + str(args.rseed) + '/' +
                        'visitations' + '.pdf')
    else:
        plot_visitation(traj,
                        mdp,
                        args,
                        filename=args.basedir + '/vis/' + args.task +
                        'option' + str(args.noptions) + '_' +
                        str(args.ffuncnunit) + '_' + str(args.rseed) + '/' +
                        'visitations' + '.pdf')
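A note on the output paths: every example in this file rebuilds the same directory string (<basedir>/vis/<task>option<n>[rev]_<ffuncnunit>_<rseed>) by hand. As a sketch only (this helper is not part of the original code), the convention can be captured in one place:

import os


def option_dir(args, noptions=None, reverse=False):
    # Sketch of the path convention used throughout these examples; not part of the original code.
    n = args.noptions if noptions is None else noptions
    sep = 'rev_' if reverse else '_'
    name = args.task + 'option' + str(n) + sep + str(args.ffuncnunit) + '_' + str(args.rseed)
    return os.path.join(args.basedir, 'vis', name)


# Usage (hypothetical): low_bfr.restore(option_dir(args, reverse=args.reverse) + '/low_traj')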
Example #3
def main(open_plot=True):

    args = arguments()

    # Random seeds
    np.random.seed(1234)
    tf.set_random_seed(args.rseed)

    print('task=', args.task)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(
        args)

    if args.restoretraj:
        # bfr = ExperienceBuffer()
        # bfr.restore('./vis/' + args.task + 'option' + str(args.noptions) + '_' + str(args.rseed) + '/' + 'traj')
        low_bfr = ExperienceBuffer()
        if args.reverse:
            low_bfr.restore(args.basedir + '/vis/' + args.task + 'option' +
                            str(args.noptions) + 'rev_' +
                            str(args.ffuncnunit) + '_' + str(args.rseed) +
                            '/' + 'low_traj')
        else:
            low_bfr.restore(args.basedir + '/vis/' + args.task + 'option' +
                            str(args.noptions) + '_' + str(args.ffuncnunit) +
                            '_' + str(args.rseed) + '/' + 'low_traj')
    else:
        _, low_bfr = sample_option_trajectories(mdp,
                                                args,
                                                noptions=args.noptions)

        print('sampled')
    # TODO: Print a list of states

    size = low_bfr.size()

    op = OptionWrapper(sess=None,
                       experience_buffer=None,
                       obs_dim=state_dim,
                       obs_bound=mdp.bounds(),
                       num_actions=num_actions,
                       action_dim=action_dim,
                       action_bound=action_bound,
                       low_method=args.lowmethod,
                       f_func=args.ffunction,
                       n_units=args.ffuncnunit,
                       init_all=args.initall,
                       restore=True,
                       name='option' + str(args.noptions) + '_' +
                       str(args.ffuncnunit) + '_' + str(args.rseed))
    op.restore(args.basedir + '/vis/' + args.task + 'option' +
               str(args.noptions) + '_' + str(args.ffuncnunit) + '_' +
               str(args.rseed))

    filename = args.basedir + '/vis/' + args.task + 'option' + str(
        args.noptions) + '_' + str(args.ffuncnunit) + '_' + str(
            args.rseed) + '/' + 'fvalues.pdf'

    plot_fvalue(low_bfr, op, filename=filename)
Example #4
def main():
    args = arguments()

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(
        args)

    bfr, low_bfr = sample_option_trajectories(mdp,
                                              args,
                                              noptions=args.noptions)

    # TODO: Trajectories are generated using noptions-1 options.

    if args.reverse:
        bfr.save(args.basedir + '/vis/' + args.task + 'option' +
                 str(args.noptions) + 'rev_' + str(args.ffuncnunit) + '_' +
                 str(args.rseed) + '/' + 'traj')
        low_bfr.save(args.basedir + '/vis/' + args.task + 'option' +
                     str(args.noptions) + 'rev_' + str(args.ffuncnunit) + '_' +
                     str(args.rseed) + '/' + 'low_traj')
    else:
        bfr.save(args.basedir + '/vis/' + args.task + 'option' +
                 str(args.noptions) + '_' + str(args.ffuncnunit) + '_' +
                 str(args.rseed) + '/' + 'traj')
        low_bfr.save(args.basedir + '/vis/' + args.task + 'option' +
                     str(args.noptions) + '_' + str(args.ffuncnunit) + '_' +
                     str(args.rseed) + '/' + 'low_traj')

    print('bfr  size=', bfr.size())
    print('lbfr size=', low_bfr.size())

    if args.task == 'PointMaze-v0':
        s, a, r, s2, t = low_bfr.sample(20)
        for state in s:
            # print('s=', state) # TODO: how do we get the X, Y coordinates of the agent?
            print('x,y=', state.data[0], state.data[1])

    if args.task == 'MontezumaRevenge-ram-v0':
        s, a, r, s2, t = low_bfr.sample(20)

        def getByte(ram, row, col):
            row = int(row, 16) - 8
            col = int(col, 16)
            return ram[row * 16 + col]

        for state in s:
            x = int(getByte(state.data, 'a', 'a'))
            y = int(getByte(state.data, 'a', 'b'))

            x_img = int(210.0 * (float(x) - 1) / float((9 * 16 + 8) - 1))
            y_img = int(160.0 * (float(y) - (8 * 16 + 6)) /
                        float((15 * 16 + 15) - (8 * 16 + 6)))

            print('(ram) x, y =', x, y)
            print('(img) x, y =', x_img, y_img)
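For context, getByte above converts the hexadecimal (row, col) labels of the Atari RAM map into a flat index into the 128-byte RAM (rows are counted from 0x8), so ('a', 'a') reads byte 0x2A and ('a', 'b') reads byte 0x2B, which the example treats as the agent's x and y positions. A minimal standalone check, with made-up RAM values:

def getByte(ram, row, col):
    # Same helper as above: hex row/col labels -> flat RAM index.
    row = int(row, 16) - 8
    col = int(col, 16)
    return ram[row * 16 + col]


ram = [0] * 128          # the Atari 2600 exposes 128 bytes of RAM
ram[0x2A] = 0x50         # made-up x position byte
ram[0x2B] = 0xC0         # made-up y position byte
assert getByte(ram, 'a', 'a') == 0x50
assert getByte(ram, 'a', 'b') == 0xC0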
Example #5
def restore(args):

    mdp, obs_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(
        args)

    rst = DiaynAgent(sess=None,
                     obs_dim=obs_dim,
                     num_actions=num_actions,
                     action_dim=action_dim,
                     action_bound=action_bound,
                     num_options=args.noptions,
                     batch_size=1,
                     update_freq=1,
                     alpha=1.0)
    # Mirror save(): default to the current directory unless --trajdir is given.
    if args.trajdir == '__default':
        prefix = '.'
    else:
        prefix = args.trajdir

    rst.restore(directory=prefix + '/vis' + '/' + str(args.task) + 'option' +
                str(args.noptions) + 'diayn',
                name='diayn-pretrain')

    rst.set_diversity(False)

    oagent = OptionAgent(sess=None,
                         obs_dim=obs_dim,
                         obs_bound=state_bound,
                         num_actions=num_actions,
                         action_dim=action_dim,
                         action_bound=action_bound,
                         num_options=1 + args.noptions,
                         init_all=args.initall,
                         high_method=args.highmethod,
                         low_method=args.lowmethod,
                         f_func=args.ffunction,
                         batch_size=args.batchsize,
                         buffer_size=args.buffersize,
                         low_update_freq=args.lowupdatefreq,
                         option_batch_size=args.obatchsize,
                         option_buffer_size=args.obuffersize,
                         high_update_freq=args.highupdatefreq,
                         name='diayn' + str(args.noptions))

    for i in range(args.noptions):
        op = DiaynOption(rst, i, args.termprob)
        oagent.add_option(op)

    run_agents_on_mdp([oagent],
                      mdp,
                      episodes=args.nepisodes,
                      steps=args.nsteps,
                      instances=args.ninstances,
                      cumulative_plot=True)
Example #6
def main(open_plot=True):

    args = arguments()
    
    # Random seeds
    np.random.seed(1234)
    tf.set_random_seed(args.rseed)

    print('task=', args.task)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(
        args)

    if args.online:
        # TODO: Think how to solve the restoration for batch normalization.
        op = OptionWrapper(sess=None, experience_buffer=None, obs_dim=state_dim, obs_bound=mdp.bounds(), num_actions=num_actions, action_dim=action_dim, action_bound=action_bound, low_method=args.lowmethod, f_func=args.ffunction, n_units=args.ffuncnunit, init_all=args.initall, restore=True, name='online-option' +  str(args.noptions) + '_' + str(args.ffuncnunit))
        op.restore(args.basedir + '/vis/' + args.task + 'option' + str(args.noptions) + '_' + str(args.ffuncnunit) + '_' + str(args.rseed))
        plot_eigenfunction(op, args, xind=0, yind=1, filename=args.basedir + '/vis/' + args.task + 'online-option' + str(args.noptions) + '_' + str(args.ffuncnunit) + '/' + 'eigenfunc.pdf')
    else:
        op = OptionWrapper(sess=None, experience_buffer=None, obs_dim=state_dim, obs_bound=mdp.bounds(), num_actions=num_actions, action_dim=action_dim, action_bound=action_bound, low_method=args.lowmethod, f_func=args.ffunction, n_units=args.ffuncnunit, init_all=args.initall, restore=True, name='option' +  str(args.noptions) + '_' + str(args.ffuncnunit) + '_' + str(args.rseed))
        op.restore(args.basedir + '/vis/' + args.task + 'option' + str(args.noptions) + '_' + str(args.ffuncnunit) + '_' + str(args.rseed))
        plot_eigenfunction(op, args, xind=0, yind=1, filename=args.basedir + '/vis/' + args.task + 'option' + str(args.noptions) + '_' + str(args.ffuncnunit) + '_' + str(args.rseed) + '/' + 'eigenfunc.pdf')
Example #7
def plot_eigenfunction(op, args, xind=0, yind=1, filename='visualize_ef.pdf'):
    # Pinball

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(args)
    
    n_samples = 2000
    
    low_bound = state_bound[0]
    up_bound = state_bound[1]

    if args.task == 'AntMaze-v0' or args.task == 'PointMaze-v0':
        low_bound[xind] = 0.0
        low_bound[yind] = 0.0
        up_bound[xind] = 8.0 * 3.0
        up_bound[yind] = 8.0 * 3.0

    if args.tasktype == 'atari':
        low_bound[xind] = 0.0
        low_bound[yind] = 0.0
        up_bound[xind] = 160.0
        up_bound[yind] = 210.0

    xs = []
    ys = []
    fs = []

    # if np.isinf(low_bound).any() or np.isinf(up_bound).any():
    #     bfr = sample_option_trajectories(mdp, args, noptions=0)
    # 
    #     ss, _, _, _, _ = bfr.sample(n_samples)
    # 
    #     max_x = float('-inf')
    #     min_x = float('inf')
    #     max_y = float('-inf')
    #     min_y = float('inf')
    #     for i in range(n_samples):
    #         x = ss[i].data[xind]
    #         y = ss[i].data[yind]
    #         max_x = max(x, max_x)
    #         min_x = min(x, min_x)
    #         max_y = max(y, max_y)
    #         min_y = min(y, min_y)
    #     low_bound[xind] = min_x
    #     up_bound[xind] = max_x
    #     low_bound[yind] = min_y
    #     up_bound[yind] = max_y

    # TODO: Implement a script to plot the f-value of the states
    #       visited by the agent instead of sampling uniform randomly.

    if args.restoretraj:
        print('restoring buffer from ' + './vis/' + args.task + 'option' + str(args.noptions - 1) + '_' + str(args.ffuncnunit) + '_' + str(args.rseed) + '/' + 'traj')
        bfr = ExperienceBuffer()
        bfr.restore(args.basedir + '/vis/' + args.task + 'option' + str(args.noptions - 1) + '_' + str(args.ffuncnunit) + '_' + str(args.rseed) + '/' + 'traj')
        bfr_size = bfr.size()
        print('bfr_size=', bfr_size) # TODO: parameter?

        samples, _, _, _, _ = bfr.sample(n_samples)
        # samples = [bfr.buffer[i][0] for i in range(min(bfr.size(), n_samples))]

        if args.task == 'MontezumaRevenge-ram-v0':
            feature = Monte()
            xs = [feature.feature(s, 0)[0] for s in samples]
            ys = [feature.feature(s, 0)[1] for s in samples]
        elif args.ffunction == 'nns':
            feature = Subset(state_dim, [0, 1])
            xs = [feature.feature(s, 0)[0] for s in samples]
            ys = [feature.feature(s, 0)[1] for s in samples]
        else:
            xs = [s.data[xind] for s in samples]
            ys = [s.data[yind] for s in samples]

    else:
        xs = [random.uniform(low_bound[xind], up_bound[xind]) for _ in range(n_samples)]
        ys = [random.uniform(low_bound[yind], up_bound[yind]) for _ in range(n_samples)]

    fs = []
    
    for i in range(len(xs)):
        if args.task == 'MontezumaRevenge-ram-v0':
            obs = np.array([xs[i], ys[i]])
            obs = np.reshape(obs, (1, 2))
            f_value = op.f_function.f_from_features(obs)[0][0]
        elif args.ffunction == 'nns':
            obs = np.array([xs[i], ys[i]])
            obs = np.reshape(obs, (1, 2))
            f_value = op.f_function.f_from_features(obs)[0][0]
        else:
            s = mdp.get_init_state()
            s.data[xind] = xs[i]
            s.data[yind] = ys[i]
            f_value = op.f_function(s)[0][0]            
        fs.append(f_value)

    # TODO: What is the best colormap for all viewers (including those with color blindness) that is still appealing to the majority?
    #       bwr looks useful, but may be misleading.
    cmap = matplotlib.cm.get_cmap('plasma')
    normalize = matplotlib.colors.Normalize(vmin=min(fs), vmax=max(fs))
    colors = [cmap(normalize(value)) for value in fs]
    # colors_np = np.asarray(colors)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(x=xs, y=ys, c=colors)

    cax, _ = matplotlib.colorbar.make_axes(ax)
    cbar = matplotlib.colorbar.ColorbarBase(cax, cmap=cmap, norm=normalize)

    term_th = op.lower_th
    cax.plot([0, 1], [term_th] * 2, 'k')

    term, nonterm = 0, 0
    for f in fs:
        if f < term_th:
            term += 1
        else:
            nonterm += 1
    print(term, 'terms', nonterm, 'nonterms')
    # TODO: Only for pinball domains. What to do for MuJoCo?
    # Obstacles
    if args.tasktype == 'pinball':
        for obs in mdp.domain.environment.obstacles:
            point_list = obs.points
            xlist = []
            ylist = []
            for p in point_list:
                xlist.append(p[0])
                ylist.append(p[1])

            ax.fill(xlist, ylist, 'k')
            
    elif args.task == 'PointMaze-v0' or args.task == 'AntMaze-v0':
        # TODO: (x, y) coordinates start at (0, 0).
        #       How are the coordinates signed?
        maze = [[1, 1, 1, 1, 1],
                [1, 0, 0, 0, 1],
                [1, 1, 1, 0, 1],
                [1, 0, 0, 0, 1],
                [1, 1, 1, 1, 1]]
        scale = 8.0
        for y in range(5):
            for x in range(5):
                if maze[y][x] == 1:
                    # Shift x and y by one because the (0, 0) coordinate corresponds to cell (1, 1) of the maze.
                    xbase, ybase = scale * (x - 1), scale * (y - 1)
                    xlist = [xbase, xbase + scale, xbase + scale, xbase]
                    ylist = [ybase, ybase, ybase + scale, ybase + scale]
                    ax.fill(xlist, ylist, 'k')
    elif args.task == 'MontezumaRevenge-ram-v0':
        # TODO: Show the background of the Monte?
        img = imread('./montezuma.jpg')
        ax.imshow(img, zorder=0, extent=[0, 160, 0, 210])
    
    plt.savefig(filename)
    plt.close()
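The color mapping in plot_eigenfunction follows a standard matplotlib pattern: normalize the f-values into [0, 1], push them through a colormap, and draw a matching colorbar on a separate axis. A self-contained sketch of just that pattern (random stand-in data, not the option's actual f-values):

import matplotlib
import matplotlib.cm
import matplotlib.colorbar
import matplotlib.colors
import matplotlib.pyplot as plt
import numpy as np

fs = np.random.uniform(-1.0, 1.0, size=200)        # stand-in for f-values
xs, ys = np.random.rand(200), np.random.rand(200)   # stand-in coordinates

cmap = matplotlib.cm.get_cmap('plasma')
normalize = matplotlib.colors.Normalize(vmin=fs.min(), vmax=fs.max())

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(x=xs, y=ys, c=[cmap(normalize(v)) for v in fs])

# Draw the colorbar on its own axis, as the example above does.
cax, _ = matplotlib.colorbar.make_axes(ax)
matplotlib.colorbar.ColorbarBase(cax, cmap=cmap, norm=normalize)

plt.savefig('colormap_demo.pdf')
plt.close()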
Example #8
def main(open_plot=True):
    # TODO: Accept a set of options instead of just one
    args = arguments()

    # Random seeds
    np.random.seed(args.rseed)
    tf.set_random_seed(args.rseed)

    print('tasktype=', args.tasktype)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(
        args)

    oagent = OptionAgent(sess=None,
                         obs_dim=state_dim,
                         obs_bound=state_bound,
                         num_actions=num_actions,
                         action_dim=action_dim,
                         action_bound=action_bound,
                         num_options=1 + args.noptions,
                         init_all=args.initall,
                         high_method=args.highmethod,
                         low_method=args.lowmethod,
                         f_func=args.ffunction,
                         batch_size=args.batchsize,
                         buffer_size=args.buffersize,
                         low_update_freq=args.lowupdatefreq,
                         option_batch_size=args.obatchsize,
                         option_buffer_size=args.obuffersize,
                         high_update_freq=args.highupdatefreq,
                         name='op')
    oagent.reset()

    for nop in range(1, args.noptions + 1):
        op = OptionWrapper(sess=None,
                           experience_buffer=None,
                           obs_dim=state_dim,
                           obs_bound=mdp.bounds(),
                           num_actions=num_actions,
                           action_dim=action_dim,
                           action_bound=action_bound,
                           low_method=args.lowmethod,
                           f_func=args.ffunction,
                           init_all=args.initall,
                           restore=True,
                           name='option' + str(nop) + '_' +
                           str(args.ffuncnunit) + '_' + str(args.rseed))

        if args.trajdir == '__default':
            if args.reverse:
                opdir = './vis/' + args.task + 'option' + str(
                    nop) + 'rev_' + str(args.ffuncnunit) + '_' + str(
                        args.rseed)
            else:
                opdir = './vis/' + args.task + 'option' + str(nop) + '_' + str(
                    args.ffuncnunit) + '_' + str(args.rseed)
        else:
            # Only one option can be restored from nonstandard locations
            assert (args.noptions == 1)
            opdir = args.trajdir
        op.restore(opdir)
        print('restored option', opdir)
        # print('upper_th=', op.upper_th)
        oagent.add_option(op)

    agents = []
    agents.append(oagent)

    if args.base:
        base = OptionAgent(sess=None,
                           obs_dim=state_dim,
                           obs_bound=state_bound,
                           num_actions=num_actions,
                           action_dim=action_dim,
                           action_bound=action_bound,
                           num_options=1,
                           high_method=args.highmethod,
                           low_method=args.lowmethod,
                           f_func=args.ffunction,
                           batch_size=args.batchsize,
                           buffer_size=args.buffersize,
                           low_update_freq=args.lowupdatefreq,
                           option_batch_size=1,
                           option_buffer_size=2,
                           high_update_freq=10000000,
                           init_all=args.initall,
                           name='base')
        agents.append(base)

    mdp.reset()

    # TODO: We need to count the number of times the agent reaches the goal state,
    #       because cumulative reward alone makes it hard to tell whether the agent is performing as intended.
    #       Possible solutions (see previous work first):
    #         1. Plot the number of times the agent reached the goal.
    #         2. Give a positive reward when it reaches the goal.
    run_agents_on_mdp(agents,
                      mdp,
                      episodes=args.nepisodes,
                      steps=args.nsteps,
                      instances=args.ninstances,
                      cumulative_plot=True)

    options = oagent.options
    for nop in range(1, len(options)):
        if args.trajdir == '__default':
            opdir = './vis/' + args.task + 'option' + str(nop) + '_' + str(
                args.ffuncnunit) + '_' + str(args.rseed)
        else:
            assert (args.noptions == 1)
            opdir = args.trajdir
        # print('upper=', options[nop].upper_th)
        options[nop].save(opdir + '_trained')

    if args.trajdir == '__default':
        bufdir = './vis/' + args.task + 'option' + str(
            args.noptions) + '_' + str(args.ffuncnunit) + '_' + str(args.rseed)
    else:
        bufdir = args.trajdir
    oagent.option_buffer.save(bufdir + '_trained' + '/' + 'traj')
    oagent.experience_buffer.save(bufdir + '_trained' + '/' + 'low_traj')
Example #9
def main(open_plot=True):

    args = arguments()

    # Random seeds
    np.random.seed(1234)
    tf.set_random_seed(args.rseed)

    print('task=', args.task)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(
        args)

    if args.restoretraj:
        # bfr = ExperienceBuffer()
        # bfr.restore('./vis/' + args.task + 'option' + str(args.noptions) + '_' + str(args.rseed) + '/' + 'traj')
        low_bfr = ExperienceBuffer()
        if args.reverse:
            low_bfr.restore(args.basedir + '/vis/' + args.task + 'option' +
                            str(args.noptions) + 'rev_' +
                            str(args.ffuncnunit) + '_' + str(args.rseed) +
                            '/' + 'low_traj')
        else:
            low_bfr.restore(args.basedir + '/vis/' + args.task + 'option' +
                            str(args.noptions) + '_' + str(args.ffuncnunit) +
                            '_' + str(args.rseed) + '/' + 'low_traj')
    else:
        _, low_bfr = sample_option_trajectories(mdp,
                                                args,
                                                noptions=args.noptions)

        print('sampled')
    # TODO: Print a list of states

    samples = low_bfr.buffer

    size = low_bfr.size()

    print('size=', size)

    trajectories = []

    cur_o = None
    for i in range(size):
        # TODO: something wrong is happening in the trajectory. Why?
        s, a, r, s2, t, o = (samples[i][0], samples[i][1], samples[i][2],
                             samples[i][3], samples[i][4], samples[i][5])

        # assert(t is False)

        # print('o=', o, ', t=', t)

        if cur_o == args.noptions:
            if o == args.noptions and not t and i != size - 1:
                traj.append(s)
            else:
                # traj.append(s2)
                if args.tasktype == 'pinball':
                    # TODO: hack to remove the init state.
                    filtered = [st for st in traj if st.x != 0.2 or st.y != 0.2]
                else:
                    filtered = traj
                # for i, s in enumerate(filtered):
                # if 0.01466 <= s.data[0] and s.data[0] <= 0.01467:
                #     filtered.remove(s)
                #     # break
                # print(s.data[0])
                trajectories.append((i, filtered))

                cur_o = 0
                traj = []

                # TODO: what is the best way to print these figures out?
                # break
        else:
            if o == args.noptions:
                traj = [s]
                cur_o = args.noptions

    for i, t in trajectories:
        print(i, ' traj length=', len(t))
        if args.reverse:
            plot_trajectory(t,
                            mdp,
                            args,
                            filename=args.basedir + '/vis/' + args.task +
                            'option' + str(args.noptions) + 'rev_' +
                            str(args.ffuncnunit) + '_' + str(args.rseed) +
                            '/' + 'traj' + str(i) + '.pdf')
        else:
            plot_trajectory(t,
                            mdp,
                            args,
                            filename=args.basedir + '/vis/' + args.task +
                            'option' + str(args.noptions) + '_' +
                            str(args.ffuncnunit) + '_' + str(args.rseed) +
                            '/' + 'traj' + str(i) + '.pdf')
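The loop above slices the flat replay buffer into per-option trajectories: it starts a new trajectory whenever the option id becomes args.noptions and closes it when the option changes, the transition terminates, or the buffer ends. The same idea as a small standalone helper (a sketch, assuming each buffer entry is a (s, a, r, s2, t, o) tuple as above):

def split_option_trajectories(samples, option_id):
    # Group consecutive states whose transitions were executed under `option_id`.
    # Returns a list of (end_index, [states]) pairs, mirroring the loop above.
    trajectories = []
    traj = []
    for i, (s, a, r, s2, t, o) in enumerate(samples):
        if o == option_id and not t and i != len(samples) - 1:
            traj.append(s)
        elif traj:
            trajectories.append((i, traj))
            traj = []
    return trajectories


# Usage (hypothetical): trajectories = split_option_trajectories(low_bfr.buffer, args.noptions)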
Example #10
def main(open_plot=True):
    # TODO: Accept set of options and generate a new option based on them.

    args = arguments()

    np.random.seed(1234)
    # tf.set_random_seed(args.rseed)
    # tf.set_random_seed(5678)
    # tf.set_random_seed(5408)
    tf.set_random_seed(2345)

    print('tasktype=', args.tasktype)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(
        args)

    # We generate k-th option based on the previous k-1 options.

    if args.restoretraj:
        bfr = ExperienceBuffer()
        if args.reverse:
            print('restoring buffer from ' + './vis/' + args.task + 'option' +
                  str(args.noptions - 1) + 'rev_' + str(args.ffuncnunit) +
                  '_' + str(args.rseed) + '/' + 'traj')
            bfr.restore(args.basedir + '/vis/' + args.task + 'option' +
                        str(args.noptions - 1) + 'rev_' +
                        str(args.ffuncnunit) + '_' + str(args.rseed) + '/' +
                        'traj')
        else:
            print('restoring buffer from ' + './vis/' + args.task + 'option' +
                  str(args.noptions - 1) + '_' + str(args.ffuncnunit) + '_' +
                  str(args.rseed) + '/' + 'traj')
            bfr.restore(args.basedir + '/vis/' + args.task + 'option' +
                        str(args.noptions - 1) + '_' + str(args.ffuncnunit) +
                        '_' + str(args.rseed) + '/' + 'traj')
        bfr_size = bfr.size()
        print('bfr_size=', bfr_size)  # TODO: parameter?
    else:
        bfr, _ = sample_option_trajectories(mdp,
                                            args,
                                            noptions=args.noptions - 1)
        bfr_size = bfr.size()
        print('bfr_size=', bfr_size)

    # TODO: In graph theory, inserting an edge results in a significant change to the topology.
    #       However, it seems that adding just one transition sample to the NN does not change it much.
    #       Can we tackle this problem other than by sampling the trajectories again?

    op = OptionWrapper(sess=None,
                       experience_buffer=bfr,
                       option_b_size=min(32, bfr_size),
                       sp_training_steps=args.sptrainingstep,
                       obs_dim=state_dim,
                       obs_bound=state_bound,
                       num_actions=num_actions,
                       action_dim=action_dim,
                       action_bound=action_bound,
                       low_method=args.lowmethod,
                       f_func=args.ffunction,
                       n_units=args.ffuncnunit,
                       init_all=args.initall,
                       restore=None,
                       reversed_dir=args.reverse,
                       name='option' + str(args.noptions) + '_' +
                       str(args.ffuncnunit) + '_' + str(args.rseed))

    # if args.train:
    #     op.train(bfr, batch_size=args.snepisodes * args.snsteps)

    if args.reverse:
        filename = args.basedir + '/vis/' + args.task + 'option' + str(
            args.noptions) + 'rev_' + str(args.ffuncnunit) + "_" + str(
                args.rseed)
    else:
        filename = args.basedir + '/vis/' + args.task + 'option' + str(
            args.noptions) + '_' + str(args.ffuncnunit) + '_' + str(args.rseed)

    op.save(filename)
Example #11
def main(open_plot=True):
    args = arguments()

    # Random seeds
    np.random.seed(args.rseed)
    tf.set_random_seed(args.rseed)

    print('tasktype=', args.tasktype)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(
        args)

    oagent = OptionAgent(sess=None,
                         obs_dim=state_dim,
                         obs_bound=state_bound,
                         num_actions=num_actions,
                         action_dim=action_dim,
                         action_bound=action_bound,
                         num_options=args.noptions,
                         init_all=args.initall,
                         high_method=args.highmethod,
                         low_method=args.lowmethod,
                         f_func=args.ffunction,
                         batch_size=args.batchsize,
                         buffer_size=args.buffersize,
                         option_batch_size=args.obatchsize,
                         option_buffer_size=args.obuffersize,
                         option_freq=args.ofreq,
                         option_min_steps=args.ominsteps,
                         name=str(args.noptions) + 'op-initall')

    agents = []
    agents.append(oagent)

    if args.base:
        base = OptionAgent(sess=None,
                           obs_dim=state_dim,
                           obs_bound=state_bound,
                           num_actions=num_actions,
                           action_dim=action_dim,
                           action_bound=action_bound,
                           num_options=1,
                           high_method=args.highmethod,
                           low_method=args.lowmethod,
                           f_func=args.ffunction,
                           batch_size=args.batchsize,
                           buffer_size=args.buffersize,
                           option_batch_size=1,
                           option_buffer_size=2,
                           init_all=args.initall,
                           name='base')
        agents.append(base)

    mdp.reset()

    run_agents_on_mdp(agents,
                      mdp,
                      episodes=args.nepisodes,
                      steps=args.nsteps,
                      instances=args.ninstances,
                      cumulative_plot=True)

    # TODO: Save the options learned by the agent
    options = oagent.generated_options[1]
    print('options=', options)
    for i, op in enumerate(options):
        if i == 0:
            continue
        op.save('./vis/' + args.task + 'online-option' + str(i) + '_' +
                str(args.rseed))
Example #12
def main(open_plot=True):
    rseed = 1234  # 5678
    # Random seeds
    np.random.seed(1234)
    tf.set_random_seed(rseed)

    parser = argparse.ArgumentParser()

    # pinball files = pinball_box.cfg  pinball_empty.cfg  pinball_hard_single.cfg  pinball_medium.cfg  pinball_simple_single.cfg

    # Parameters for the task
    parser.add_argument('--tasktype', type=str, default='pinball')
    parser.add_argument('--task', type=str, default='pinball_empty.cfg')
    parser.add_argument('--base', action='store_true')

    parser.add_argument('--nepisodes', type=int, default=100)
    parser.add_argument('--nsteps', type=int, default=200)
    # These three are used later in this script; the defaults here are assumptions.
    parser.add_argument('--ninstances', type=int, default=1)
    parser.add_argument('--snepisodes', type=int, default=100)
    parser.add_argument('--snsteps', type=int, default=200)

    parser.add_argument('--buffersize', type=int, default=512)
    parser.add_argument('--batchsize', type=int, default=128)
    parser.add_argument('--obuffersize', type=int, default=512)
    parser.add_argument('--obatchsize', type=int, default=128)

    parser.add_argument('--highmethod', type=str, default='linear')
    parser.add_argument('--lowmethod', type=str, default='linear')
    parser.add_argument('--ffunction', type=str, default='fourier')

    # Parameters for the Agent
    parser.add_argument(
        '--noptions', type=int,
        default=5)  # (5 = 1 for primitive actions and 4 covering options).

    # Visualization
    parser.add_argument('--render', action='store_true')

    args = parser.parse_args()

    print('tasktype=', args.tasktype)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(
        args)

    ##############################################
    # Generate the f-function
    ##############################################
    if args.snepisodes == 0:
        op = OptionWrapper(sess=None,
                           experience_buffer=None,
                           obs_dim=state_dim,
                           obs_bound=mdp.bounds(),
                           num_actions=num_actions,
                           action_dim=action_dim,
                           action_bound=action_bound,
                           low_method=args.lowmethod,
                           f_func=args.ffunction,
                           restore=True,
                           name='option' + str(rseed))
        op.restore('./vis/' + args.tasktype + 'option' + str(rseed))
    else:
        bfr = sample_trajectories(mdp, args)
        buf_size = args.snepisodes * args.snsteps
        op = OptionWrapper(sess=None,
                           experience_buffer=bfr,
                           option_b_size=buf_size,
                           obs_dim=state_dim,
                           obs_bound=mdp.bounds(),
                           num_actions=num_actions,
                           action_dim=action_dim,
                           action_bound=action_bound,
                           low_method=args.lowmethod,
                           f_func=args.ffunction,
                           name='option' + str(rseed))

        ##############################################
        # Train the option policy
        ##############################################
        buffer_size = args.snsteps * args.snepisodes
        op.train(bfr, batch_size=buffer_size)

        op.save('./vis/' + args.tasktype + 'option' + str(rseed))
        exit(0)
    # plot_option(op, './vis/' + 'option' + str(rseed) + '/' + 'vis.pdf')
    ##############################################
    # Evaluate the generated option
    ##############################################
    # print('op.f_function', op.f_function)
    # oagent = OptionAgent(sess=None, obs_dim=state_dim, num_actions=len(mdp.get_actions()), num_options=2, batch_size=1, buffer_size=2, option_batch_size=1, option_buffer_size=2, name='1op')
    # oagent = OptionAgent(sess=None, obs_dim=state_dim, obs_bound=mdp.bounds(), num_actions=num_actions, action_dim=action_dim, action_bound=action_bound, num_options=2, init_all=True, high_method=args.highmethod, low_method=args.lowmethod, f_func=args.ffunction, batch_size=1, buffer_size=2, option_batch_size=1, option_buffer_size=2, name='1op-initall')
    # oagent.add_option(op)

    base = OptionAgent(sess=None,
                       obs_dim=state_dim,
                       obs_bound=mdp.bounds(),
                       num_actions=num_actions,
                       action_dim=action_dim,
                       action_bound=action_bound,
                       num_options=1,
                       high_method=args.highmethod,
                       low_method=args.lowmethod,
                       f_func=args.ffunction,
                       batch_size=args.batchsize,
                       buffer_size=args.buffersize,
                       option_batch_size=args.obatchsize,
                       option_buffer_size=args.obuffersize,
                       name='base')
    ddpg = DDPGAgent(sess=None,
                     obs_dim=state_dim,
                     action_dim=action_dim,
                     action_bound=action_bound,
                     buffer_size=args.buffersize,
                     batch_size=args.batchsize,
                     name='ddpg')

    agents = []
    # agents.append(oagent)
    agents.append(base)
    agents.append(ddpg)

    mdp.reset()

    run_agents_on_mdp(agents,
                      mdp,
                      episodes=args.nepisodes,
                      steps=args.nsteps,
                      instances=args.ninstances,
                      cumulative_plot=True)
Example #13
def main(open_plot=True):

    args = arguments()

    # Random seeds
    np.random.seed(1234)
    tf.set_random_seed(args.rseed)

    print('task=', args.task)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(
        args)

    # TODO: Train an option using the trajectories sampled by itself.

    op = OptionWrapper(sess=None,
                       experience_buffer=None,
                       obs_dim=state_dim,
                       obs_bound=mdp.bounds(),
                       num_actions=num_actions,
                       action_dim=action_dim,
                       action_bound=action_bound,
                       low_method=args.lowmethod,
                       f_func=args.ffunction,
                       n_units=args.ffuncnunit,
                       restore=True,
                       init_all=args.initall,
                       reversed_dir=args.reverse,
                       name='option' + str(args.noptions) + '_' +
                       str(args.ffuncnunit) + '_' + str(args.rseed))

    # if args.reverse:
    #     op.restore('./vis/' + args.task + 'option' + str(args.noptions) + 'rev_' + str(args.ffuncnunit) + '_' + str(args.rseed))
    # else:
    op.restore('./vis/' + args.task + 'option' + str(args.noptions) + '_' +
               str(args.ffuncnunit) + '_' + str(args.rseed))

    op.reversed_dir = args.reverse

    # TODO: Shouldn't we train the policy based on its own sample frequency?
    if args.restoretraj:
        if args.trajdir == '__default':
            args.trajdir = './vis/' + args.task + 'option' + str(
                args.noptions - 1) + '_' + str(args.ffuncnunit) + '_' + str(
                    args.rseed) + '/' + 'low_traj'

        print('restoring buffer from ' + args.trajdir)
        bfr = ExperienceBuffer()
        # if args.reverse:
        #     bfr.restore('./vis/' + args.task + 'option' + str(args.noptions - 1) + 'rev_' + str(args.rseed) + '/' + 'low_traj')
        # else:
        bfr.restore(args.trajdir)

        bfr_size = bfr.size()
        print('bfr_size=', bfr_size)  # TODO: parameter?
    else:
        _, bfr = sample_option_trajectories(mdp,
                                            args,
                                            noptions=args.noptions - 1)
        bfr_size = bfr.size()
        print('bfr_size=', bfr_size)

    _, _, r, _, _ = bfr.sample(32)
    print('rewards=', r)

    for _ in range(args.sptrainingstep):
        op.train(bfr, batch_size=min(128, bfr_size))

    if args.reverse:
        op.save('./vis/' + args.task + 'option' + str(args.noptions) + 'rev_' +
                str(args.ffuncnunit) + '_' + str(args.rseed))
    else:
        op.save('./vis/' + args.task + 'option' + str(args.noptions) + '_' +
                str(args.ffuncnunit) + '_' + str(args.rseed))
Example #14
def main(open_plot=True):

    args = arguments()

    # Random seeds
    np.random.seed(args.rseed)
    tf.set_random_seed(args.rseed)

    print('task=', args.task)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(
        args)

    #################################
    # 1. Retrieve trajectories
    #################################
    if args.trajdir == '__default':
        prefix = '.'
    else:
        prefix = args.trajdir

    if args.exp == "generate" or args.exp == "train":
        pathnop = str(args.noptions - 1)
    else:
        pathnop = str(args.noptions)

    # if args.reverse:
    #     dirop = 'rev_'
    # else:
    #     dirop = '_'
    dirop = '_'

    # pathdir: directory for the trajectories
    # opdir  : directory for the option
    pathdir = prefix + '/vis/' + args.task + 'option' + pathnop + dirop + str(
        args.ffuncnunit) + '_' + str(args.rseed)

    opdir = prefix + '/vis/' + args.task + 'option' + str(
        args.noptions) + dirop + str(args.ffuncnunit) + '_' + str(args.rseed)

    if args.saveimage:
        lowbfr_path = pathdir + '/low_traj_img'
        bfr_path = pathdir + '/traj_img'
    elif args.savecmp:
        lowbfr_path = pathdir + '/low_traj_sa'
        bfr_path = pathdir + '/traj_sa'  # matches the '/traj_sa' path written in step 5 below
    else:
        lowbfr_path = pathdir + '/low_traj'
        bfr_path = pathdir + '/traj'

    bfrexp = ["vistraj", "visterm", "visvis", "visfval"]
    bfrexp_ = bfrexp + ["train"]
    if args.exp == "generate":
        print('restoring', bfr_path)
        bfr = ExperienceBuffer()
        if args.savecmp:
            bfr.restore_sa(bfr_path)
        else:
            bfr.restore(bfr_path)
    elif args.exp in bfrexp_:
        if args.exp in bfrexp and args.reverse:
            lowbfr_path = lowbfr_path + 'rev'
        print('restoring', lowbfr_path)
        low_bfr = ExperienceBuffer()
        if args.savecmp:
            low_bfr.restore_sao(lowbfr_path)
        else:
            low_bfr.restore(lowbfr_path)

        mix_traj = False
        if mix_traj:
            low_bfr2 = ExperienceBuffer()
            opdir2 = prefix + '/vis/' + args.task + 'option0' + dirop + str(
                args.ffuncnunit) + '_' + str(args.rseed)
        #     # TODO: savecmp not supported
        #     low_bfr2.restore(opdir2 + '/low_traj')
    else:
        print('No buffer retrieved')

    #################################
    # 2. Retrieve options
    #################################
    # Experiments which require 1 option to retrieve
    oneopexp = ["visop", "visfval", "train"]
    # Multiple options to retrieve (but they are retrieved inside util.py, so let's forget it here)
    # multiopexp = ["sample"]

    if args.exp in oneopexp:
        op = CoveringOption(sess=None,
                            experience_buffer=None,
                            obs_dim=state_dim,
                            obs_bound=mdp.bounds(),
                            num_actions=num_actions,
                            action_dim=action_dim,
                            action_bound=action_bound,
                            low_method=args.lowmethod,
                            f_func=args.ffunction,
                            n_units=args.ffuncnunit,
                            init_all=args.initall,
                            init_around_goal=args.init_around_goal,
                            init_dist=args.init_dist,
                            term_dist=args.term_dist,
                            restore=True,
                            name='option' + str(args.noptions) + '_' +
                            str(args.ffuncnunit) + '_' + str(args.rseed))

        op.restore(opdir)
    else:
        print('No option retrieved')

    #################################
    # 3. Run experiments
    #################################

    if args.exp == 'sample':
        print('sample')
        bfr, low_bfr = sample_option_trajectories(mdp,
                                                  args,
                                                  noptions=args.noptions)
    elif args.exp == 'generate':
        print('generate_option')
        print('buffersize = ', bfr.size())
        # TODO: option_b_size is the batch size for training f-function.
        op = CoveringOption(sess=None,
                            experience_buffer=bfr,
                            option_b_size=32,
                            sp_training_steps=args.sptrainingstep,
                            obs_dim=state_dim,
                            obs_bound=state_bound,
                            num_actions=num_actions,
                            action_dim=action_dim,
                            action_bound=action_bound,
                            low_method=args.lowmethod,
                            f_func=args.ffunction,
                            n_units=args.ffuncnunit,
                            init_all=args.initall,
                            reversed_dir=args.reverse,
                            init_around_goal=args.init_around_goal,
                            init_dist=args.init_dist,
                            term_dist=args.term_dist,
                            restore=None,
                            name='option' + str(args.noptions) + '_' +
                            str(args.ffuncnunit) + '_' + str(args.rseed))

    elif args.exp == 'train':
        print('train_option')
        op.reversed_dir = args.reverse
        _, _, r, _, _ = low_bfr.sample(32)
        print('background rewards=', r)
        for _ in range(args.sptrainingstep):
            op.train(low_bfr, batch_size=min(args.batchsize, low_bfr.size()))
    elif args.exp == 'evaloff' or args.exp == 'evalon':
        print('evaloff')
        agent_name = str(args.noptions) + 'options'
        if args.exp == 'evalon':
            agent_name = agent_name + '-online'

        if args.random_agent:
            oagent = GenerateRandomAgent(num_actions, action_dim, action_bound)
        else:
            oagent = OptionAgent(sess=None,
                                 obs_dim=state_dim,
                                 obs_bound=state_bound,
                                 num_actions=num_actions,
                                 action_dim=action_dim,
                                 action_bound=action_bound,
                                 num_options=1 + args.noptions,
                                 high_method=args.highmethod,
                                 low_method=args.lowmethod,
                                 f_func=args.ffunction,
                                 batch_size=args.batchsize,
                                 buffer_size=args.buffersize,
                                 low_update_freq=args.lowupdatefreq,
                                 option_batch_size=args.obatchsize,
                                 option_buffer_size=args.obuffersize,
                                 high_update_freq=args.highupdatefreq,
                                 init_all=args.initall,
                                 init_around_goal=args.init_around_goal,
                                 init_dist=args.init_dist,
                                 term_dist=args.term_dist,
                                 name=agent_name)
            oagent.reset()

        if args.exp == 'evaloff':
            for nop in range(1, args.noptions + 1):
                op = CoveringOption(sess=None,
                                    experience_buffer=None,
                                    obs_dim=state_dim,
                                    obs_bound=mdp.bounds(),
                                    num_actions=num_actions,
                                    action_dim=action_dim,
                                    action_bound=action_bound,
                                    low_method=args.lowmethod,
                                    f_func=args.ffunction,
                                    init_all=args.initall,
                                    init_around_goal=args.init_around_goal,
                                    init_dist=args.init_dist,
                                    term_dist=args.term_dist,
                                    restore=True,
                                    name='option' + str(nop) + '_' +
                                    str(args.ffuncnunit) + '_' +
                                    str(args.rseed))

                if args.reverse:
                    opdir = prefix + '/vis/' + args.task + 'option' + str(
                        nop) + 'rev_' + str(args.ffuncnunit) + '_' + str(
                            args.rseed)
                else:
                    opdir = prefix + '/vis/' + args.task + 'option' + str(
                        nop) + '_' + str(args.ffuncnunit) + '_' + str(
                            args.rseed)

                op.restore(opdir)
                print('restored option', opdir)
                oagent.add_option(op)
        else:
            print('evalon')
        mdp.reset()
        run_agents_on_mdp([oagent],
                          mdp,
                          episodes=args.nepisodes,
                          steps=args.nsteps,
                          instances=args.ninstances,
                          cumulative_plot=True,
                          verbose=args.verbose)
    else:
        print('No experiments run')

    #################################
    # 4. Plot figures
    #################################
    if args.exp == 'visop':
        plot_op(op, args, mdp, state_bound, opdir + '/eigenfunc.pdf')
    elif args.exp == 'vistraj' or args.exp == 'visterm':
        print(args.exp)
        samples = low_bfr.buffer
        size = low_bfr.size()
        trajectories = []
        cur_o = None
        for i in range(size):
            s, _, _, _, t, o = (samples[i][0], samples[i][1], samples[i][2],
                                samples[i][3], samples[i][4], samples[i][5])
            if cur_o == args.noptions:
                if o == args.noptions and not t and i != size - 1:
                    traj.append(s)
                else:
                    # traj.append(s2)
                    # if args.tasktype == 'pinball':
                    #     t = [s for s in traj if s.x != 0.2 or s.y != 0.2] # TODO: hack to remove the init state.
                    # else:
                    #     t = traj
                    if len(traj) > 10:
                        trajectories.append((i, traj))

                    cur_o = 0
                    traj = []
            else:
                if o == args.noptions:
                    traj = [s]
                    cur_o = args.noptions

        if len(trajectories) == 0:
            print('no trajectories sampled')

        if args.exp == 'visterm':
            # last state of each (index, trajectory) pair
            terms = [traj[1][-1] for traj in trajectories]
            terms = terms[0:min(len(terms), 100)]
            # print('terms=', type(terms))
            print('#terms=', len(terms))
            if args.reverse:
                plot_terms(terms,
                           mdp,
                           args,
                           filename=pathdir + '/' + 'terms' + 'rev')
            else:
                plot_terms(terms, mdp, args, filename=pathdir + '/' + 'terms')
        else:
            t = trajectories[1][1]
            plot_traj(t, mdp, args, filename=pathdir + '/' + 'traj' + str(1))

    elif args.exp == 'visvis':
        print('visvis')
        samples = low_bfr.buffer
        traj = [samples[i][0] for i in range(low_bfr.size())]
        if mix_traj:

            samples2 = low_bfr2.buffer
            traj2 = [
                samples2[i][0]
                for i in range(int(min(low_bfr2.size() / 2,
                                       len(traj) / 2)))
            ]

            traj = traj[:int(len(traj) / 2)] + traj2
        plot_vis(traj, args, mdp, pathdir + '/visitation')
    elif args.exp == 'visfval':
        print('visfval')
    else:
        print('No plots')

    #################################
    # 5. Save the results
    #################################
    if args.exp == 'sample':
        print('save sample')
        if args.reverse:
            dirop = "rev"
        else:
            dirop = ""

        if args.saveimage:
            bfr.save(pathdir + '/traj_img' + dirop)
            low_bfr.save(pathdir + '/low_traj_img' + dirop)
        elif args.savecmp:
            bfr.save_sa(pathdir + '/traj_sa' + dirop)
            low_bfr.save_sao(pathdir + '/low_traj_sa' + dirop)
        else:
            bfr.save(pathdir + '/traj' + dirop)
            low_bfr.save(pathdir + '/low_traj' + dirop)

    elif args.exp == 'evaloff' or args.exp == 'evalon':
        print('save', args.exp)
        options = oagent.options
        for nop in range(1, len(options)):
            opdir = prefix + '/vis/' + args.task + 'option' + str(
                nop) + '_' + str(args.ffuncnunit) + '_' + str(args.rseed)
            if args.exp == 'evalon':
                opdir = opdir + '_online'

            options[nop].save(opdir + '_trained')
        oagent.option_buffer.save(pathdir + '_trained' + '/' + 'traj')
        oagent.experience_buffer.save(pathdir + '_trained' + '/' + 'low_traj')
    elif args.exp == 'generate':
        print('save generate')
        op.save(opdir)
    elif args.exp == 'train':
        print('save train')
        if args.reverse:
            op.save(opdir, rev=True)
        else:
            op.save(opdir)
    else:
        print('No save')