def save(args):
    mdp, obs_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(args)

    agent = DiaynAgent(sess=None, obs_dim=obs_dim, num_actions=num_actions,
                       num_options=args.noptions, action_dim=action_dim,
                       action_bound=action_bound, batch_size=32, update_freq=32,
                       alpha=1.0)
    agent.set_diversity(True)

    run_agents_on_mdp([agent], mdp, episodes=args.snepisodes, steps=args.snsteps,
                      instances=1, cumulative_plot=True)

    if args.trajdir == '__default':
        prefix = '.'
    else:
        prefix = args.trajdir

    agent.save(directory=prefix + '/vis' + '/' + str(args.task) + 'option' +
               str(args.noptions) + 'diayn',
               name='diayn-pretrain')
def main(open_plot=True):
    # TODO: Refactor and combine visualize_visitation, visualize_option,
    #       visualize_option_trajectory?
    # Plot the visitation statistics.
    args = arguments()

    # Random seeds
    np.random.seed(1234)
    tf.set_random_seed(args.rseed)

    print('task=', args.task)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(args)

    if args.restoretraj:
        # bfr = ExperienceBuffer()
        # bfr.restore('./vis/' + args.task + 'option' + str(args.noptions) + '_' + str(args.rseed) + '/' + 'traj')
        low_bfr = ExperienceBuffer()
        if args.reverse:
            low_bfr.restore(args.basedir + '/vis/' + args.task + 'option' +
                            str(args.noptions) + 'rev_' + str(args.ffuncnunit) +
                            '_' + str(args.rseed) + '/' + 'low_traj')
        else:
            low_bfr.restore(args.basedir + '/vis/' + args.task + 'option' +
                            str(args.noptions) + '_' + str(args.ffuncnunit) +
                            '_' + str(args.rseed) + '/' + 'low_traj')
    else:
        _, low_bfr = sample_option_trajectories(mdp, args, noptions=args.noptions)

    # TODO: Print a list of states
    samples = low_bfr.buffer
    size = low_bfr.size()

    traj = [samples[i][0] for i in range(size)]

    if args.reverse:
        plot_visitation(traj, mdp, args,
                        filename=args.basedir + '/vis/' + args.task + 'option' +
                        str(args.noptions) + 'rev_' + str(args.ffuncnunit) + '_' +
                        str(args.rseed) + '/' + 'visitations' + '.pdf')
    else:
        plot_visitation(traj, mdp, args,
                        filename=args.basedir + '/vis/' + args.task + 'option' +
                        str(args.noptions) + '_' + str(args.ffuncnunit) + '_' +
                        str(args.rseed) + '/' + 'visitations' + '.pdf')
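# The scripts in this repository all read and write files under a common layout:
#   <basedir>/vis/<task>option<k>[rev]_<ffuncnunit>_<rseed>/{traj, low_traj, *.pdf}
# A minimal helper sketch of that naming convention (not part of the original
# code; `option_dir` is a hypothetical name introduced only for illustration):
def option_dir(args, noptions, reverse=False):
    """Directory holding the k-th option's networks, trajectories, and plots."""
    tag = 'rev_' if reverse else '_'
    return (args.basedir + '/vis/' + args.task + 'option' + str(noptions) + tag
            + str(args.ffuncnunit) + '_' + str(args.rseed))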
def main(open_plot=True):
    args = arguments()

    # Random seeds
    np.random.seed(1234)
    tf.set_random_seed(args.rseed)

    print('task=', args.task)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(args)

    if args.restoretraj:
        # bfr = ExperienceBuffer()
        # bfr.restore('./vis/' + args.task + 'option' + str(args.noptions) + '_' + str(args.rseed) + '/' + 'traj')
        low_bfr = ExperienceBuffer()
        if args.reverse:
            low_bfr.restore(args.basedir + '/vis/' + args.task + 'option' +
                            str(args.noptions) + 'rev_' + str(args.ffuncnunit) +
                            '_' + str(args.rseed) + '/' + 'low_traj')
        else:
            low_bfr.restore(args.basedir + '/vis/' + args.task + 'option' +
                            str(args.noptions) + '_' + str(args.ffuncnunit) +
                            '_' + str(args.rseed) + '/' + 'low_traj')
    else:
        _, low_bfr = sample_option_trajectories(mdp, args, noptions=args.noptions)
        print('sampled')

    # TODO: Print a list of states
    op = OptionWrapper(sess=None, experience_buffer=None, obs_dim=state_dim,
                       obs_bound=mdp.bounds(), num_actions=num_actions,
                       action_dim=action_dim, action_bound=action_bound,
                       low_method=args.lowmethod, f_func=args.ffunction,
                       n_units=args.ffuncnunit, init_all=args.initall,
                       restore=True,
                       name='option' + str(args.noptions) + '_' +
                            str(args.ffuncnunit) + '_' + str(args.rseed))
    op.restore(args.basedir + '/vis/' + args.task + 'option' + str(args.noptions) +
               '_' + str(args.ffuncnunit) + '_' + str(args.rseed))

    filename = args.basedir + '/vis/' + args.task + 'option' + str(
        args.noptions) + '_' + str(args.ffuncnunit) + '_' + str(
        args.rseed) + '/' + 'fvalues.pdf'
    plot_fvalue(low_bfr, op, filename=filename)
def main():
    args = arguments()

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(args)

    bfr, low_bfr = sample_option_trajectories(mdp, args, noptions=args.noptions)

    # TODO: Trajectories are generated using noptions-1 options.
    if args.reverse:
        bfr.save(args.basedir + '/vis/' + args.task + 'option' + str(args.noptions) +
                 'rev_' + str(args.ffuncnunit) + '_' + str(args.rseed) + '/' + 'traj')
        low_bfr.save(args.basedir + '/vis/' + args.task + 'option' + str(args.noptions) +
                     'rev_' + str(args.ffuncnunit) + '_' + str(args.rseed) + '/' + 'low_traj')
    else:
        bfr.save(args.basedir + '/vis/' + args.task + 'option' + str(args.noptions) +
                 '_' + str(args.ffuncnunit) + '_' + str(args.rseed) + '/' + 'traj')
        low_bfr.save(args.basedir + '/vis/' + args.task + 'option' + str(args.noptions) +
                     '_' + str(args.ffuncnunit) + '_' + str(args.rseed) + '/' + 'low_traj')

    print('bfr size=', bfr.size())
    print('lbfr size=', low_bfr.size())

    if args.task == 'PointMaze-v0':
        s, a, r, s2, t = low_bfr.sample(20)
        for state in s:
            # print('s=', state)
            # TODO: how do we get the X, Y coordinates of the agent?
            print('x,y=', state.data[0], state.data[1])

    if args.task == 'MontezumaRevenge-ram-v0':
        s, a, r, s2, t = low_bfr.sample(20)

        def getByte(ram, row, col):
            # Atari 2600 RAM (128 bytes) is mapped at addresses 0x80-0xFF;
            # `row`/`col` are the two hex digits of the address. Bytes 0xAA/0xAB
            # (indices 42/43) are commonly used as the agent's x/y position in
            # Montezuma's Revenge.
            row = int(row, 16) - 8
            col = int(col, 16)
            return ram[row * 16 + col]

        for state in s:
            x = int(getByte(state.data, 'a', 'a'))
            y = int(getByte(state.data, 'a', 'b'))
            x_img = int(210.0 * (float(x) - 1) / float((9 * 16 + 8) - 1))
            y_img = int(160.0 * (float(y) - (8 * 16 + 6)) / float((15 * 16 + 15) - (8 * 16 + 6)))
            print('(ram) x, y =', x, y)
            print('(img) x, y =', x_img, y_img)
def restore(args):
    mdp, obs_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(args)

    rst = DiaynAgent(sess=None, obs_dim=obs_dim, num_actions=num_actions,
                     action_dim=action_dim, action_bound=action_bound,
                     num_options=args.noptions, batch_size=1, update_freq=1,
                     alpha=1.0)

    # `prefix` mirrors the convention used in save() above (it was left undefined here).
    if args.trajdir == '__default':
        prefix = '.'
    else:
        prefix = args.trajdir

    rst.restore(directory=prefix + '/vis' + '/' + str(args.task) + 'option' +
                str(args.noptions) + 'diayn',
                name='diayn-pretrain')
    rst.set_diversity(False)

    oagent = OptionAgent(sess=None, obs_dim=obs_dim, obs_bound=state_bound,
                         num_actions=num_actions, action_dim=action_dim,
                         action_bound=action_bound,
                         num_options=1 + args.noptions, init_all=args.initall,
                         high_method=args.highmethod, low_method=args.lowmethod,
                         f_func=args.ffunction, batch_size=args.batchsize,
                         buffer_size=args.buffersize,
                         low_update_freq=args.lowupdatefreq,
                         option_batch_size=args.obatchsize,
                         option_buffer_size=args.obuffersize,
                         high_update_freq=args.highupdatefreq,
                         name='diayn' + str(args.noptions))

    for i in range(args.noptions):
        op = DiaynOption(rst, i, args.termprob)
        oagent.add_option(op)

    run_agents_on_mdp([oagent], mdp, episodes=args.nepisodes, steps=args.nsteps,
                      instances=args.ninstances, cumulative_plot=True)
def main(open_plot=True):
    args = arguments()

    # Random seeds
    np.random.seed(1234)
    tf.set_random_seed(args.rseed)

    print('task=', args.task)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(args)

    if args.online:
        # TODO: Think how to solve the restoration for batch normalization.
        op = OptionWrapper(sess=None, experience_buffer=None, obs_dim=state_dim,
                           obs_bound=mdp.bounds(), num_actions=num_actions,
                           action_dim=action_dim, action_bound=action_bound,
                           low_method=args.lowmethod, f_func=args.ffunction,
                           n_units=args.ffuncnunit, init_all=args.initall,
                           restore=True,
                           name='online-option' + str(args.noptions) + '_' +
                                str(args.ffuncnunit))
        op.restore(args.basedir + '/vis/' + args.task + 'option' + str(args.noptions) +
                   '_' + str(args.ffuncnunit) + '_' + str(args.rseed))
        plot_eigenfunction(op, args, xind=0, yind=1,
                           filename=args.basedir + '/vis/' + args.task +
                           'online-option' + str(args.noptions) + '_' +
                           str(args.ffuncnunit) + '/' + 'eigenfunc.pdf')
    else:
        op = OptionWrapper(sess=None, experience_buffer=None, obs_dim=state_dim,
                           obs_bound=mdp.bounds(), num_actions=num_actions,
                           action_dim=action_dim, action_bound=action_bound,
                           low_method=args.lowmethod, f_func=args.ffunction,
                           n_units=args.ffuncnunit, init_all=args.initall,
                           restore=True,
                           name='option' + str(args.noptions) + '_' +
                                str(args.ffuncnunit) + '_' + str(args.rseed))
        op.restore(args.basedir + '/vis/' + args.task + 'option' + str(args.noptions) +
                   '_' + str(args.ffuncnunit) + '_' + str(args.rseed))
        plot_eigenfunction(op, args, xind=0, yind=1,
                           filename=args.basedir + '/vis/' + args.task + 'option' +
                           str(args.noptions) + '_' + str(args.ffuncnunit) + '_' +
                           str(args.rseed) + '/' + 'eigenfunc.pdf')
def plot_eigenfunction(op, args, xind=0, yind=1, filename='visualize_ef.pdf'):
    # Pinball
    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(args)

    n_samples = 2000

    low_bound = state_bound[0]
    up_bound = state_bound[1]

    if args.task == 'AntMaze-v0' or args.task == 'PointMaze-v0':
        low_bound[xind] = 0.0
        low_bound[yind] = 0.0
        up_bound[xind] = 8.0 * 3.0
        up_bound[yind] = 8.0 * 3.0
    if args.tasktype == 'atari':
        low_bound[xind] = 0.0
        low_bound[yind] = 0.0
        up_bound[xind] = 160.0
        up_bound[yind] = 210.0

    xs = []
    ys = []
    fs = []

    # if np.isinf(low_bound).any() or np.isinf(up_bound).any():
    #     bfr = sample_option_trajectories(mdp, args, noptions=0)
    #     ss, _, _, _, _ = bfr.sample(n_samples)
    #     max_x = float('-inf')
    #     min_x = float('inf')
    #     max_y = float('-inf')
    #     min_y = float('inf')
    #     for i in range(n_samples):
    #         x = ss[i].data[xind]
    #         y = ss[i].data[yind]
    #         max_x = max(x, max_x)
    #         min_x = min(x, min_x)
    #         max_y = max(y, max_y)
    #         min_y = min(y, min_y)
    #     low_bound[xind] = min_x
    #     up_bound[xind] = max_x
    #     low_bound[yind] = min_y
    #     up_bound[yind] = max_y

    # TODO: Implement a script to plot the f-value of the states visited by the
    #       agent instead of sampling uniformly at random.
    if args.restoretraj:
        print('restoring buffer from ' + './vis/' + args.task + 'option' +
              str(args.noptions - 1) + '_' + str(args.ffuncnunit) + '_' +
              str(args.rseed) + '/' + 'traj')
        bfr = ExperienceBuffer()
        bfr.restore(args.basedir + '/vis/' + args.task + 'option' +
                    str(args.noptions - 1) + '_' + str(args.ffuncnunit) + '_' +
                    str(args.rseed) + '/' + 'traj')
        bfr_size = bfr.size()
        print('bfr_size=', bfr_size)  # TODO: parameter?

        samples, _, _, _, _ = bfr.sample(n_samples)
        # samples = [bfr.buffer[i][0] for i in range(min(bfr.size(), n_samples))]

        if args.task == 'MontezumaRevenge-ram-v0':
            feature = Monte()
            xs = [feature.feature(s, 0)[0] for s in samples]
            ys = [feature.feature(s, 0)[1] for s in samples]
        elif args.ffunction == 'nns':
            feature = Subset(state_dim, [0, 1])
            xs = [feature.feature(s, 0)[0] for s in samples]
            ys = [feature.feature(s, 0)[1] for s in samples]
        else:
            xs = [s.data[xind] for s in samples]
            ys = [s.data[yind] for s in samples]
    else:
        xs = [random.uniform(low_bound[xind], up_bound[xind]) for _ in range(n_samples)]
        ys = [random.uniform(low_bound[yind], up_bound[yind]) for _ in range(n_samples)]

    fs = []
    for i in range(len(xs)):
        if args.task == 'MontezumaRevenge-ram-v0':
            obs = np.array([xs[i], ys[i]])
            obs = np.reshape(obs, (1, 2))
            f_value = op.f_function.f_from_features(obs)[0][0]
        elif args.ffunction == 'nns':
            obs = np.array([xs[i], ys[i]])
            obs = np.reshape(obs, (1, 2))
            f_value = op.f_function.f_from_features(obs)[0][0]
        else:
            s = mdp.get_init_state()
            s.data[xind] = xs[i]
            s.data[yind] = ys[i]
            f_value = op.f_function(s)[0][0]
        fs.append(f_value)

    # TODO: What is the best colormap that works for everyone (including
    #       color-blind viewers) while still being appealing to the majority?
    #       'bwr' looks useful, but may be misleading.
    cmap = matplotlib.cm.get_cmap('plasma')
    normalize = matplotlib.colors.Normalize(vmin=min(fs), vmax=max(fs))
    colors = [cmap(normalize(value)) for value in fs]
    # colors_np = np.asarray(colors)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(x=xs, y=ys, c=colors)
    cax, _ = matplotlib.colorbar.make_axes(ax)
    cbar = matplotlib.colorbar.ColorbarBase(cax, cmap=cmap, norm=normalize)

    term_th = op.lower_th
    cax.plot([0, 1], [term_th] * 2, 'k')

    term, nonterm = 0, 0
    for f in fs:
        if f < term_th:
            term += 1
        else:
            nonterm += 1
    print(term, 'terms', nonterm, 'nonterms')

    # TODO: Only for pinball domains. What to do for MuJoCo?
    # Obstacles
    if args.tasktype == 'pinball':
        for obs in mdp.domain.environment.obstacles:
            point_list = obs.points
            xlist = []
            ylist = []
            for p in point_list:
                xlist.append(p[0])
                ylist.append(p[1])
            ax.fill(xlist, ylist, 'k')
    elif args.task == 'PointMaze-v0' or args.task == 'AntMaze-v0':
        # TODO: (x, y) coordinates start at (0, 0).
        #       How are the coordinates signed?
        maze = [[1, 1, 1, 1, 1],
                [1, 0, 0, 0, 1],
                [1, 1, 1, 0, 1],
                [1, 0, 0, 0, 1],
                [1, 1, 1, 1, 1]]
        scale = 8.0
        for y in range(5):
            for x in range(5):
                if maze[y][x] == 1:
                    # We decrement x and y because the (0, 0)-coordinate is set
                    # at the (1, 1) position in the maze.
                    xbase, ybase = scale * (x - 1), scale * (y - 1)
                    xlist = [xbase, xbase + scale, xbase + scale, xbase]
                    ylist = [ybase, ybase, ybase + scale, ybase + scale]
                    ax.fill(xlist, ylist, 'k')
    elif args.task == 'MontezumaRevenge-ram-v0':
        # TODO: Show the background of Montezuma's Revenge?
        img = imread('./montezuma.jpg')
        ax.imshow(img, zorder=0, extent=[0, 160, 0, 210])

    plt.savefig(filename)
    plt.close()
def main(open_plot=True):
    # TODO: Accept a set of options instead of just one
    args = arguments()

    # Random seeds
    np.random.seed(args.rseed)
    tf.set_random_seed(args.rseed)

    print('tasktype=', args.tasktype)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(args)

    oagent = OptionAgent(sess=None, obs_dim=state_dim, obs_bound=state_bound,
                         num_actions=num_actions, action_dim=action_dim,
                         action_bound=action_bound,
                         num_options=1 + args.noptions, init_all=args.initall,
                         high_method=args.highmethod, low_method=args.lowmethod,
                         f_func=args.ffunction, batch_size=args.batchsize,
                         buffer_size=args.buffersize,
                         low_update_freq=args.lowupdatefreq,
                         option_batch_size=args.obatchsize,
                         option_buffer_size=args.obuffersize,
                         high_update_freq=args.highupdatefreq,
                         name='op')
    oagent.reset()

    for nop in range(1, args.noptions + 1):
        op = OptionWrapper(sess=None, experience_buffer=None, obs_dim=state_dim,
                           obs_bound=mdp.bounds(), num_actions=num_actions,
                           action_dim=action_dim, action_bound=action_bound,
                           low_method=args.lowmethod, f_func=args.ffunction,
                           init_all=args.initall, restore=True,
                           name='option' + str(nop) + '_' + str(args.ffuncnunit) +
                                '_' + str(args.rseed))
        if args.trajdir == '__default':
            if args.reverse:
                opdir = './vis/' + args.task + 'option' + str(
                    nop) + 'rev_' + str(args.ffuncnunit) + '_' + str(args.rseed)
            else:
                opdir = './vis/' + args.task + 'option' + str(nop) + '_' + str(
                    args.ffuncnunit) + '_' + str(args.rseed)
        else:
            # Only one option can be restored from nonstandard locations
            assert (args.noptions == 1)
            opdir = args.trajdir
        op.restore(opdir)
        print('restored option', opdir)
        # print('upper_th=', op.upper_th)
        oagent.add_option(op)

    agents = []
    agents.append(oagent)

    if args.base:
        base = OptionAgent(sess=None, obs_dim=state_dim, obs_bound=state_bound,
                           num_actions=num_actions, action_dim=action_dim,
                           action_bound=action_bound, num_options=1,
                           high_method=args.highmethod, low_method=args.lowmethod,
                           f_func=args.ffunction, batch_size=args.batchsize,
                           buffer_size=args.buffersize,
                           low_update_freq=args.lowupdatefreq,
                           option_batch_size=1, option_buffer_size=2,
                           high_update_freq=10000000, init_all=args.initall,
                           name='base')
        agents.append(base)

    mdp.reset()

    # TODO: We need to count the number of times the agent reached the goal state.
    #       Because from the cumulative rewards, it is hard to see if the agent is
    #       performing as intended.
    #       Possible solutions (see the previous works first):
    #       1. Plot the number of times the agent reached the goal.
    #       2. Give a positive reward when it reached the goal.
    run_agents_on_mdp(agents, mdp, episodes=args.nepisodes, steps=args.nsteps,
                      instances=args.ninstances, cumulative_plot=True)

    options = oagent.options
    for nop in range(1, len(options)):
        if args.trajdir == '__default':
            opdir = './vis/' + args.task + 'option' + str(nop) + '_' + str(
                args.ffuncnunit) + '_' + str(args.rseed)
        else:
            assert (args.noptions == 1)
            opdir = args.trajdir
        # print('upper=', options[nop].upper_th)
        options[nop].save(opdir + '_trained')

    if args.trajdir == '__default':
        bufdir = './vis/' + args.task + 'option' + str(
            args.noptions) + '_' + str(args.ffuncnunit) + '_' + str(args.rseed)
    else:
        bufdir = args.trajdir
    oagent.option_buffer.save(bufdir + '_trained' + '/' + 'traj')
    oagent.experience_buffer.save(bufdir + '_trained' + '/' + 'low_traj')
def main(open_plot=True):
    args = arguments()

    # Random seeds
    np.random.seed(1234)
    tf.set_random_seed(args.rseed)

    print('task=', args.task)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(args)

    if args.restoretraj:
        # bfr = ExperienceBuffer()
        # bfr.restore('./vis/' + args.task + 'option' + str(args.noptions) + '_' + str(args.rseed) + '/' + 'traj')
        low_bfr = ExperienceBuffer()
        if args.reverse:
            low_bfr.restore(args.basedir + '/vis/' + args.task + 'option' +
                            str(args.noptions) + 'rev_' + str(args.ffuncnunit) +
                            '_' + str(args.rseed) + '/' + 'low_traj')
        else:
            low_bfr.restore(args.basedir + '/vis/' + args.task + 'option' +
                            str(args.noptions) + '_' + str(args.ffuncnunit) +
                            '_' + str(args.rseed) + '/' + 'low_traj')
    else:
        _, low_bfr = sample_option_trajectories(mdp, args, noptions=args.noptions)
        print('sampled')

    # TODO: Print a list of states
    samples = low_bfr.buffer
    size = low_bfr.size()
    print('size=', size)

    trajectories = []
    traj = []
    cur_o = None
    for i in range(size):
        # TODO: something wrong is happening in the trajectory. Why?
        s, a, r, s2, t, o = samples[i][0], samples[i][1], samples[i][2], \
            samples[i][3], samples[i][4], samples[i][5]
        # assert(t is False)
        # print('o=', o, ', t=', t)
        if cur_o == args.noptions:
            if o == args.noptions and not t and i != size - 1:
                traj.append(s)
            else:
                # traj.append(s2)
                if args.tasktype == 'pinball':
                    # TODO: hack to remove the init state.
                    filtered = [st for st in traj if st.x != 0.2 or st.y != 0.2]
                else:
                    filtered = traj
                # for j, st in enumerate(filtered):
                #     if 0.01466 <= st.data[0] and st.data[0] <= 0.01467:
                #         filtered.remove(st)
                #         # break
                #     print(st.data[0])
                trajectories.append((i, filtered))
                cur_o = 0
                traj = []
                # TODO: what is the best way to print these figures out?
                # break
        else:
            if o == args.noptions:
                traj = [s]
                cur_o = args.noptions

    for traj in trajectories:
        i = traj[0]
        t = traj[1]
        print(i, ' traj length=', len(t))
        if args.reverse:
            plot_trajectory(t, mdp, args,
                            filename=args.basedir + '/vis/' + args.task + 'option' +
                            str(args.noptions) + 'rev_' + str(args.ffuncnunit) +
                            '_' + str(args.rseed) + '/' + 'traj' + str(i) + '.pdf')
        else:
            plot_trajectory(t, mdp, args,
                            filename=args.basedir + '/vis/' + args.task + 'option' +
                            str(args.noptions) + '_' + str(args.ffuncnunit) +
                            '_' + str(args.rseed) + '/' + 'traj' + str(i) + '.pdf')
def main(open_plot=True):
    # TODO: Accept a set of options and generate a new option based on them.
    args = arguments()

    np.random.seed(1234)
    # tf.set_random_seed(args.rseed)
    # tf.set_random_seed(5678)
    # tf.set_random_seed(5408)
    tf.set_random_seed(2345)

    print('tasktype=', args.tasktype)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(args)

    # We generate the k-th option based on the previous k-1 options.
    if args.restoretraj:
        bfr = ExperienceBuffer()
        if args.reverse:
            print('restoring buffer from ' + './vis/' + args.task + 'option' +
                  str(args.noptions - 1) + 'rev_' + str(args.ffuncnunit) + '_' +
                  str(args.rseed) + '/' + 'traj')
            bfr.restore(args.basedir + '/vis/' + args.task + 'option' +
                        str(args.noptions - 1) + 'rev_' + str(args.ffuncnunit) +
                        '_' + str(args.rseed) + '/' + 'traj')
        else:
            print('restoring buffer from ' + './vis/' + args.task + 'option' +
                  str(args.noptions - 1) + '_' + str(args.ffuncnunit) + '_' +
                  str(args.rseed) + '/' + 'traj')
            bfr.restore(args.basedir + '/vis/' + args.task + 'option' +
                        str(args.noptions - 1) + '_' + str(args.ffuncnunit) + '_' +
                        str(args.rseed) + '/' + 'traj')
        bfr_size = bfr.size()
        print('bfr_size=', bfr_size)  # TODO: parameter?
    else:
        bfr, _ = sample_option_trajectories(mdp, args, noptions=args.noptions - 1)
        bfr_size = bfr.size()
        print('bfr_size=', bfr_size)

    # TODO: In graph theory, inserting an edge results in a significant change to
    #       the topology. However, adding just one transition sample to the NN does
    #       not seem to change it much. Can we tackle this problem other than by
    #       sampling the trajectories again?
    op = OptionWrapper(sess=None, experience_buffer=bfr,
                       option_b_size=min(32, bfr_size),
                       sp_training_steps=args.sptrainingstep,
                       obs_dim=state_dim, obs_bound=state_bound,
                       num_actions=num_actions, action_dim=action_dim,
                       action_bound=action_bound, low_method=args.lowmethod,
                       f_func=args.ffunction, n_units=args.ffuncnunit,
                       init_all=args.initall, restore=None,
                       reversed_dir=args.reverse,
                       name='option' + str(args.noptions) + '_' +
                            str(args.ffuncnunit) + '_' + str(args.rseed))

    # if args.train:
    #     op.train(bfr, batch_size=args.snepisodes * args.snsteps)

    if args.reverse:
        filename = args.basedir + '/vis/' + args.task + 'option' + str(
            args.noptions) + 'rev_' + str(args.ffuncnunit) + '_' + str(args.rseed)
    else:
        filename = args.basedir + '/vis/' + args.task + 'option' + str(
            args.noptions) + '_' + str(args.ffuncnunit) + '_' + str(args.rseed)
    op.save(filename)
def main(open_plot=True):
    args = arguments()

    # Random seeds
    np.random.seed(args.rseed)
    tf.set_random_seed(args.rseed)

    print('tasktype=', args.tasktype)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(args)

    oagent = OptionAgent(sess=None, obs_dim=state_dim, obs_bound=state_bound,
                         num_actions=num_actions, action_dim=action_dim,
                         action_bound=action_bound, num_options=args.noptions,
                         init_all=args.initall, high_method=args.highmethod,
                         low_method=args.lowmethod, f_func=args.ffunction,
                         batch_size=args.batchsize, buffer_size=args.buffersize,
                         option_batch_size=args.obatchsize,
                         option_buffer_size=args.obuffersize,
                         option_freq=args.ofreq, option_min_steps=args.ominsteps,
                         name=str(args.noptions) + 'op-initall')

    agents = []
    agents.append(oagent)

    if args.base:
        base = OptionAgent(sess=None, obs_dim=state_dim, obs_bound=state_bound,
                           num_actions=num_actions, action_dim=action_dim,
                           action_bound=action_bound, num_options=1,
                           high_method=args.highmethod, low_method=args.lowmethod,
                           f_func=args.ffunction, batch_size=args.batchsize,
                           buffer_size=args.buffersize, option_batch_size=1,
                           option_buffer_size=2, init_all=args.initall,
                           name='base')
        agents.append(base)

    mdp.reset()

    run_agents_on_mdp(agents, mdp, episodes=args.nepisodes, steps=args.nsteps,
                      instances=args.ninstances, cumulative_plot=True)

    # TODO: Save the options learned by the agent
    options = oagent.generated_options[1]
    print('options=', options)
    for i, op in enumerate(options):
        if i == 0:
            continue
        op.save('./vis/' + args.task + 'online-option' + str(i) + '_' + str(args.rseed))
def main(open_plot=True):
    rseed = 1234  # 5678

    # Random seeds
    np.random.seed(1234)
    tf.set_random_seed(rseed)

    parser = argparse.ArgumentParser()

    # pinball files: pinball_box.cfg, pinball_empty.cfg, pinball_hard_single.cfg,
    #                pinball_medium.cfg, pinball_simple_single.cfg
    # Parameters for the task
    parser.add_argument('--tasktype', type=str, default='pinball')
    parser.add_argument('--task', type=str, default='pinball_empty.cfg')
    parser.add_argument('--base', action='store_true')
    parser.add_argument('--nepisodes', type=int, default=100)
    parser.add_argument('--nsteps', type=int, default=200)
    parser.add_argument('--buffersize', type=int, default=512)
    parser.add_argument('--batchsize', type=int, default=128)
    parser.add_argument('--obuffersize', type=int, default=512)
    parser.add_argument('--obatchsize', type=int, default=128)
    parser.add_argument('--highmethod', type=str, default='linear')
    parser.add_argument('--lowmethod', type=str, default='linear')
    parser.add_argument('--ffunction', type=str, default='fourier')

    # NOTE: --snepisodes, --snsteps, and --ninstances are referenced below but were
    #       missing from the parser; the defaults here are assumptions.
    parser.add_argument('--snepisodes', type=int, default=100)
    parser.add_argument('--snsteps', type=int, default=200)
    parser.add_argument('--ninstances', type=int, default=1)

    # Parameters for the Agent
    parser.add_argument('--noptions', type=int, default=5)
    # (5 = 1 for primitive actions and 4 covering options.)

    # Visualization
    parser.add_argument('--render', action='store_true')

    args = parser.parse_args()

    print('tasktype=', args.tasktype)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(args)

    ##############################################
    # Generate the f-function
    ##############################################
    if args.snepisodes == 0:
        op = OptionWrapper(sess=None, experience_buffer=None, obs_dim=state_dim,
                           obs_bound=mdp.bounds(), num_actions=num_actions,
                           action_dim=action_dim, action_bound=action_bound,
                           low_method=args.lowmethod, f_func=args.ffunction,
                           restore=True, name='option' + str(rseed))
        op.restore('./vis/' + args.tasktype + 'option' + str(rseed))
    else:
        bfr = sample_trajectories(mdp, args)
        buf_size = args.snepisodes * args.snsteps
        op = OptionWrapper(sess=None, experience_buffer=bfr, option_b_size=buf_size,
                           obs_dim=state_dim, obs_bound=mdp.bounds(),
                           num_actions=num_actions, action_dim=action_dim,
                           action_bound=action_bound, low_method=args.lowmethod,
                           f_func=args.ffunction, name='option' + str(rseed))

        ##############################################
        # Train the option policy
        ##############################################
        buffer_size = args.snsteps * args.snepisodes
        op.train(bfr, batch_size=buffer_size)

        op.save('./vis/' + args.tasktype + 'option' + str(rseed))
        exit(0)

    # plot_option(op, './vis/' + 'option' + str(rseed) + '/' + 'vis.pdf')

    ##############################################
    # Evaluate the generated option
    ##############################################
    # print('op.f_function', op.f_function)

    # oagent = OptionAgent(sess=None, obs_dim=state_dim, num_actions=len(mdp.get_actions()), num_options=2, batch_size=1, buffer_size=2, option_batch_size=1, option_buffer_size=2, name='1op')
    # oagent = OptionAgent(sess=None, obs_dim=state_dim, obs_bound=mdp.bounds(), num_actions=num_actions, action_dim=action_dim, action_bound=action_bound, num_options=2, init_all=True, high_method=args.highmethod, low_method=args.lowmethod, f_func=args.ffunction, batch_size=1, buffer_size=2, option_batch_size=1, option_buffer_size=2, name='1op-initall')
    # oagent.add_option(op)

    base = OptionAgent(sess=None, obs_dim=state_dim, obs_bound=mdp.bounds(),
                       num_actions=num_actions, action_dim=action_dim,
                       action_bound=action_bound, num_options=1,
                       high_method=args.highmethod, low_method=args.lowmethod,
                       f_func=args.ffunction, batch_size=args.batchsize,
                       buffer_size=args.buffersize,
                       option_batch_size=args.obatchsize,
                       option_buffer_size=args.obuffersize, name='base')
    ddpg = DDPGAgent(sess=None, obs_dim=state_dim, action_dim=action_dim,
                     action_bound=action_bound, buffer_size=args.buffersize,
                     batch_size=args.batchsize, name='ddpg')

    agents = []
    # agents.append(oagent)
    agents.append(base)
    agents.append(ddpg)

    mdp.reset()

    run_agents_on_mdp(agents, mdp, episodes=args.nepisodes, steps=args.nsteps,
                      instances=args.ninstances, cumulative_plot=True)
def main(open_plot=True):
    args = arguments()

    # Random seeds
    np.random.seed(1234)
    tf.set_random_seed(args.rseed)

    print('task=', args.task)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(args)

    # TODO: Train an option using the trajectories sampled by itself.
    op = OptionWrapper(sess=None, experience_buffer=None, obs_dim=state_dim,
                       obs_bound=mdp.bounds(), num_actions=num_actions,
                       action_dim=action_dim, action_bound=action_bound,
                       low_method=args.lowmethod, f_func=args.ffunction,
                       n_units=args.ffuncnunit, restore=True,
                       init_all=args.initall, reversed_dir=args.reverse,
                       name='option' + str(args.noptions) + '_' +
                            str(args.ffuncnunit) + '_' + str(args.rseed))

    # if args.reverse:
    #     op.restore('./vis/' + args.task + 'option' + str(args.noptions) + 'rev_' + str(args.ffuncnunit) + '_' + str(args.rseed))
    # else:
    op.restore('./vis/' + args.task + 'option' + str(args.noptions) + '_' +
               str(args.ffuncnunit) + '_' + str(args.rseed))

    op.reversed_dir = args.reverse

    # TODO: Shouldn't we train the policy based on its own sample frequency?
    if args.restoretraj:
        if args.trajdir == '__default':
            args.trajdir = './vis/' + args.task + 'option' + str(
                args.noptions - 1) + '_' + str(args.ffuncnunit) + '_' + str(
                args.rseed) + '/' + 'low_traj'
        print('restoring buffer from ' + args.trajdir)
        bfr = ExperienceBuffer()
        # if args.reverse:
        #     bfr.restore('./vis/' + args.task + 'option' + str(args.noptions - 1) + 'rev_' + str(args.rseed) + '/' + 'low_traj')
        # else:
        bfr.restore(args.trajdir)
        bfr_size = bfr.size()
        print('bfr_size=', bfr_size)  # TODO: parameter?
    else:
        _, bfr = sample_option_trajectories(mdp, args, noptions=args.noptions - 1)
        bfr_size = bfr.size()
        print('bfr_size=', bfr_size)

    _, _, r, _, _ = bfr.sample(32)
    print('rewards=', r)

    for _ in range(args.sptrainingstep):
        op.train(bfr, batch_size=min(128, bfr_size))

    if args.reverse:
        op.save('./vis/' + args.task + 'option' + str(args.noptions) + 'rev_' +
                str(args.ffuncnunit) + '_' + str(args.rseed))
    else:
        op.save('./vis/' + args.task + 'option' + str(args.noptions) + '_' +
                str(args.ffuncnunit) + '_' + str(args.rseed))
def main(open_plot=True):
    args = arguments()

    # Random seeds
    np.random.seed(args.rseed)
    tf.set_random_seed(args.rseed)

    print('task=', args.task)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(args)

    #################################
    # 1. Retrieve trajectories
    #################################
    if args.trajdir == '__default':
        prefix = '.'
    else:
        prefix = args.trajdir

    if args.exp == "generate" or args.exp == "train":
        pathnop = str(args.noptions - 1)
    else:
        pathnop = str(args.noptions)

    # if args.reverse:
    #     dirop = 'rev_'
    # else:
    #     dirop = '_'
    dirop = '_'

    # pathdir: directory for the trajectories
    # opdir  : directory for the option
    pathdir = prefix + '/vis/' + args.task + 'option' + pathnop + dirop + str(
        args.ffuncnunit) + '_' + str(args.rseed)
    opdir = prefix + '/vis/' + args.task + 'option' + str(
        args.noptions) + dirop + str(args.ffuncnunit) + '_' + str(args.rseed)

    if args.saveimage:
        lowbfr_path = pathdir + '/low_traj_img'
        bfr_path = pathdir + '/traj_img'
    elif args.savecmp:
        lowbfr_path = pathdir + '/low_traj_sa'
        # Matches the '/traj_sa' name used when the buffer is saved below.
        bfr_path = pathdir + '/traj_sa'
    else:
        lowbfr_path = pathdir + '/low_traj'
        bfr_path = pathdir + '/traj'

    bfrexp = ["vistraj", "visterm", "visvis", "visfval"]
    bfrexp_ = bfrexp + ["train"]

    if args.exp == "generate":
        print('restoring', bfr_path)
        bfr = ExperienceBuffer()
        if args.savecmp:
            bfr.restore_sa(bfr_path)
        else:
            bfr.restore(bfr_path)
    elif args.exp in bfrexp_:
        if args.exp in bfrexp and args.reverse:
            lowbfr_path = lowbfr_path + 'rev'
        print('restoring', lowbfr_path)
        low_bfr = ExperienceBuffer()
        if args.savecmp:
            low_bfr.restore_sao(lowbfr_path)
        else:
            low_bfr.restore(lowbfr_path)

        mix_traj = False
        if mix_traj:
            low_bfr2 = ExperienceBuffer()
            opdir2 = prefix + '/vis/' + args.task + 'option0' + dirop + str(
                args.ffuncnunit) + '_' + str(args.rseed)
            # TODO: savecmp not supported
            # low_bfr2.restore(opdir2 + '/low_traj')
    else:
        print('No buffer retrieved')

    #################################
    # 2. Retrieve options
    #################################
    # Experiments which require one option to be retrieved
    oneopexp = ["visop", "visfval", "train"]
    # Multiple options to retrieve (but they are retrieved inside util.py, so we
    # ignore them here)
    # multiopexp = ["sample"]
    if args.exp in oneopexp:
        op = CoveringOption(sess=None, experience_buffer=None, obs_dim=state_dim,
                            obs_bound=mdp.bounds(), num_actions=num_actions,
                            action_dim=action_dim, action_bound=action_bound,
                            low_method=args.lowmethod, f_func=args.ffunction,
                            n_units=args.ffuncnunit, init_all=args.initall,
                            init_around_goal=args.init_around_goal,
                            init_dist=args.init_dist, term_dist=args.term_dist,
                            restore=True,
                            name='option' + str(args.noptions) + '_' +
                                 str(args.ffuncnunit) + '_' + str(args.rseed))
        op.restore(opdir)
    else:
        print('No option retrieved')

    #################################
    # 3. Run experiments
    #################################
    if args.exp == 'sample':
        print('sample')
        bfr, low_bfr = sample_option_trajectories(mdp, args, noptions=args.noptions)
    elif args.exp == 'generate':
        print('generate_option')
        print('buffersize = ', bfr.size())
        # TODO: option_b_size is the batch size for training the f-function.
        op = CoveringOption(sess=None, experience_buffer=bfr, option_b_size=32,
                            sp_training_steps=args.sptrainingstep,
                            obs_dim=state_dim, obs_bound=state_bound,
                            num_actions=num_actions, action_dim=action_dim,
                            action_bound=action_bound, low_method=args.lowmethod,
                            f_func=args.ffunction, n_units=args.ffuncnunit,
                            init_all=args.initall, reversed_dir=args.reverse,
                            init_around_goal=args.init_around_goal,
                            init_dist=args.init_dist, term_dist=args.term_dist,
                            restore=None,
                            name='option' + str(args.noptions) + '_' +
                                 str(args.ffuncnunit) + '_' + str(args.rseed))
    elif args.exp == 'train':
        print('train_option')
        op.reversed_dir = args.reverse
        _, _, r, _, _ = low_bfr.sample(32)
        print('background rewards=', r)
        for _ in range(args.sptrainingstep):
            op.train(low_bfr, batch_size=min(args.batchsize, low_bfr.size()))
    elif args.exp == 'evaloff' or args.exp == 'evalon':
        print('evaloff')
        agent_name = str(args.noptions) + 'options'
        if args.exp == 'evalon':
            agent_name = agent_name + '-online'
        if args.random_agent:
            oagent = GenerateRandomAgent(num_actions, action_dim, action_bound)
        else:
            oagent = OptionAgent(sess=None, obs_dim=state_dim,
                                 obs_bound=state_bound, num_actions=num_actions,
                                 action_dim=action_dim, action_bound=action_bound,
                                 num_options=1 + args.noptions,
                                 high_method=args.highmethod,
                                 low_method=args.lowmethod, f_func=args.ffunction,
                                 batch_size=args.batchsize,
                                 buffer_size=args.buffersize,
                                 low_update_freq=args.lowupdatefreq,
                                 option_batch_size=args.obatchsize,
                                 option_buffer_size=args.obuffersize,
                                 high_update_freq=args.highupdatefreq,
                                 init_all=args.initall,
                                 init_around_goal=args.init_around_goal,
                                 init_dist=args.init_dist,
                                 term_dist=args.term_dist,
                                 name=agent_name)
        oagent.reset()

        if args.exp == 'evaloff':
            for nop in range(1, args.noptions + 1):
                op = CoveringOption(sess=None, experience_buffer=None,
                                    obs_dim=state_dim, obs_bound=mdp.bounds(),
                                    num_actions=num_actions, action_dim=action_dim,
                                    action_bound=action_bound,
                                    low_method=args.lowmethod,
                                    f_func=args.ffunction, init_all=args.initall,
                                    init_around_goal=args.init_around_goal,
                                    init_dist=args.init_dist,
                                    term_dist=args.term_dist, restore=True,
                                    name='option' + str(nop) + '_' +
                                         str(args.ffuncnunit) + '_' + str(args.rseed))
                if args.reverse:
                    opdir = prefix + '/vis/' + args.task + 'option' + str(
                        nop) + 'rev_' + str(args.ffuncnunit) + '_' + str(args.rseed)
                else:
                    opdir = prefix + '/vis/' + args.task + 'option' + str(
                        nop) + '_' + str(args.ffuncnunit) + '_' + str(args.rseed)
                op.restore(opdir)
                print('restored option', opdir)
                oagent.add_option(op)
        else:
            print('evalon')

        mdp.reset()
        run_agents_on_mdp([oagent], mdp, episodes=args.nepisodes,
                          steps=args.nsteps, instances=args.ninstances,
                          cumulative_plot=True, verbose=args.verbose)
    else:
        print('No experiments run')

    #################################
    # 4. Plot figures
    #################################
    if args.exp == 'visop':
        plot_op(op, args, mdp, state_bound, opdir + '/eigenfunc.pdf')
    elif args.exp == 'vistraj' or args.exp == 'visterm':
        print(args.exp)
        samples = low_bfr.buffer
        size = low_bfr.size()

        trajectories = []
        cur_o = None
        for i in range(size):
            s, _, _, _, t, o = samples[i][0], samples[i][1], samples[i][2], \
                samples[i][3], samples[i][4], samples[i][5]
            if cur_o == args.noptions:
                if o == args.noptions and not t and i != size - 1:
                    traj.append(s)
                else:
                    # traj.append(s2)
                    # if args.tasktype == 'pinball':
                    #     # TODO: hack to remove the init state.
                    #     t = [s for s in traj if s.x != 0.2 or s.y != 0.2]
                    # else:
                    #     t = traj
                    if len(traj) > 10:
                        trajectories.append((i, traj))
                    cur_o = 0
                    traj = []
            else:
                if o == args.noptions:
                    traj = [s]
                    cur_o = args.noptions

        if len(trajectories) == 0:
            print('no trajectories sampled')

        if args.exp == 'visterm':
            # Each entry of `trajectories` is an (index, states) pair; take the
            # final state of each trajectory as its termination state.
            terms = [traj[1][-1] for traj in trajectories]
            terms = terms[0:min(len(terms), 100)]
            # print('terms=', type(terms))
            print('#terms=', len(terms))
            if args.reverse:
                plot_terms(terms, mdp, args,
                           filename=pathdir + '/' + 'terms' + 'rev')
            else:
                plot_terms(terms, mdp, args, filename=pathdir + '/' + 'terms')
        else:
            t = trajectories[1][1]
            plot_traj(t, mdp, args, filename=pathdir + '/' + 'traj' + str(1))
    elif args.exp == 'visvis':
        print('visvis')
        samples = low_bfr.buffer
        traj = [samples[i][0] for i in range(low_bfr.size())]
        if mix_traj:
            samples2 = low_bfr2.buffer
            traj2 = [
                samples2[i][0]
                for i in range(int(min(low_bfr2.size() / 2, len(traj) / 2)))
            ]
            traj = traj[:int(len(traj) / 2)] + traj2
        plot_vis(traj, args, mdp, pathdir + '/visitation')
    elif args.exp == 'visfval':
        print('visfval')
    else:
        print('No plots')

    #################################
    # 5. Save the results
    #################################
    if args.exp == 'sample':
        print('save sample')
        if args.reverse:
            dirop = "rev"
        else:
            dirop = ""
        if args.saveimage:
            bfr.save(pathdir + '/traj_img' + dirop)
            low_bfr.save(pathdir + '/low_traj_img' + dirop)
        elif args.savecmp:
            bfr.save_sa(pathdir + '/traj_sa' + dirop)
            low_bfr.save_sao(pathdir + '/low_traj_sa' + dirop)
        else:
            bfr.save(pathdir + '/traj' + dirop)
            low_bfr.save(pathdir + '/low_traj' + dirop)
    elif args.exp == 'evaloff' or args.exp == 'evalon':
        print('save', args.exp)
        options = oagent.options
        for nop in range(1, len(options)):
            opdir = prefix + '/vis/' + args.task + 'option' + str(
                nop) + '_' + str(args.ffuncnunit) + '_' + str(args.rseed)
            if args.exp == 'evalon':
                opdir = opdir + '_online'
            options[nop].save(opdir + '_trained')
        oagent.option_buffer.save(pathdir + '_trained' + '/' + 'traj')
        oagent.experience_buffer.save(pathdir + '_trained' + '/' + 'low_traj')
    elif args.exp == 'generate':
        print('save generate')
        op.save(opdir)
    elif args.exp == 'train':
        print('save train')
        if args.reverse:
            op.save(opdir, rev=True)
        else:
            op.save(opdir)
    else:
        print('No save')
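# A rough sketch of how the --exp modes above are meant to chain (the script name
# `run_experiment.py` is an assumption, not from the original code): trajectories
# are sampled with the current set of options, a new covering option is generated
# from that buffer, optionally fine-tuned on its own low-level samples, and then
# evaluated together with the previously generated options.
#
#   python run_experiment.py --exp sample   --noptions 0 ...
#   python run_experiment.py --exp generate --noptions 1 ...
#   python run_experiment.py --exp train    --noptions 1 ...
#   python run_experiment.py --exp evaloff  --noptions 1 ...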