def __init__(self, gymEnvName, model_weights, k, intrinsic=False):
    self.env = gym.make(gymEnvName)

    # Build the vision model in its own graph and load the provided
    # weights into it through TensorFlowVariables.
    g = tf.Graph()
    with g.as_default():
        model = AtariVisionModel(k, actiondim=(self.env.action_space.n, 1))
        model.sess.run(tf.global_variables_initializer())
        variables = TensorFlowVariables(model.loss, model.sess)
        variables.set_weights(model_weights)
        self.model = model

    self.trajectory_set = set()
    self.intrinsic = intrinsic
    self.real_action_space = self.env.action_space.n
    # The wrapped action space adds one meta-action per discovered option.
    self.action_space = spaces.Discrete(self.env.action_space.n + model.k)
    self.obs = None
    self.done = False
    self.spec = self.env.spec
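# A minimal self-contained sketch (not from the original source) of the
# TensorFlowVariables round-trip that all of the snippets here rely on:
# wrap a loss and a session, read the reachable variables as a
# name -> numpy-array dict, and write values back into another copy.
# The import path below is for recent Ray releases; older releases
# exposed it as ray.experimental.TensorFlowVariables.
import tensorflow as tf
from ray.experimental.tf_utils import TensorFlowVariables

x = tf.placeholder(tf.float32, [None, 4])
y = tf.layers.dense(x, 2)
loss = tf.reduce_mean(tf.square(y))
sess = tf.Session()
sess.run(tf.global_variables_initializer())

variables = TensorFlowVariables(loss, sess)
weights = variables.get_weights()   # dict of numpy arrays, keyed by variable name
variables.set_weights(weights)      # restore the same values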
def runDDO(env_name="PongDeterministic-v3", num_options=2,
           ddo_learning_rate=1e-3, steps_per_discovery=10000, rounds=1,
           num_demonstrations_per=100, ddo_max_iters=100, ddo_vq_iters=100,
           num_workers=1, JSD_weight=0, entropy_weight=0):
    # Build the option-discovery model in its own graph.
    g = tf.Graph()
    with g.as_default():
        a = AtariVisionModel(num_options,
                             actiondim=(gym.make(env_name).action_space.n, 1))
        variables = TensorFlowVariables(a.loss, a.sess)
        with tf.variable_scope("optimizer2"):
            opt = tf.train.AdamOptimizer(learning_rate=ddo_learning_rate)
        a.sess.run(tf.global_variables_initializer())
        weights = variables.get_weights()

    # Run once (max_steps=1) just to initialize the env and policy.
    env, policy = train(num_workers, env_name=env_name, model=weights,
                        k=num_options, max_steps=1, intrinsic=False)
    trajs, _ = collect_demonstrations(env, policy, N=num_demonstrations_per)

    for i in range(rounds):
        with g.as_default():
            with tf.variable_scope("optimizer2"):
                # Only run vector-quantization iterations on the first round.
                vq = ddo_vq_iters if i == 0 else 0
                a.train(opt, trajs, ddo_max_iters, vq)
                weights = variables.get_weights()
        env, policy = train(num_workers, policy=policy, env_name=env_name,
                            model=weights, k=num_options,
                            max_steps=steps_per_discovery)
        trajs, reward = collect_demonstrations(env, policy,
                                               N=num_demonstrations_per)

    return {'reward': reward, 'env': env_name, 'num_options': num_options,
            'ddo': True, 'intrinsic': False}
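# Usage sketch (an assumption, not from the source): a single DDO round on
# Pong with two options, using the defaults above for everything else.
result = runDDO(env_name="PongDeterministic-v3", num_options=2, rounds=1)
print(result['reward'])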
def gridWorldInit():
    # Build a small GridWorld model in its own graph and wrap its variables
    # so the weights can be shipped between processes.
    t = tf.Graph()
    with t.as_default():
        m = GridWorldModel(2, statedim=(2, 1))
        m.sess.run(tf.global_variables_initializer())
        variables = TensorFlowVariables(m.loss, m.sess)
    return m, m.opt, t, variables
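# Usage sketch (an assumption, not from the source): broadcast the GridWorld
# weights through Ray's object store so workers can load them into their
# own model copies.
import ray

m, opt, g, variables = gridWorldInit()
weights_id = ray.put(variables.get_weights())
# on a worker process: variables.set_weights(ray.get(weights_id))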
class PPO(object):
    """This PPO version is adapted from Mofan Zhou, University of Technology Sydney."""

    def __init__(self, args):
        self.args = args
        self.build_model(args)

    @staticmethod
    def add_args(parser):
        parser.add_argument('--state_dim', type=int, default=3)
        parser.add_argument('--action_dim', type=int, default=1)
        # parser.add_argument('--update_actor_steps', type=int, default=10)   # unused in ray
        # parser.add_argument('--update_critic_steps', type=int, default=10)  # unused in ray
        parser.add_argument('--actor_lr', type=float, default=1e-4)
        parser.add_argument('--critic_lr', type=float, default=2e-4)
        parser.add_argument('--clip_norm', type=float, default=5)
        # choices must match the keys looked up in build_model
        parser.add_argument('--ppo_name', choices=['kl_pen', 'clip'], default='clip')
        parser.add_argument('--log_dir', type=str, default=None)

    def build_model(self, args, **kwargs):
        self.ppo_method = {
            'kl_pen': dict(name='kl_pen', kl_target=0.01, lam=0.5),  # KL penalty
            'clip': dict(name='clip', epsilon=0.2),  # clipped surrogate objective (empirically works better)
        }.get(args.ppo_name)

        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=True)
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)

        self.tfs = tf.placeholder(tf.float32, [None, args.state_dim], 'state')

        # critic
        with tf.variable_scope('critic'):
            l1 = tf.layers.dense(self.tfs, 100, tf.nn.relu)
            self.v = tf.layers.dense(l1, 1)
            self.tfdc_r = tf.placeholder(tf.float32, [None, 1], 'discounted_r')
            self.advantage = self.tfdc_r - self.v
            self.closs = tf.reduce_mean(tf.square(self.advantage))
            self.critic_opt = tf.train.AdamOptimizer(args.critic_lr)
            self.c_grads = self.critic_opt.compute_gradients(self.closs)
            self.ctrain_op = self.critic_opt.apply_gradients(self.c_grads)
            self.c_vars = TensorFlowVariables(self.closs, self.sess)

        # actor
        pi, pi_params = self._build_anet('pi', trainable=True)
        oldpi, oldpi_params = self._build_anet('oldpi', trainable=False)
        with tf.variable_scope('sample_action'):
            self.sample_op = tf.squeeze(pi.sample(1), axis=0)  # choosing action
        with tf.variable_scope('update_oldpi'):
            self.update_oldpi_op = [oldp.assign(p)
                                    for p, oldp in zip(pi_params, oldpi_params)]

        self.tfa = tf.placeholder(tf.float32, [None, args.action_dim], 'action')
        self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')
        with tf.variable_scope('loss'):
            with tf.variable_scope('surrogate'):
                # ratio = tf.exp(pi.log_prob(self.tfa) - oldpi.log_prob(self.tfa))
                ratio = pi.prob(self.tfa) / (oldpi.prob(self.tfa) + 1e-5)
                surr = ratio * self.tfadv
            if args.ppo_name == 'kl_pen':
                self.tflam = tf.placeholder(tf.float32, None, 'lambda')
                kl = tf.distributions.kl_divergence(oldpi, pi)
                self.kl_mean = tf.reduce_mean(kl)
                self.aloss = -tf.reduce_mean(surr - self.tflam * kl)
            else:  # clipping method (empirically works better)
                self.aloss = -tf.reduce_mean(tf.minimum(
                    surr,
                    tf.clip_by_value(ratio,
                                     1. - self.ppo_method['epsilon'],
                                     1. + self.ppo_method['epsilon']) * self.tfadv))

        with tf.variable_scope('actor_train'):
            self.actor_opt = tf.train.AdamOptimizer(args.actor_lr)
            self.a_grads = self.actor_opt.compute_gradients(self.aloss)
            # drop (None, var) pairs for variables the actor loss does not touch
            self.a_grads = [(t, v) for t, v in self.a_grads if t is not None]
            self.atrain_op = self.actor_opt.apply_gradients(self.a_grads)
            self.a_vars = TensorFlowVariables(self.aloss, self.sess)

        if args.log_dir:
            tf.summary.FileWriter(args.log_dir, self.sess.graph)
        self.sess.run(tf.global_variables_initializer())

    @ray.method(num_return_vals=2)
    def step(self, s, a, r, a_weights, c_weights):
        self.set_weights(a_weights, c_weights)
        self.sess.run(self.update_oldpi_op)
        adv = self.sess.run(self.advantage, {self.tfs: s, self.tfdc_r: r})
        # actor gradients
        a_grads = self.sess.run([grad[0] for grad in self.a_grads],
                                {self.tfs: s, self.tfa: a, self.tfadv: adv})
        # critic gradients
        c_grads = self.sess.run([grad[0] for grad in self.c_grads],
                                {self.tfs: s, self.tfdc_r: r})
        return a_grads, c_grads

    def _build_anet(self, name, trainable):
        with tf.variable_scope(name):
            l1 = tf.layers.dense(self.tfs, 100, tf.nn.relu, trainable=trainable)
            mu = 2 * tf.layers.dense(l1, self.args.action_dim, tf.nn.tanh,
                                     trainable=trainable)
            sigma = tf.layers.dense(l1, self.args.action_dim, tf.nn.softplus,
                                    trainable=trainable)
            norm_dist = tf.distributions.Normal(loc=mu, scale=sigma)
        params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)
        return norm_dist, params

    def choose_actions(self, s: np.ndarray) -> List[np.ndarray]:
        if s.ndim == 1:
            s = s[np.newaxis, :]
        bat_actions = self.sess.run(self.sample_op, {self.tfs: s})
        return [np.clip(a, -2, 2) for a in bat_actions]

    def get_v(self, s):
        if s.ndim < 2:
            s = s[np.newaxis, :]
        return self.sess.run(self.v, {self.tfs: s})[0, 0]

    @ray.method(num_return_vals=2)
    def get_weights(self):
        return self.a_vars.get_weights(), self.c_vars.get_weights()

    def set_weights(self, a_weights, c_weights):
        self.a_vars.set_weights(a_weights)
        self.c_vars.set_weights(c_weights)
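# A hypothetical driver sketch (not part of the original source): run
# PPO.step on several Ray actor replicas and average the returned gradients
# per variable. `central`, `workers` (PPO actors created with
# ray.remote(PPO).remote(args)), and `batches` (dicts with 's', 'a', 'r')
# are illustrative assumptions.
import numpy as np
import ray

def aggregate_ppo_grads(central, workers, batches):
    a_w_id, c_w_id = central.get_weights.remote()   # two return values
    a_w, c_w = ray.get(a_w_id), ray.get(c_w_id)
    pairs = [w.step.remote(b['s'], b['a'], b['r'], a_w, c_w)
             for w, b in zip(workers, batches)]     # each pair: (a_ids, c_ids)
    a_grads = ray.get([p[0] for p in pairs])
    c_grads = ray.get([p[1] for p in pairs])
    # average each variable's gradient across workers
    mean_a = [np.mean(gs, axis=0) for gs in zip(*a_grads)]
    mean_c = [np.mean(gs, axis=0) for gs in zip(*c_grads)]
    return mean_a, mean_c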
# Driver-side setup and training loop for the GridWorld model.
m = GridWorldModel(2, statedim=(2, 1))
m.sess.run(tf.global_variables_initializer())

with tf.variable_scope("optimizer"):
    opt = m.opt
    # opt = tf.train.AdamOptimizer(learning_rate=0.001)
    m.train(opt, dataset, 1, 0)
    driver_gradients = opt.compute_gradients(m.loss)
    variables = TensorFlowVariables(m.loss, m.sess)
    vlist = [g for (g, v) in driver_gradients]   # gradient tensors only
    update = opt.apply_gradients(driver_gradients)

for it in range(1000):
    print("----Iteration---", it)
    weights = ray.put(variables.get_weights())
class Parallel_Actor(Agent):
    def __init__(self, computation_graph_args, sample_trajectory_args,
                 estimate_return_args):
        super().__init__(computation_graph_args, sample_trajectory_args,
                         estimate_return_args)
        # build computation graph
        self.build_computation_graph()
        # tensorflow: config, session, variable initialization
        self.init_tf_sess()
        self.variables = TensorFlowVariables(self.loss, self.sess)

    def set_weights(self, weights):
        self.variables.set_weights(weights)

    def get_weights(self):
        return self.variables.get_weights()

    def test_method(self, i):
        return i * 2

    def sample_trajectories_fake(self, itr, env, counter_actor):
        # Collect (fake) paths until the shared counter says that enough
        # timesteps have been gathered across all workers.
        timesteps_this_batch = 0
        paths = []
        while True:
            animate_this_episode = (len(paths) == 0 and (itr % 10 == 0)
                                    and self.animate)
            if hasattr(self, 'running_only') and self.animate:
                animate_this_episode = True
            # path = self.sample_trajectory(env, animate_this_episode)
            # paths.append(path)
            # timesteps_this_batch += pathlength(path)
            path_len = self.fake_sample_trajectory()
            counter_actor.increment_counter.remote(path_len)
            current_count = ray.get(counter_actor.return_count.remote())
            timesteps_this_batch += path_len
            time.sleep(0.3)
            # if timesteps_this_batch > self.min_timesteps_per_batch:
            if current_count > self.min_timesteps_per_batch:
                print('the final pathlength from this worker is '
                      + str(timesteps_this_batch))
                paths = 'filler'
                break
        return paths, timesteps_this_batch

    def sample_trajectories(self, itr, env, counter_actor):
        # Collect real paths until the shared counter crosses the batch size.
        timesteps_this_batch = 0
        paths = []
        while True:
            animate_this_episode = (len(paths) == 0 and (itr % 10 == 0)
                                    and self.animate)
            if hasattr(self, 'running_only') and self.animate:
                animate_this_episode = True
            path = self.sample_trajectory(env, animate_this_episode)
            path_len = pathlength(path)
            paths.append(path)
            timesteps_this_batch += path_len
            counter_actor.increment_counter.remote(path_len)
            current_count = ray.get(counter_actor.return_count.remote())
            if current_count > self.min_timesteps_per_batch:
                break
        return paths, timesteps_this_batch

    def fake_sample_trajectory(self):
        # return randint(100)
        return 100
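# A minimal sketch (not part of the original source) of the shared counter
# actor that sample_trajectories expects, matching the
# increment_counter/return_count interface used above.
import ray

@ray.remote
class Counter(object):
    def __init__(self):
        self.count = 0

    def increment_counter(self, n):
        self.count += n

    def return_count(self):
        return self.count

# usage: counter_actor = Counter.remote()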
def __init__(self, name, args, env_args, buffer_args, sess_config=None,
             save=False, log=False, log_tensorboard=False, log_params=False,
             log_stats=False, device=None):
    # hyperparameters
    self.algo = args['algorithm']
    self.gamma = args.setdefault('gamma', .99)
    self.update_step = 0
    self.max_action_repetitions = args.setdefault('max_action_repetitions', 1)

    # environment info
    env_args['gamma'] = self.gamma
    env_args['seed'] += 100
    self.eval_env = create_gym_env(env_args)
    env_args['seed'] -= 100
    env_args['log_video'] = False
    self.train_env = create_gym_env(env_args)
    self.state_shape = self.train_env.state_shape
    self.action_dim = self.train_env.action_dim

    # replay buffer hyperparameters
    buffer_args['n_steps'] = args['n_steps']
    buffer_args['gamma'] = args['gamma']
    buffer_args['batch_size'] = args['batch_size']
    self.buffer_type = buffer_args['type']
    if self.buffer_type == 'proportional':
        self.buffer = ProportionalPrioritizedReplay(
            buffer_args, self.state_shape, self.action_dim)
    elif self.buffer_type == 'uniform':
        self.buffer = UniformReplay(buffer_args, self.state_shape, self.action_dim)
    elif self.buffer_type == 'local':
        self.buffer = LocalBuffer(buffer_args, self.state_shape, self.action_dim)
    else:
        raise NotImplementedError('No buffer is constructed')

    # arguments for prioritized replay
    self.prio_alpha = float(buffer_args['alpha'])
    self.prio_epsilon = float(buffer_args['epsilon'])

    super().__init__(name, args, sess_config=sess_config, save=save, log=log,
                     log_tensorboard=log_tensorboard, log_params=log_params,
                     log_stats=log_stats, device=device)

    self._initialize_target_net()

    with self.graph.as_default():
        self.variables = TensorFlowVariables(self.loss, self.sess)
def __init__(self, name, args, env_args, sess_config=None, save=False,
             log=False, log_tensorboard=False, log_params=False,
             log_stats=False, device=None, reuse=None, graph=None):
    # hyperparameters
    self.gamma = args['gamma']
    self.gae_discount = self.gamma * args['lam']
    self.n_minibatches = args['n_minibatches']
    self.use_lstm = args['ac']['use_lstm']
    self.entropy_coef = args['ac']['entropy_coef']
    self.n_value_updates = args['ac']['n_value_updates']
    self.minibatch_idx = 0

    # environment info
    self.env_vec = create_gym_env(env_args)
    self.seq_len = self.env_vec.max_episode_steps

    self.buffer = PPOBuffer(env_args['n_workers'] * env_args['n_envs'],
                            self.seq_len, self.n_minibatches,
                            self.env_vec.state_shape, np.float32,
                            self.env_vec.action_shape, np.float32)

    super().__init__(name, args, sess_config=sess_config, save=save, log=log,
                     log_tensorboard=log_tensorboard, log_params=log_params,
                     log_stats=log_stats, device=device, reuse=reuse, graph=graph)

    self.schedule_lr = 'schedule_lr' in args and args['schedule_lr']
    if self.schedule_lr:
        self.actor_lr_scheduler = PiecewiseSchedule(
            [(0, 1e-4), (400000, 1e-4), (600000, 5e-5)], outside_value=5e-5)
        self.critic_lr_scheduler = PiecewiseSchedule(
            [(0, 3e-4), (400000, 3e-4), (600000, 5e-5)], outside_value=5e-5)

    if self.use_lstm:
        # no need to keep separate LSTM states for training and acting,
        # since training happens after the rollout completes
        self.last_lstm_state = None

    with self.graph.as_default():
        # wrap the variables reachable from both the policy and value losses
        self.variables = TensorFlowVariables(
            [self.ac.policy_loss, self.ac.V_loss], self.sess)
class Policy(object):
    """Policy base class."""

    def __init__(self, ob_space, ac_space, task, name="local"):
        self.local_steps = 0
        worker_device = "/job:localhost/replica:0/task:0/cpu:0"
        self.g = tf.Graph()
        with self.g.as_default(), tf.device(worker_device):
            with tf.variable_scope(name):
                self.setup_graph(ob_space, ac_space)
                assert all(hasattr(self, attr)
                           for attr in ["vf", "logits", "x", "var_list"])
            print("Setting up loss")
            self.setup_loss(ac_space)
            self.initialize()

    def setup_graph(self, ob_space, ac_space):
        raise NotImplementedError

    def setup_loss(self, num_actions, summarize=True):
        self.ac = tf.placeholder(tf.float32, [None, num_actions], name="ac")
        self.adv = tf.placeholder(tf.float32, [None], name="adv")
        self.r = tf.placeholder(tf.float32, [None], name="r")

        log_prob_tf = tf.nn.log_softmax(self.logits)
        prob_tf = tf.nn.softmax(self.logits)

        # The "policy gradients" loss: its derivative is precisely the policy
        # gradient. Note that self.ac is a placeholder provided externally;
        # adv will contain the advantages, as calculated in process_rollout.
        pi_loss = -tf.reduce_sum(
            tf.reduce_sum(log_prob_tf * self.ac, [1]) * self.adv)

        # loss of value function
        vf_loss = 0.5 * tf.reduce_sum(tf.square(self.vf - self.r))
        vf_loss = tf.Print(vf_loss, [vf_loss], "Value Fn Loss")
        entropy = -tf.reduce_sum(prob_tf * log_prob_tf)

        bs = tf.to_float(tf.shape(self.x)[0])
        self.loss = pi_loss + 0.5 * vf_loss - entropy * 0.01

        grads = tf.gradients(self.loss, self.var_list)
        self.grads, _ = tf.clip_by_global_norm(grads, 40.0)
        grads_and_vars = list(zip(self.grads, self.var_list))
        opt = tf.train.AdamOptimizer(1e-4)
        self._apply_gradients = opt.apply_gradients(grads_and_vars)

        if summarize:
            tf.summary.scalar("model/policy_loss", pi_loss / bs)
            tf.summary.scalar("model/value_loss", vf_loss / bs)
            tf.summary.scalar("model/entropy", entropy / bs)
            tf.summary.image("model/state", self.x)
            self.summary_op = tf.summary.merge_all()

    def initialize(self):
        with self.g.as_default():
            self.sess = tf.Session(
                graph=self.g,
                config=tf.ConfigProto(intra_op_parallelism_threads=1,
                                      inter_op_parallelism_threads=2))
            self.variables = TensorFlowVariables(self.loss, self.sess)
            self.sess.run(tf.global_variables_initializer())

    def model_update(self, grads):
        # Feed externally computed gradient values into the clipped-gradient
        # tensors and run the Adam update.
        feed_dict = {self.grads[i]: grads[i] for i in range(len(grads))}
        self.sess.run(self._apply_gradients, feed_dict=feed_dict)

    def get_weights(self):
        with self.g.as_default():
            return self.variables.get_weights()

    def set_weights(self, weights):
        with self.g.as_default():
            self.variables.set_weights(weights)

    def get_gradients(self, batch):
        raise NotImplementedError

    def get_vf_loss(self):
        raise NotImplementedError

    def act(self, ob):
        raise NotImplementedError

    def value(self, ob):
        raise NotImplementedError
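# A hypothetical A3C-style driver sketch (not part of the original source):
# broadcast the latest Policy weights to Ray workers, pull gradients back,
# and apply them centrally via model_update. `runners` and their
# compute_gradients method are illustrative stand-ins for whatever remote
# worker class wraps get_gradients.
import ray

def a3c_driver_loop(policy, runners, num_iters):
    for _ in range(num_iters):
        weights_id = ray.put(policy.get_weights())
        grad_ids = [r.compute_gradients.remote(weights_id) for r in runners]
        for grads in ray.get(grad_ids):
            policy.model_update(grads)   # apply each worker's clipped grads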