def test_replay_buffer_sample(self):
    templates = [('test1', tf.int32, ()), ('test2', tf.int32, (1,)),
                 ('test3', tf.int32, (2, 2))]
    capacity = 5
    rep_buf = replay_buffer.ReplayBuffer(templates, capacity)
    memory = [
        tf.Variable(1, dtype=tf.int32, trainable=False),
        tf.Variable([2], dtype=tf.int32, trainable=False),
        tf.Variable([[1, 2], [3, 4]], dtype=tf.int32, trainable=False)
    ]
    with tf.control_dependencies([v.assign(v + 1) for v in memory]):
        inc_index_op = rep_buf.append(memory)
    samples_t = rep_buf.sample(5)
    sess = tf.InteractiveSession()
    sess.run(tf.global_variables_initializer())
    # Fill the buffer to capacity before sampling.
    for _ in range(capacity):
        sess.run(inc_index_op)
    samples = sess.run(samples_t)
    # One array per template, each holding a batch of 5 sampled entries.
    self.assertEqual(len(samples), 3)
    self.assertEqual(len(samples[0]), 5)
def test_replay_buffer_init(self):
    templates = [('test1', tf.int32, ()), ('test2', tf.int32, (1,)),
                 ('test3', tf.int32, (2, 2))]
    capacity = 2
    rep_buf = replay_buffer.ReplayBuffer(templates, capacity)
    sess = tf.InteractiveSession()
    sess.run(tf.global_variables_initializer())
    buffers = sess.run(rep_buf.buffers)
    self.assertEqual(np.array_equal(buffers['test1'], [0, 0]), True)
    self.assertEqual(np.array_equal(buffers['test2'], [[0], [0]]), True)
    self.assertEqual(
        np.array_equal(buffers['test3'],
                       [[[0, 0], [0, 0]], [[0, 0], [0, 0]]]), True)
def train(self):
    global_step = 0
    if self.prioritized:
        self.memory = replay_buffer.PrioritizedReplayBuffer(
            self.replay_buffer_size, self.prioritized_alpha)
        self.beta_schedule = schedules.LinearSchedule(
            self.num_episodes, initial_p=self.prioritized_beta, final_p=0)
    else:
        self.memory = replay_buffer.ReplayBuffer(self.replay_buffer_size)
        self.beta_schedule = None
    for episode in range(self.num_episodes):
        global_step = self._episode(episode, global_step)
        # Save checkpoint
        if episode % self.save_frequency == 0:
            model_name = 'dqn_checkpoint_' + str(episode) + '.pth'
            torch.save(self.q_fn.state_dict(),
                       os.path.join(self.model_path, model_name))
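# Hedged sketch (an assumption, not this repository's `_episode`): one way the
# prioritized buffer and `beta_schedule` created in train() are typically consumed
# inside a learning step. The sample/update_priorities signatures follow the common
# OpenAI-baselines-style API suggested by the constructor calls above and may differ
# here; `self.batch_size` and `self._compute_td_error` are illustrative names.
def _sketch_learning_step(self, episode):
    if self.prioritized:
        # Anneal the importance-sampling exponent over training.
        beta = self.beta_schedule.value(episode)
        (states, actions, rewards, next_states, dones,
         weights, indices) = self.memory.sample(self.batch_size, beta=beta)
    else:
        states, actions, rewards, next_states, dones = self.memory.sample(self.batch_size)
        weights, indices = np.ones(len(rewards)), None

    td_error = self._compute_td_error(states, actions, rewards, next_states, dones)
    # Importance-sampling weights correct the bias introduced by prioritized sampling.
    loss = (torch.as_tensor(weights, dtype=torch.float32) * td_error.pow(2)).mean()

    if indices is not None:
        # Refresh priorities with the new absolute TD errors.
        self.memory.update_priorities(
            indices, td_error.abs().detach().cpu().numpy() + 1e-6)
    return loss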
def test_replay_buffer_append(self):
    templates = [('test1', tf.int32, ()), ('test2', tf.int32, (1,)),
                 ('test3', tf.int32, (2, 2))]
    capacity = 2
    rep_buf = replay_buffer.ReplayBuffer(templates, capacity)
    memory = [
        tf.constant(1),
        tf.constant([2]),
        tf.constant([[1, 2], [3, 4]])
    ]
    inc_index_op = rep_buf.append(memory)
    sess = tf.InteractiveSession()
    sess.run(tf.global_variables_initializer())
    sess.run(inc_index_op)
    index, buffers = sess.run([rep_buf.index, rep_buf.buffers])
    self.assertEqual(index, 1)
    self.assertEqual(np.array_equal(buffers['test1'], [1, 0]), True)
    self.assertEqual(np.array_equal(buffers['test2'], [[2], [0]]), True)
    self.assertEqual(
        np.array_equal(buffers['test3'],
                       [[[1, 2], [3, 4]], [[0, 0], [0, 0]]]), True)
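# Hedged sketch, not the project's replay_buffer module: a minimal templated TF1
# buffer consistent with how the init/append/sample tests above exercise it --
# per-template tf.Variable storage of shape [capacity, *shape], an `index` counter,
# an `append` op that writes at `index % capacity`, and uniform `sample`. Names and
# internals are assumptions for illustration only.
class _SketchReplayBuffer(object):

    def __init__(self, templates, capacity):
        self.capacity = capacity
        self.index = tf.Variable(0, dtype=tf.int32, trainable=False, name='replay_index')
        self._names = [name for name, _, _ in templates]
        self.buffers = {
            name: tf.Variable(tf.zeros((capacity,) + tuple(shape), dtype=dtype),
                              trainable=False, name=name)
            for name, dtype, shape in templates
        }

    def append(self, values):
        # Write each value at the current slot, then advance the counter.
        position = tf.mod(self.index, self.capacity)
        assign_ops = [
            tf.scatter_update(self.buffers[name], [position], tf.expand_dims(value, 0))
            for name, value in zip(self._names, values)
        ]
        with tf.control_dependencies(assign_ops):
            return self.index.assign_add(1)

    def sample(self, batch_size):
        # Sample uniformly from the slots that have been filled so far.
        filled = tf.minimum(self.index, self.capacity)
        indices = tf.random_uniform([batch_size], maxval=filled, dtype=tf.int32)
        return [tf.gather(self.buffers[name], indices) for name in self._names]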
if args.save_model and not os.path.exists("./models"):
    os.makedirs("./models")

if args.load_model != "":
    policy_file = file_name if args.load_model == "default" else args.load_model
    if not os.path.exists(f"./models/{policy_file}"):
        raise FileNotFoundError(
            f"The loading model path of `./models/{policy_file}` does not exist!")
    policy.load(f"./models/{policy_file}")

# Setup loggers
logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed, datestamp=False)
logger = EpochLogger(**logger_kwargs)

_replay_buffer = replay_buffer.ReplayBuffer(state_dim, action_dim)

state, done = env.reset(), False
episode_reward = 0
episode_timesteps = 0
episode_num = 0
start_time = time.time()

for t in range(int(args.max_timesteps)):
    episode_timesteps += 1

    # Select action randomly or according to policy
    if t < int(args.start_timesteps):
        action = env.action_space.sample()
    else:
        if args.policy.startswith("SAC"):
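# Hedged sketch (assumption): the usual remainder of such a single-environment
# training iteration, shown at the loop-body indentation -- step the env, store the
# transition, train the policy after the initial random-exploration phase, and reset
# on episode end. The `_replay_buffer.add` and `policy.train(_replay_buffer, batch_size)`
# signatures follow the common TD3 reference layout and may not match this script.
    next_state, reward, done, _ = env.step(action)
    done_bool = float(done) if episode_timesteps < env._max_episode_steps else 0.0

    _replay_buffer.add(state, action, next_state, reward, done_bool)
    state = next_state
    episode_reward += reward

    # Train the agent after collecting `start_timesteps` random transitions.
    if t >= int(args.start_timesteps):
        policy.train(_replay_buffer, int(args.batch_size))

    if done:
        logger.store(EpRet=episode_reward, EpLen=episode_timesteps)
        state, done = env.reset(), False
        episode_reward, episode_timesteps = 0, 0
        episode_num += 1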
def __init__(self, sess, state_dim, action_dim, name, env):
    self.sess = sess
    self.s_dim = state_dim
    self.a_dim = action_dim
    self.name = name
    self.env = env

    self.config = ConfigParser.ConfigParser()
    self.config.read("./config.ini")

    self.exploitability = 0
    self.iteration = 0

    # init parameters
    self.minibatch_size = int(self.config.get('Agent', 'MiniBatchSize'))
    self.n_hidden = int(self.config.get('Agent', 'HiddenLayer'))
    self.lr_br = float(self.config.get('Agent', 'LearningRateBR'))
    self.lr_ar = float(self.config.get('Agent', 'LearningRateAR'))
    self.epsilon = float(self.config.get('Agent', 'Epsilon'))
    self.epsilon_min = float(self.config.get('Agent', 'EpsilonMin'))
    self.gamma = float(self.config.get('Agent', 'Gamma'))
    self.omega = float(self.config.get('Agent', 'Omega'))

    self.sgd_br = SGD(lr=self.lr_br)
    self.sgd_ar = SGD(lr=self.lr_ar)

    self.target_model_update_rate = int(self.config.get('Agent', 'TargetModelUpdateRate'))

    self.iteration = 0
    self.temp = (1 + 0.02 * np.sqrt(self.iteration)) ** (-1)

    # mixed anticipatory parameter
    self.eta = float(self.config.get('Agent', 'Eta'))

    # target network update counter
    self.target_br_model_update_count = 0

    # reinforcement learning memory
    self._rl_memory = ReplayBuffer.ReplayBuffer(
        int(self.config.get('Utils', 'Buffersize')),
        int(self.config.get('Utils', 'Seed')))

    # supervised learning memory
    self._sl_memory = ReservoirBuffer.ReservoirBuffer(
        int(self.config.get('Utils', 'Buffersize')),
        int(self.config.get('Utils', 'Seed')))

    # build average strategy model
    self.avg_strategy_model = self._build_avg_response_model()

    # build dqn aka best response model
    self.best_response_model = self._build_best_response_model()
    self.target_br_model = self._build_best_response_model()
    self.target_br_model.set_weights(self.best_response_model.get_weights())

    # build supervised learning model
    # self.average_response_model = self._build_avg_response_model()

    self.actions = np.zeros(3)
    self.played = 0
    self.reward = 0
    self.test_reward = 0
    self.game_step = 0

    # tensorBoard
    self.tensorboard_br = TensorBoard(log_dir='./logs/' + self.name + 'rl',
                                      histogram_freq=0, write_graph=False,
                                      write_images=True)
    self.tensorboard_sl = TensorBoard(log_dir='./logs/' + self.name + 'sl',
                                      histogram_freq=0, write_graph=False,
                                      write_images=True)
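# Hedged sketch (assumption): how NFSP-style fields set up in __init__ above are
# typically used when acting -- with probability `eta` play the epsilon-greedy best
# response and record (state, action) into the supervised reservoir buffer, otherwise
# play the average strategy. Calls on the buffers and models other than the attributes
# defined above (e.g. `_sl_memory.add`, the predict input shape) are illustrative.
def _sketch_act(self, state):
    if np.random.rand() < self.eta:
        # Best-response (RL) policy with epsilon-greedy exploration.
        if np.random.rand() < self.epsilon:
            action = np.random.randint(self.a_dim)
        else:
            q_values = self.best_response_model.predict(state[np.newaxis, :])
            action = int(np.argmax(q_values[0]))
        # Best-response actions supervise the average-strategy network.
        self._sl_memory.add(state, action)
    else:
        # Average-strategy (SL) policy: sample from the predicted action distribution.
        probs = self.avg_strategy_model.predict(state[np.newaxis, :])[0]
        action = int(np.random.choice(self.a_dim, p=probs))
    return action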
def build_actor(agent, env, level_name, action_set, id_actor):
    """Builds the actor loop."""
    # Initial values.
    initial_env_output, initial_env_state = env.initial()
    initial_agent_state = agent.initial_state(1)
    initial_action = tf.zeros([1], dtype=tf.int32)

    # Run agent
    dummy_agent_output, _ = agent(
        (initial_action,
         nest.map_structure(lambda t: tf.expand_dims(t, 0), initial_env_output)),
        initial_agent_state)
    initial_agent_output = nest.map_structure(
        lambda t: tf.zeros(t.shape, t.dtype), dummy_agent_output)

    # Initialize buffer
    if bool_buffer:
        buffer = replay_buffer.ReplayBuffer(id_actor, FLAGS.unroll_length,
                                            FLAGS.buffer_size,
                                            initial_env_output,
                                            initial_agent_output)

    # All state that needs to persist across training iterations. This includes
    # the last environment output, agent state and last agent output. These
    # variables should never go on the parameter servers.
    def create_state(t):
        # Creates a unique variable scope to ensure the variable name is unique.
        with tf.variable_scope(None, default_name='state'):
            return tf.get_local_variable(t.op.name, initializer=t, use_resource=True)

    persistent_state = nest.map_structure(
        create_state, (initial_env_state, initial_env_output,
                       initial_agent_state, initial_agent_output))

    def step(input_, unused_i):
        """Steps through the agent and the environment."""
        env_state, env_output, agent_state, agent_output = input_

        # Run agent.
        action = agent_output[0]
        batched_env_output = nest.map_structure(lambda t: tf.expand_dims(t, 0),
                                                env_output)
        # Forward one step.
        agent_output, agent_state = agent((action, batched_env_output), agent_state)

        # Convert action index to the native action.
        action = agent_output[0][0]
        raw_action = tf.gather(action_set, action)
        env_output, env_state = env.step(raw_action, env_state)

        return env_state, env_output, agent_state, agent_output

    # Run the unroll. `read_value()` is needed to make sure later usage will
    # return the first values and not a new snapshot of the variables.
    first_values = nest.map_structure(lambda v: v.read_value(), persistent_state)
    _, first_env_output, first_agent_state, first_agent_output = first_values

    # Use scan to apply `step` multiple times, therefore unrolling the agent
    # and environment interaction for `FLAGS.unroll_length`. `tf.scan` forwards
    # the output of each call of `step` as input of the subsequent call of `step`.
    # The unroll sequence is initialized with the agent and environment states
    # and outputs as stored at the end of the previous unroll.
    # `output` stores lists of all states and outputs stacked along the entire
    # unroll. Note that the initial states and outputs (fed through `initializer`)
    # are not in `output` and will need to be added manually later.
    output = tf.scan(step, tf.range(FLAGS.unroll_length), first_values)
    _, env_outputs, _, agent_outputs = output

    # Update persistent state with the last output from the loop.
    assign_ops = nest.map_structure(lambda v, t: v.assign(t[-1]),
                                    persistent_state, output)

    # The control dependency ensures that the final agent and environment states
    # and outputs are stored in `persistent_state` (to initialize next unroll).
    with tf.control_dependencies(nest.flatten(assign_ops)):
        # Remove the batch dimension from the agent state/output.
        first_agent_state = nest.map_structure(lambda t: t[0], first_agent_state)
        first_agent_output = nest.map_structure(lambda t: t[0], first_agent_output)
        agent_outputs = nest.map_structure(lambda t: t[:, 0], agent_outputs)

        # Concatenate first output and the unroll along the time dimension.
        full_agent_outputs, full_env_outputs = nest.map_structure(
            lambda first, rest: tf.concat([[first], rest], 0),
            (first_agent_output, first_env_output),
            (agent_outputs, env_outputs))

        # Append buffer
        if bool_buffer:
            op_assign_index = buffer.append(full_env_outputs, full_agent_outputs)
            with tf.control_dependencies([op_assign_index]):
                # Sample buffer
                env_outputs_vr, agent_outputs_vr = buffer.sample_sequence()
                env_outputs_pc, agent_outputs_pc = buffer.sample_sequence(shift=1)
                env_outputs_rp, agent_outputs_rp = buffer.sample_rp_sequence()
                is_full = buffer.is_full()
                with tf.control_dependencies([is_full]):
                    output = ActorOutput(
                        level_name=level_name,
                        agent_state=first_agent_state,
                        env_outputs=full_env_outputs,
                        agent_outputs=full_agent_outputs,
                        env_outputs_vr=env_outputs_vr,
                        agent_outputs_vr=agent_outputs_vr,
                        env_outputs_pc=env_outputs_pc,
                        agent_outputs_pc=agent_outputs_pc,
                        env_outputs_rp=env_outputs_rp,
                        buffer_full=is_full)
                    # No backpropagation should be done here.
                    return nest.map_structure(tf.stop_gradient, output)
        else:
            output = ActorOutput(
                level_name=level_name,
                agent_state=first_agent_state,
                env_outputs=full_env_outputs,
                agent_outputs=full_agent_outputs)
            # No backpropagation should be done here.
            return nest.map_structure(tf.stop_gradient, output)
if args.save_model and not os.path.exists("./models"):
    os.makedirs("./models")

if args.load_model != "":
    policy_file = file_name if args.load_model == "default" else args.load_model
    if not os.path.exists(f"./models/{policy_file}"):
        raise FileNotFoundError(
            f"The loading model path of `./models/{policy_file}` does not exist!")
    policy.load(f"./models/{policy_file}")

# Setup loggers
logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed, datestamp=False)
logger = EpochLogger(**logger_kwargs)

_replay_buffer = replay_buffer.ReplayBuffer(int(args.buffer_size))

print("Collecting experience...")
epinfobuf = deque(maxlen=100)  # rolling buffer of recent episode reward/length info
start_time = time.time()  # track training wall-clock time

# env reset returns one state per parallel env (`num_envs` in total)
states = np.array(env.reset())
step = 0
for t in range(1, int(args.max_timesteps) // int(args.num_envs) + 1):
    actions = policy.select_action(states, eps_schedule.value)
    # take actions and get next states
    next_states, rewards, dones, infos = env.step(actions)
    next_states = np.array(next_states)
    # arrange logging info
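# Hedged sketch (assumption): the typical continuation of this vectorized loop,
# shown at the loop-body indentation -- collect per-episode stats from `infos` into
# `epinfobuf` (the baselines-style pattern the deque above suggests) and store one
# transition per parallel env. The `_replay_buffer.add` argument order is illustrative.
    for info in infos:
        maybe_epinfo = info.get('episode')
        if maybe_epinfo:
            epinfobuf.append(maybe_epinfo)

    # Store one transition per parallel environment.
    for i in range(int(args.num_envs)):
        _replay_buffer.add(states[i], actions[i], rewards[i],
                           next_states[i], float(dones[i]))

    states = next_states
    step += int(args.num_envs)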