def test_buffer(self):
    buffer_size = 256
    obs_shape = (15, 15)
    act_dim = 5
    N = 512

    erb = ReplayBuffer(buffer_size,
                       {"obs": {"shape": obs_shape},
                        "act": {"shape": act_dim},
                        "rew": {},
                        "next_obs": {"shape": obs_shape},
                        "done": {}})

    for i in range(N):
        obs = np.full(obs_shape, i, dtype=np.double)
        act = np.full(act_dim, i, dtype=np.double)
        rew = i
        next_obs = obs + 1
        done = 0

        erb.add(obs=obs, act=act, rew=rew, next_obs=next_obs, done=done)

    es = erb._encode_sample(range(buffer_size))
    erb.sample(32)

    erb.clear()

    self.assertEqual(erb.get_next_index(), 0)
    self.assertEqual(erb.get_stored_size(), 0)
def test(self):
    buffer_size = 256
    obs_dim = 3
    act_dim = 1

    rb = ReplayBuffer(buffer_size,
                      {"obs": {"shape": obs_dim},
                       "act": {"shape": act_dim},
                       "rew": {},
                       "next_obs": {"shape": obs_dim},
                       "done": {}})

    obs = np.ones(shape=(obs_dim))
    act = np.ones(shape=(act_dim))
    rew = 0
    next_obs = np.ones(shape=(obs_dim))
    done = 0

    for i in range(500):
        rb.add(obs=obs, act=act, rew=rew, next_obs=next_obs, done=done)

    batch_size = 32
    sample = rb.sample(batch_size)
class Agent:
    def __init__(self, learn_rate, state_shape, num_actions, batch_size):
        self.mem_size = 100000
        self.gamma = 0.99
        self.action_space = list(range(num_actions))
        self.batch_size = batch_size
        self.epsilon = Lerper(start=1.0, end=0.01, num_steps=2000)

        self.memory = ReplayBuffer(self.mem_size,
                                   {"obs": {"shape": state_shape},
                                    "act": {"shape": 1},
                                    "rew": {},
                                    "next_obs": {"shape": state_shape},
                                    "done": {"shape": 1}})

        self.net = Network(learn_rate, state_shape, num_actions)

    def choose_action(self, observation):
        state = torch.tensor(observation).float().detach()
        state = state.to(self.net.device)
        state = state.unsqueeze(0)

        q_values = self.net(state)
        action = torch.argmax(q_values).item()
        return action

    def store_memory(self, state, action, reward, next_state, done):
        self.memory.add(obs=state, act=action, rew=reward,
                        next_obs=next_state, done=done)

    def learn(self):
        if self.memory.get_stored_size() < self.batch_size:
            return

        batch = self.memory.sample(self.batch_size)

        states = torch.tensor(batch["obs"]).to(self.net.device)
        actions = torch.tensor(batch["act"], dtype=torch.int64).to(self.net.device).T[0]
        rewards = torch.tensor(batch["rew"]).to(self.net.device).T[0]
        states_ = torch.tensor(batch["next_obs"]).to(self.net.device)
        dones = torch.tensor(batch["done"], dtype=torch.bool).to(self.net.device).T[0]

        batch_index = np.arange(self.batch_size, dtype=np.int64)

        q_values = self.net(states)[batch_index, actions]

        q_values_ = self.net(states_)
        action_qs_ = torch.max(q_values_, dim=1)[0]
        action_qs_[dones] = 0.0
        q_target = rewards + self.gamma * action_qs_

        td = q_target - q_values

        self.net.optimizer.zero_grad()
        loss = (td ** 2.0).mean()
        loss.backward()
        self.net.optimizer.step()

        self.net.reset_noise()
def test_train(self):
    agent = DQN(
        state_shape=self.env.observation_space.shape,
        action_dim=self.env.action_space.n,
        memory_capacity=100,
        gpu=-1)

    from cpprb import ReplayBuffer
    replay_buffer = ReplayBuffer(
        obs_dim=self.env.observation_space.shape,
        act_dim=1,
        size=agent.memory_capacity)

    obs = self.env.reset()
    for _ in range(100):
        action = agent.get_action(obs)
        next_obs, reward, done, _ = self.env.step(action)
        replay_buffer.add(obs=obs, act=action, next_obs=next_obs,
                          rew=reward, done=done)
        if done:
            next_obs = self.env.reset()
        obs = next_obs

    for _ in range(100):
        samples = replay_buffer.sample(agent.batch_size)
        agent.train(
            samples["obs"], samples["act"], samples["next_obs"],
            samples["rew"], np.array(samples["done"], dtype=np.float64))
class buffer_class:
    def __init__(self, max_length, seed_number, env):
        env_dict = create_env_dict(env)

        # Override the observation length in the replay memory
        env_dict['obs'] = {"dtype": numpy.float32, "shape": (17, )}
        env_dict['next_obs'] = {"dtype": numpy.float32, "shape": (17, )}
        print('!!!!', env_dict['obs'])

        self.before_add = create_before_add_func(env)
        self.storage = ReplayBuffer(max_length, env_dict)

    def append(self, s, a, r, done, sp):
        self.storage.add(
            **self.before_add(obs=s, act=a, rew=r, done=done, next_obs=sp))

    def sample(self, batch_size):
        batch = self.storage.sample(batch_size)
        s_matrix = batch['obs']
        a_matrix = batch['act']
        r_matrix = batch['rew']
        done_matrix = batch['done']
        sp_matrix = batch['next_obs']
        return s_matrix, a_matrix, r_matrix, done_matrix, sp_matrix

    def __len__(self):
        return self.storage.get_stored_size()
class ReplayBuffer:
    def __init__(self, size, env_dict, n_step_dict=None,
                 min_storage=10000, done_string="done"):
        super().__init__()
        self.done_string = done_string
        self.min_storage = min_storage
        cpprb_args = {"size": size, "env_dict": env_dict, "Nstep": n_step_dict}
        self.buffer = CPPRB(**cpprb_args)

    def add(self, data: Sequence[Dict[str, np.ndarray]]) -> None:
        for d in data:
            self.buffer.add(**d)
            if d[self.done_string]:
                self.buffer.on_episode_end()

    def sample(self, size: int) -> Dict[str, np.ndarray]:
        if self.buffer.get_stored_size() < self.min_storage:
            print(
                f"stored sample {self.buffer.get_stored_size()} is smaller than "
                f"minimum storage size {self.min_storage}. Returning None.")
            return None
        else:
            return self.buffer.sample(size)
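# A minimal usage sketch (not from the original project) of the wrapper above.
# Assumptions: CPPRB is cpprb.ReplayBuffer imported under that alias, the
# observation shape is illustrative, and the n-step dictionary follows cpprb's
# documented "Nstep" keys (size / gamma / rew / next).
import numpy as np
from cpprb import ReplayBuffer as CPPRB

env_dict = {"obs": {"shape": 4}, "act": {"shape": 1},
            "rew": {}, "next_obs": {"shape": 4}, "done": {}}
n_step_dict = {"size": 3, "gamma": 0.99, "rew": "rew", "next": "next_obs"}

buffer = ReplayBuffer(size=1000, env_dict=env_dict,
                      n_step_dict=n_step_dict, min_storage=2)

# The wrapper expects a sequence of per-step dictionaries and calls
# on_episode_end() whenever the configured "done" flag is set.
transitions = [
    {"obs": np.zeros(4), "act": np.zeros(1), "rew": 1.0,
     "next_obs": np.ones(4), "done": 0},
    {"obs": np.ones(4), "act": np.zeros(1), "rew": 1.0,
     "next_obs": np.ones(4), "done": 1},
]
buffer.add(transitions)
batch = buffer.sample(2)  # returns None while fewer than min_storage transitions are stored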
def test_ReplayBuffer_with_single_step(self):
    buffer_size = 256
    obs_shape = (3, 4)
    batch_size = 10

    rb = ReplayBuffer(buffer_size, {"obs": {"shape": obs_shape}})

    v = {"obs": np.ones(shape=obs_shape)}

    rb.add(**v)
    rb.sample(batch_size)

    for _ in range(100):
        rb.add(**v)
    rb.sample(batch_size)
def test_next_obs(self):
    buffer_size = 256
    obs_shape = (15, 15)
    act_dim = 5

    rb = ReplayBuffer(buffer_size,
                      {"obs": {"shape": obs_shape, "dtype": np.ubyte},
                       "act": {"shape": act_dim},
                       "rew": {},
                       "done": {}},
                      next_of="obs")

    self.assertEqual(rb.get_next_index(), 0)
    self.assertEqual(rb.get_stored_size(), 0)

    obs = np.zeros(obs_shape, dtype=np.ubyte)
    act = np.ones(act_dim)
    rew = 1
    done = 0

    rb.add(obs=obs, act=act, rew=rew, next_obs=obs, done=done)

    self.assertEqual(rb.get_next_index(), 1)
    self.assertEqual(rb.get_stored_size(), 1)

    with self.assertRaises(KeyError):
        rb.add(obs=obs)

    self.assertEqual(rb.get_next_index(), 1)
    self.assertEqual(rb.get_stored_size(), 1)

    next_obs = rb.sample(32)["next_obs"]

    for i in range(512):
        obs = np.ones(obs_shape, dtype=np.ubyte) * i
        rb.add(obs=obs, act=act, rew=rew, next_obs=obs + 1, done=done)

    sample = rb._encode_sample(range(buffer_size))
    ith = rb.get_next_index()

    np.testing.assert_allclose(
        np.roll(sample["obs"], -ith - 1, axis=0)[1:],
        np.roll(sample["next_obs"], -ith - 1, axis=0)[:-1])
class buffer_class:
    def __init__(self, max_length, seed_number, env):
        env_dict = create_env_dict(env)
        self.before_add = create_before_add_func(env)
        self.storage = ReplayBuffer(max_length, env_dict)

    def append(self, s, a, r, done, sp):
        self.storage.add(
            **self.before_add(obs=s, act=a, rew=r, done=done, next_obs=sp))

    def sample(self, batch_size):
        batch = self.storage.sample(batch_size)
        s_matrix = batch['obs']
        a_matrix = batch['act']
        r_matrix = batch['rew']
        done_matrix = batch['done']
        sp_matrix = batch['next_obs']
        return s_matrix, a_matrix, r_matrix, done_matrix, sp_matrix

    def __len__(self):
        return self.storage.get_stored_size()
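# A hedged usage sketch (not part of the snippet above) showing how buffer_class
# might be driven. It assumes a classic Gym environment with the old 4-tuple
# step API, which matches the conventions used elsewhere in these snippets;
# create_env_dict and create_before_add_func are the cpprb helpers already
# imported by the original code. Environment name and loop length are illustrative.
import gym

env = gym.make("CartPole-v1")
buf = buffer_class(max_length=10000, seed_number=0, env=env)

s = env.reset()
for _ in range(200):
    a = env.action_space.sample()
    sp, r, done, _ = env.step(a)
    buf.append(s, a, r, done, sp)
    s = env.reset() if done else sp

if len(buf) >= 32:
    states, actions, rewards, dones, next_states = buf.sample(32)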
def main():
    s_dim = 4
    a_dim = 2
    batch_size = 64

    env = "../envs/point_mass2d.xml"
    sim = Simulation(env, s_dim, a_dim, None, False)

    length = 500
    rb = ReplayBuffer(length,
                      env_dict={"obs": {"shape": (s_dim, 1)},
                                "act": {"shape": (a_dim, 1)},
                                "rew": {},
                                "next_obs": {"shape": (s_dim, 1)},
                                "done": {}})

    x = sim.getState()
    for _ in range(length):
        u = np.random.rand(1, a_dim, 1)
        x_next = sim.step(u)
        rb.add(obs=x, act=u, rew=0, next_obs=x_next, done=False)
        x = x_next

    model = NNModel(dt=0.1, state_dim=s_dim, action_dim=a_dim, name="nn_model")

    stamp = datetime.now().strftime("%Y.%m.%d-%H:%M:%S")
    logdir = "../graphs/test_training/{}".format(stamp)
    writer = tf.summary.create_file_writer(logdir)
    log = True

    epochs = 1000
    for e in range(epochs):
        sample = rb.sample(batch_size)
        gt = sample['next_obs']
        x = sample['obs']
        u = sample['act']
        model.train_step(gt, x, u, e, writer, log)
def test(self):
    buffer_size = 256
    obs_dim = 3
    act_dim = 1
    rew_dim = 2

    rb = ReplayBuffer(buffer_size,
                      {"obs": {"shape": obs_dim},
                       "act": {"shape": act_dim},
                       "rew": {"shape": rew_dim},
                       "next_obs": {"shape": obs_dim},
                       "done": {}})

    obs = np.ones(shape=(obs_dim))
    act = np.ones(shape=(act_dim))
    rew = (0, 1)
    next_obs = np.ones(shape=(obs_dim))
    done = 0

    for i in range(500):
        rb.add(obs=obs, act=act, rew=rew, next_obs=next_obs, done=done)

    batch_size = 32
    sample = rb.sample(batch_size)

    self.assertEqual(0, sample["rew"][0, 0])
    self.assertEqual(1, sample["rew"][0, 1])
class Server(Process):
    def __init__(self, size, env_dict, n_step_dict=None,
                 min_storage=10000, done_string="done"):
        super().__init__()
        self.done_string = done_string
        self.queue = Queue()
        self.size = size
        self.client_pipe, self.server_pipe = Pipe()
        self.env_dict = env_dict
        self.n_step_dict = n_step_dict
        self.parameter = None
        self.min_storage = min_storage
        self.cpprb_args = {"size": size, "env_dict": env_dict, "Nstep": n_step_dict}

        # Server lock object
        self.lock = Lock()

    def run(self) -> None:
        self.buffer = CPPRB(**self.cpprb_args)
        while True:
            cmd, *args = self.queue.get()
            if cmd == "add":
                self._add(*args)
            elif cmd == "sample":
                self.server_pipe.send(self._sample(*args))
            elif cmd == "upload":
                self._upload(*args)
            elif cmd == "download":
                self.server_pipe.send(self._download())
            else:
                raise ValueError(
                    f"Parameter Server got an unexpected command {cmd}")

    def _download(self) -> Any:
        return self.parameter

    def _upload(self, parameter: Any) -> None:
        self.parameter = parameter

    def _add(self, data: Dict[str, Sequence[np.ndarray]]) -> None:
        self.buffer.add(**data)

    def _sample(self, size: int) -> Dict[str, np.ndarray]:
        if self.buffer.get_stored_size() < self.min_storage:
            print(
                f"stored sample {self.buffer.get_stored_size()} is smaller than "
                f"minimum storage size {self.min_storage}. Returning None.")
            return None
        else:
            return self.buffer.sample(size)

    def download(self) -> Any:
        cmd = "download"
        self.lock.acquire()
        self.queue.put((cmd, None))
        weights = self.client_pipe.recv()
        self.lock.release()
        return weights

    def upload(self, parameter: Any):
        cmd = "upload"
        self.queue.put((cmd, parameter))

    def add(self, data: Sequence[Dict[str, np.ndarray]]):
        cmd = "add"
        self.queue.put((cmd, data))

    def sample(self, size: int) -> Dict[str, np.ndarray]:
        cmd = "sample"
        self.lock.acquire()
        self.queue.put((cmd, size))
        sample = self.client_pipe.recv()
        self.lock.release()
        return sample
class Server(Process):
    def __init__(self, size, env_dict, min_storage=100):
        super().__init__()
        self.queue = Queue()
        self.size = size
        self.client_pipe, self.server_pipe = Pipe()
        self.env_dict = env_dict
        self.parameter = None
        self.min_storage = min_storage

        # Server lock object
        self.lock = Lock()

    def run(self):
        self.buffer = CPPRB(self.size, env_dict=self.env_dict)
        while True:
            cmd, *args = self.queue.get()
            if cmd == "add":
                self._add(*args)
            elif cmd == "sample":
                self.server_pipe.send(self._sample(*args))
            elif cmd == "upload":
                self._upload(*args)
            elif cmd == "download":
                self.server_pipe.send(self._download())
            else:
                raise ValueError(
                    f"Parameter Server got an unexpected command {cmd}")

    def _download(self):
        return self.parameter

    def _upload(self, parameter):
        self.parameter = parameter

    def _add(self, data):
        for d in data:
            label_array = list(self.env_dict.keys())
            data_dict = {key: value for key, value in zip(label_array, d)}
            self.buffer.add(**data_dict)

    def _sample(self, size):
        if self.buffer.get_stored_size() < self.min_storage:
            print(
                f"stored sample {self.buffer.get_stored_size()} is smaller than "
                f"minimum storage size {self.min_storage}. Returning None.")
            return None
        else:
            return self.buffer.sample(size)

    def download(self):
        cmd = "download"
        self.lock.acquire()
        self.queue.put((cmd, None))
        weights = self.client_pipe.recv()
        self.lock.release()
        return weights

    def upload(self, parameter):
        cmd = "upload"
        self.queue.put((cmd, parameter))

    def add(self, data):
        cmd = "add"
        self.queue.put((cmd, data))

    def sample(self, size):
        cmd = "sample"
        self.lock.acquire()
        self.queue.put((cmd, size))
        sample = self.client_pipe.recv()
        self.lock.release()
        return sample
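# A hedged usage sketch of the Server process above (assumptions: CPPRB is
# cpprb.ReplayBuffer, the multiprocessing primitives are imported in the
# original module, and env_dict / data layout are illustrative). Note that
# this Server's _add zips positional values against env_dict's keys, so each
# transition is passed as a tuple in key order.
import numpy as np

env_dict = {"obs": {"shape": 4}, "act": {"shape": 1},
            "rew": {}, "next_obs": {"shape": 4}, "done": {}}

server = Server(size=10000, env_dict=env_dict, min_storage=1)
server.start()  # run() builds the internal CPPRB buffer in the child process

# One transition as a tuple ordered like env_dict's keys: obs, act, rew, next_obs, done
transition = (np.zeros(4), np.zeros(1), 1.0, np.ones(4), 0)
server.add([transition])

batch = server.sample(1)        # None if fewer than min_storage transitions are stored
server.upload({"step": 0})      # parameters are shared through the same command queue
weights = server.download()

server.terminate()              # the run() loop never returns on its own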
while not done_training.is_set():
    while not exp_queue.empty():
        state = exp_queue.get()
        total_rewards.append(state['rew'])
        global_rb.add(**state)
        del state

    if global_rb.get_stored_size() < params.init_replay:
        continue

    if (datetime.now() - start).seconds > 3:
        mean = np.mean(total_rewards[-100:])
        print(f'{frames.value:7,} done: {episodes.value:5} mean: {mean:.3f}')
        start = datetime.now()

    batch = global_rb.sample(params.batch_size)
    optimizer.zero_grad()
    loss = calc_loss_dqn(batch, net, tgt_net, params.gamma, device, False)
    loss.backward()
    optimizer.step()
    del batch

    if frames.value % params.sync_nets == 0:
        tgt_net.sync()

    if mean > 10:
        done_training.set()

exp_queue.close()
exp_queue.join_thread()

for p in procs:
else:
    Q = tf.squeeze(model(observation.reshape(1, -1)))
    action = np.argmax(Q)

egreedy = decay_egreedy(egreedy)
next_observation, reward, done, info = env.step(action)
rb.add(obs=observation,
       act=action,
       rew=reward,
       next_obs=next_observation,
       done=done)
observation = next_observation

# Uniform sampling
sample = rb.sample(batch_size * m)

with tf.GradientTape() as tape:
    tape.watch(model.trainable_weights)
    Q = Q_func(model,
               tf.constant(sample["obs"]),
               tf.constant(sample["act"].ravel()),
               tf.constant(env.action_space.n))
    target_Q = tf.stop_gradient(
        target_func(model, target_model,
                    tf.constant(sample['next_obs']),
                    tf.constant(sample["rew"].ravel()),
                    tf.constant(sample["done"].ravel()),
                    discount,
                    tf.constant(env.action_space.n)))
    tf.summary.scalar("Target Q", data=tf.reduce_mean(target_Q), step=n_step)
    absTD = tf.math.abs(target_Q - Q)
class MPCTrainer(Trainer):
    def __init__(self,
                 policy,
                 env,
                 args,
                 reward_fn,
                 buffer_size=int(1e6),
                 n_dynamics_model=1,
                 lr=0.001,
                 **kwargs):
        super().__init__(policy, env, args, **kwargs)

        self.dynamics_buffer = ReplayBuffer(
            **self._prepare_dynamics_buffer_dict(buffer_size=buffer_size))
        self._n_dynamics_model = n_dynamics_model

        # Reward function
        self._reward_fn = reward_fn
        self._prepare_dynamics_model(gpu=args.gpu, lr=lr)

    def _prepare_dynamics_buffer_dict(self, buffer_size):
        # Prepare buffer that stores transitions (s, a, s')
        rb_dict = {
            "size": buffer_size,
            "default_dtype": np.float32,
            "env_dict": {
                "obs": {"shape": get_space_size(self._env.observation_space)},
                "next_obs": {"shape": get_space_size(self._env.observation_space)},
                "act": {"shape": get_space_size(self._env.action_space)}
            }
        }
        return rb_dict

    def _prepare_dynamics_model(self, gpu=0, lr=0.001):
        # Dynamics model
        obs_dim = self._env.observation_space.high.size
        act_dim = self._env.action_space.high.size
        self._dynamics_models = [
            DynamicsModel(input_dim=obs_dim + act_dim,
                          output_dim=obs_dim,
                          gpu=gpu)
            for _ in range(self._n_dynamics_model)
        ]
        self._optimizers = [
            tf.keras.optimizers.Adam(learning_rate=lr)
            for _ in range(self._n_dynamics_model)
        ]

    def _set_check_point(self, model_dir):
        # Save and restore model
        if isinstance(self._policy, tf.keras.Model):
            super()._set_check_point(model_dir)

    def __call__(self):
        total_steps = 0
        tf.summary.experimental.set_step(total_steps)

        # Gather dataset of random trajectories
        self.logger.info("Randomly collect {} samples...".format(
            self._n_random_rollout * self._episode_max_steps))
        self.collect_episodes(n_rollout=self._n_random_rollout)

        for i in range(self._max_iter):
            # Train dynamics f(s, a) according to eq.(2)
            mean_loss = self.fit_dynamics(n_epoch=1)

            total_rew = 0.

            # Collect new sample
            obs = self._env.reset()
            for _ in range(self._episode_max_steps):
                total_steps += 1
                act = self._mpc(obs)
                next_obs, rew, done, _ = self._env.step(act)
                self.dynamics_buffer.add(obs=obs, act=act, next_obs=next_obs)

                total_rew += rew
                if done:
                    break
                obs = next_obs

            tf.summary.experimental.set_step(total_steps)
            tf.summary.scalar("mpc/total_rew", total_rew)
            self.logger.info(
                "iter={0: 3d} total_rew: {1:4.4f} loss: {2:2.8f}".format(
                    i, total_rew, mean_loss))

    def predict_next_state(self, obses, acts):
        obs_diffs = np.zeros_like(obses)
        inputs = np.concatenate([obses, acts], axis=1)
        for dynamics_model in self._dynamics_models:
            obs_diffs += dynamics_model.predict(inputs)
        obs_diffs /= self._n_dynamics_model
        return obses + obs_diffs

    def _mpc(self, obs):
        obses = np.tile(obs, (self._n_sample, 1))
        init_actions = self._policy.get_actions(obses)
        total_rewards = np.zeros(shape=(self._n_sample, ))
        for i in range(self._horizon):
            if i == 0:
                acts = init_actions
            else:
                acts = self._policy.get_actions(obses)
            assert obses.shape[0] == acts.shape[0]
            next_obses = self.predict_next_state(obses, acts)
            rewards = self._reward_fn(obses, acts)
            assert rewards.shape == total_rewards.shape
            total_rewards += rewards
            obses = next_obses
        idx = np.argmax(total_rewards)
        return init_actions[idx]

    def _set_from_args(self, args):
        super()._set_from_args(args)
        self._max_iter = args.max_iter
        self._horizon = args.horizon
        self._n_sample = args.n_sample
        self._n_random_rollout = args.n_random_rollout
        self._batch_size = args.batch_size

    def collect_episodes(self, n_rollout=1):
        for _ in range(n_rollout):
            obs = self._env.reset()
            for _ in range(self._episode_max_steps):
                act = self._policy.get_action(obs)
                next_obs, _, done, _ = self._env.step(act)
                self.dynamics_buffer.add(obs=obs, act=act, next_obs=next_obs)
                obs = next_obs
                if done:
                    break

    @tf.function
    def _fit_dynamics_body(self, inputs, labels):
        losses = []
        for dynamics_model, optimizer in zip(self._dynamics_models,
                                             self._optimizers):
            with tf.GradientTape() as tape:
                predicts = dynamics_model(inputs)
                loss = tf.reduce_mean(0.5 * tf.square(labels - predicts))
            grads = tape.gradient(loss, dynamics_model.trainable_variables)
            optimizer.apply_gradients(
                zip(grads, dynamics_model.trainable_variables))
            losses.append(loss)
        return tf.convert_to_tensor(losses)

    def _make_inputs_output_pairs(self, n_epoch):
        samples = self.dynamics_buffer.sample(
            self.dynamics_buffer.get_stored_size())
        inputs = np.concatenate([samples["obs"], samples["act"]], axis=1)
        labels = samples["next_obs"] - samples["obs"]
        return inputs, labels

    def fit_dynamics(self, n_epoch=1):
        inputs, labels = self._make_inputs_output_pairs(n_epoch)

        dataset = tf.data.Dataset.from_tensor_slices((inputs, labels))
        dataset = dataset.batch(self._batch_size)
        dataset = dataset.shuffle(buffer_size=1000)
        dataset = dataset.repeat(n_epoch)

        mean_losses = np.zeros(shape=(self._n_dynamics_model, ),
                               dtype=np.float32)
        for batch, (x, y) in enumerate(dataset):
            _mean_losses = self._fit_dynamics_body(x, y)
            mean_losses += _mean_losses.numpy()
        mean_losses /= (batch + 1)

        for model_idx, mean_loss in enumerate(mean_losses):
            tf.summary.scalar("mpc/model_{}_loss".format(model_idx), mean_loss)
        return np.mean(mean_losses)

    @staticmethod
    def get_argument(parser=None):
        parser = Trainer.get_argument(parser)
        parser.add_argument('--gpu', type=int, default=0, help='GPU id')
        parser.add_argument("--max-iter", type=int, default=100)
        parser.add_argument("--horizon", type=int, default=20)
        parser.add_argument("--n-sample", type=int, default=1000)
        parser.add_argument("--n-random-rollout", type=int, default=1000)
        parser.add_argument("--batch-size", type=int, default=512)
        return parser
def explorer(global_rb, queue, trained_steps, is_training_done,
             lock, env_fn, policy_fn, set_weights_fn, noise_level,
             n_env=64, n_thread=4, buffer_size=1024,
             episode_max_steps=1000, gpu=0):
    """
    Collect transitions and store them to prioritized replay buffer.

    :param global_rb (multiprocessing.managers.AutoProxy[PrioritizedReplayBuffer]):
        Prioritized replay buffer sharing with multiple explorers and only one learner.
        This object is shared over processes, so it must be locked when trying to
        operate something with `lock` object.
    :param queue (multiprocessing.Queue):
        A FIFO shared with the `learner` and `evaluator` to get the latest network weights.
        This is process safe, so you don't need to lock process when use this.
    :param trained_steps (multiprocessing.Value): Number of steps to apply gradients.
    :param is_training_done (multiprocessing.Event):
        multiprocessing.Event object to share the status of training.
    :param lock (multiprocessing.Lock): multiprocessing.Lock to lock other processes.
    :param env_fn (function): Method object to generate an environment.
    :param policy_fn (function): Method object to generate an explorer.
    :param set_weights_fn (function): Method object to set network weights gotten from queue.
    :param noise_level (float): Noise level for exploration. For epsilon-greedy policy like
        DQN variants, this will be epsilon, and if DDPG variants this will be variance
        for Normal distribution.
    :param n_env (int): Number of environments to distribute. If this is set to be more
        than 1, `MultiThreadEnv` will be used.
    :param n_thread (int): Number of thread used in `MultiThreadEnv`.
    :param buffer_size (int): Size of local buffer. If this is filled with transitions,
        add them to `global_rb`
    :param episode_max_steps (int): Maximum number of steps of an episode.
    :param gpu (int): GPU id. If this is set to -1, then this process uses only CPU.
    """
    import_tf()
    logger = logging.getLogger("tf2rl")

    if n_env > 1:
        envs = MultiThreadEnv(env_fn=env_fn, batch_size=n_env,
                              thread_pool=n_thread,
                              max_episode_steps=episode_max_steps)
        env = envs._sample_env
    else:
        env = env_fn()

    policy = policy_fn(env=env, name="Explorer",
                       memory_capacity=global_rb.get_buffer_size(),
                       noise_level=noise_level, gpu=gpu)

    kwargs = get_default_rb_dict(buffer_size, env)
    if n_env > 1:
        kwargs["env_dict"]["priorities"] = {}
    local_rb = ReplayBuffer(**kwargs)

    if n_env == 1:
        s = env.reset()
        episode_steps = 0
        total_reward = 0.
        total_rewards = []
    else:
        obses = envs.py_reset()

    start = time.time()
    n_sample, n_sample_old = 0, 0

    while not is_training_done.is_set():
        if n_env == 1:
            n_sample += 1
            episode_steps += 1
            a = policy.get_action(s)
            s_, r, done, _ = env.step(a)
            done_flag = done
            if episode_steps == env._max_episode_steps:
                done_flag = False
            total_reward += r
            local_rb.add(obs=s, act=a, rew=r, next_obs=s_, done=done_flag)

            s = s_
            if done or episode_steps == episode_max_steps:
                s = env.reset()
                total_rewards.append(total_reward)
                total_reward = 0
                episode_steps = 0
        else:
            n_sample += n_env
            obses = envs.py_observation()
            actions = policy.get_action(obses, tensor=True)
            next_obses, rewards, dones, _ = envs.step(actions)
            td_errors = policy.compute_td_error(states=obses, actions=actions,
                                                next_states=next_obses,
                                                rewards=rewards, dones=dones)
            local_rb.add(obs=obses, act=actions, next_obs=next_obses,
                         rew=rewards, done=dones,
                         priorities=np.abs(td_errors + 1e-6))

        # Periodically copy weights of explorer
        if not queue.empty():
            set_weights_fn(policy, queue.get())

        # Add collected experiences to global replay buffer
        if local_rb.get_stored_size() == buffer_size:
            samples = local_rb.sample(local_rb.get_stored_size())
            if n_env > 1:
                priorities = np.squeeze(samples["priorities"])
            else:
                td_errors = policy.compute_td_error(
                    states=samples["obs"], actions=samples["act"],
                    next_states=samples["next_obs"], rewards=samples["rew"],
                    dones=samples["done"])
                priorities = np.abs(np.squeeze(td_errors)) + 1e-6
            lock.acquire()
            global_rb.add(obs=samples["obs"], act=samples["act"],
                          rew=samples["rew"], next_obs=samples["next_obs"],
                          done=samples["done"], priorities=priorities)
            lock.release()
            local_rb.clear()

            msg = "Grad: {0: 6d}\t".format(trained_steps.value)
            msg += "Samples: {0: 7d}\t".format(n_sample)
            msg += "TDErr: {0:.5f}\t".format(np.average(priorities))
            if n_env == 1:
                ave_rew = 0 if len(total_rewards) == 0 else \
                    sum(total_rewards) / len(total_rewards)
                msg += "AveEpiRew: {0:.3f}\t".format(ave_rew)
                total_rewards = []
            msg += "FPS: {0:.2f}".format(
                (n_sample - n_sample_old) / (time.time() - start))
            logger.info(msg)

            start = time.time()
            n_sample_old = n_sample
class SAC:
    """
    Soft Actor Critic
    Ref: https://arxiv.org/pdf/1812.05905.pdf
    """
    def __init__(self,
                 observation_space,
                 action_space,
                 replay_size=int(1e6),
                 gamma=0.99,
                 tau=0.05,
                 lr=3e-4,
                 alpha=0.2,
                 target_update_interval=1,
                 device='cuda'):
        self.gamma = gamma
        self.tau = tau
        self.alpha = alpha
        self.target_update_interval = target_update_interval
        self.device = device
        self.logger = Logger()

        # Experience replay
        rb_kwargs = get_default_rb_dict(observation_space.shape,
                                        action_space.shape, replay_size)
        self.rb = ReplayBuffer(**rb_kwargs)

        # critic
        self.critic = CriticCNN(obs_dim=observation_space.shape[0],
                                act_dim=action_space.shape[0]).to(self.device)
        self.critic_opt = Adam(self.critic.parameters(), lr=lr)

        # critic target
        self.critic_target = CriticCNN(obs_dim=observation_space.shape[0],
                                       act_dim=action_space.shape[0]).to(self.device)
        self.critic_target.hard_update(self.critic)

        # actor
        self.actor = ActorCNN(obs_dim=observation_space.shape[0],
                              act_dim=action_space.shape[0],
                              action_space=action_space).to(self.device)
        self.actor_opt = Adam(self.actor.parameters(), lr=lr)

        self.target_entropy = -torch.prod(
            torch.Tensor(action_space.shape).to(self.device)).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha_opt = Adam([self.log_alpha], lr=lr)

    def select_action(self, obs, evaluate=False):
        obs = torch.FloatTensor(obs).to(self.device).unsqueeze(0)
        if evaluate is False:
            action, _, _ = self.actor.sample(obs)
        else:
            _, _, action = self.actor.sample(obs)
        return action.detach().cpu().numpy()[0]

    def compute_td_error(self, obs, act, next_obs, rew, done):
        with torch.no_grad():
            next_act, next_log_prob, _ = self.actor.sample(next_obs)
            target_q1, target_q2 = self.critic_target(next_obs, next_act)
            target_q = torch.min(target_q1, target_q2) - self.alpha * next_log_prob
            target_q = rew + ((1 - done) * self.gamma * target_q)

        current_q1, current_q2 = self.critic(obs, act)

        td_error1 = current_q1 - target_q
        td_error2 = current_q2 - target_q
        return td_error1, td_error2

    def critic_loss(self, obs, act, next_obs, rew, done):
        td_error1, td_error2 = self.compute_td_error(obs, act, next_obs, rew, done)

        # Huber loss on both TD errors
        loss1 = huber_loss(td_error1).mean()
        loss2 = huber_loss(td_error2).mean()
        return loss1 + loss2

    def actor_alpha_loss(self, obs):
        act, log_prob, _ = self.actor.sample(obs)
        current_q1, current_q2 = self.critic(obs, act)
        min_q = torch.min(current_q1, current_q2)

        actor_loss = ((self.alpha * log_prob) - min_q).mean()

        # alpha loss
        alpha_loss = -(self.log_alpha *
                       (log_prob + self.target_entropy).detach()).mean()
        return actor_loss, alpha_loss

    def update_critic(self, obs, act, next_obs, rew, done):
        loss = self.critic_loss(obs, act, next_obs, rew, done)

        # update both critics
        self.critic_opt.zero_grad()
        loss.backward(retain_graph=True)
        self.critic_opt.step()
        return loss

    def update_actor_alpha(self, obs):
        actor_loss, alpha_loss = self.actor_alpha_loss(obs)

        # update actor
        self.actor_opt.zero_grad()
        actor_loss.backward()
        self.actor_opt.step()

        # update alpha
        self.alpha_opt.zero_grad()
        alpha_loss.backward()
        self.alpha_opt.step()
        return actor_loss, alpha_loss

    def update_parameters(self, batch_size, updates):
        batch = self.rb.sample(batch_size)

        # to tensor
        obs = torch.FloatTensor(batch['obs']).to(self.device)
        act = torch.FloatTensor(batch['act']).to(self.device)
        next_obs = torch.FloatTensor(batch['next_obs']).to(self.device)
        rew = torch.FloatTensor(batch['rew']).to(self.device)
        done = torch.FloatTensor(batch['done']).to(self.device)

        # update actor & critic & alpha
        critic_loss = self.update_critic(obs, act, next_obs, rew, done)
        actor_loss, alpha_loss = self.update_actor_alpha(obs)

        # apply alpha
        self.alpha = self.log_alpha.exp()

        # update target network
        if updates % self.target_update_interval == 0:
            self.critic_target.soft_update(self.critic, self.tau)

        return critic_loss, actor_loss, alpha_loss, self.alpha.clone()

    def load_model(self, actor, critic):
        self.actor = actor
        self.critic = critic
class Agent:
    def __init__(self, lr, state_shape, num_actions, batch_size,
                 max_mem_size=1000):
        self.lr = lr
        self.gamma = 0.99
        self.action_space = list(range(num_actions))
        self.batch_size = batch_size
        self.target_update_interval = 200
        self.step_count = 0
        self.epsilon = Lerper(start=1.0, end=0.01, num_steps=2000)

        self.memory = ReplayBuffer(max_mem_size,
                                   {"obs": {"shape": state_shape},
                                    "act": {"shape": 1},
                                    "rew": {},
                                    "next_obs": {"shape": state_shape},
                                    "done": {"shape": 1}})

        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        # self.device = torch.device("cpu")

        self.V_MIN, self.V_MAX = 0, 200
        self.NUM_ATOMS = 4
        self.support = torch.linspace(self.V_MIN, self.V_MAX,
                                      self.NUM_ATOMS).to(self.device)

        self.net = Network(lr, state_shape, num_actions, self.support,
                           self.NUM_ATOMS).to(self.device)
        self.net_ = Network(lr, state_shape, num_actions, self.support,
                            self.NUM_ATOMS).to(self.device)
        self.net_.load_state_dict(self.net.state_dict())

    def choose_action(self, observation):
        if np.random.random() > self.epsilon.value():
            state = torch.tensor(observation).float().detach()
            state = state.to(self.device)
            state = state.unsqueeze(0)

            q_values = self.net(state)
            action = torch.argmax(q_values).item()
            return action
        else:
            return np.random.choice(self.action_space)

    def store_memory(self, state, action, reward, next_state, done):
        self.memory.add(obs=state, act=action, rew=reward,
                        next_obs=next_state, done=done)

    def learn(self):
        if self.memory.get_stored_size() < self.batch_size:
            return

        batch = self.memory.sample(self.batch_size)

        states = torch.tensor(batch["obs"]).to(self.device)
        actions = torch.tensor(batch["act"], dtype=torch.int64).to(self.device).T[0]
        rewards = torch.tensor(batch["rew"]).to(self.device)
        states_ = torch.tensor(batch["next_obs"]).to(self.device)
        dones = torch.tensor(batch["done"], dtype=torch.float32).to(self.device)

        batch_index = np.arange(self.batch_size, dtype=np.int64)

        # the difference between each reward quanta
        delta_z = float(self.V_MAX - self.V_MIN) / (self.NUM_ATOMS - 1)  # 28.571428571428573

        with torch.no_grad():
            qs_ = self.net_(states_)                      # [64, 2]
            actions_ = qs_.argmax(dim=1)                  # [64]
            dists_ = self.net_.dist(states_)              # [64, 2, 8]
            action_dist_ = dists_[batch_index, actions_]  # [64, 8]
            # print(action_dist_)
            # print(action_dist_.shape)
            # quit()

            # done    [64, 1]
            # reward  [64, 1]
            # support [51]
            print("support")
            print(self.support)
            print(self.support.shape)

            t_z = rewards + (1 - dones) * self.gamma * self.support  # shape=[64,8]
            # t_z = torch.tensor((self.batch_size,)).to(self.device) * self.support
            t_z = torch.zeros((self.batch_size, self.NUM_ATOMS)).to(self.device)
            tzindxs = np.arange(6)
            t_z[tzindxs] = self.support
            print("t-z")
            print(t_z)
            print(t_z.shape)
            # quit()

            # normalization
            t_z = t_z.clamp(min=self.V_MIN, max=self.V_MAX)
            b = (t_z - self.V_MIN) / delta_z  # quantize
            l = b.floor().long()  # indices
            u = b.ceil().long()   # offsets to the closest reward bracket
            print(t_z)
            print(t_z.shape)
            # quit()
            print(b)
            print(b.shape)
            print(l)
            print(l.shape)
            print(u)
            print(u.shape)
            # quit()

            # this is a giant indexing array
            offset = (  # [64,8]  [[0..0],[8..8],[16..16],...,[504..504]]
                torch.linspace(0, (self.batch_size - 1) * self.NUM_ATOMS,
                               self.batch_size).long().unsqueeze(1).expand(
                                   self.batch_size, self.NUM_ATOMS).to(self.device))
            print("\noffset")
            print(offset)
            print(offset.shape)

            frac = u.float() - b      # percentages, decreasing, axis = 1
            dec_frac = b - l.float()  # percentages, increasing, axis = 1
            # print(something_else)
            # print(something_else.shape)
            # quit()

            action_dist_ = torch.ones((self.batch_size, self.NUM_ATOMS)).to(self.device)
            proj_dist = torch.zeros(action_dist_.size(), device=self.device)  # [64,8]
            print("proj_dist")
            print(proj_dist)
            print(proj_dist.shape)
            print("action_dist_")
            print(action_dist_)
            print(action_dist_.shape)
            # print(frac)
            # print(frac.shape)
            print("l")
            print(l)
            print(l.shape)
            print("offset")
            print(offset)
            print(offset.shape)

            proj_dist.view(-1).index_add_(  # [64,8]
                0, (l + offset).view(-1),
                (action_dist_).view(-1)  # (action_dist_ * frac).view(-1)
            )
            print("RESULT: proj_dist")
            print(proj_dist)
            print(proj_dist.shape)

            proj_dist.view(-1).index_add_(  # [64,8]
                0, (u + offset).view(-1),
                (action_dist_).view(-1)  # (action_dist_ * dec_frac).view(-1)
            )
            print("proj_dist")
            print(proj_dist)
            print(proj_dist.shape)
            quit()
            # print(dec_frac)
            # print(dec_frac.shape)
            # quit()

        # print(actions)
        # print(actions.shape)
        # quit()

        dists = self.net.dist(states)  # [64,2,8]
        log_p = torch.log(dists[batch_index, actions])

        loss = -(proj_dist * log_p).sum(1).mean()

        self.net.optimizer.zero_grad()
        loss.backward()
        self.net.optimizer.step()

        self.epsilon.step()

        self.step_count += 1
        if self.step_count % self.target_update_interval == 0:
            print("targnet update!!")
            self.net_.load_state_dict(self.net.state_dict())

        return loss
time_step = env.reset()
state = np.concatenate(
    [time_step.observation[key] for key in list(time_step.observation.keys())])
score = 0
for t in range(int(max_t)):
    action = agent.get_action(state)
    time_step = env.step(action)
    reward, done = time_step.reward, time_step.last()
    next_state = np.concatenate(
        [time_step.observation[key] for key in list(time_step.observation.keys())])

    # Learn, if enough samples are available in memory
    if rb.get_stored_size() > BATCH_SIZE:
        data = rb.sample(BATCH_SIZE)
        states = data['obs']
        actions = data['act']
        rewards = data['rew']
        next_states = data['next_obs']
        dones = data['done']
        actor_loss, critic_loss, _ = agent.train(states, actions, next_states,
                                                 rewards, dones)
        with summary_writer.as_default():
            tf.summary.scalar(name="actor_loss", data=actor_loss, step=t)
            tf.summary.scalar(name="critic_loss", data=critic_loss, step=t)
def explorer(global_rb, queue, trained_steps, n_transition,
             is_training_done, lock, env_fn, policy_fn,
             buffer_size=1024, max_transition=None,
             episode_max_steps=1000):
    """
    Collect transitions and store them to prioritized replay buffer.

    Args:
        global_rb:
            Prioritized replay buffer sharing with multiple explorers and only one learner.
            This object is shared over processes, so it must be locked when trying to
            operate something with `lock` object.
        queue:
            A FIFO shared with the learner to get latest network parameters.
            This is process safe, so you don't need to lock process when use this.
        trained_steps:
            Number of steps to apply gradients.
        n_transition:
            Number of collected transitions.
        is_training_done:
            multiprocessing.Event object to share the status of training.
        lock:
            multiprocessing.Lock to lock other processes. You must release after process is done.
        env_fn:
            Method object to generate an environment.
        policy_fn:
            Method object to generate an explorer.
        buffer_size:
            Size of local buffer. If it is filled with transitions, add them to `global_rb`
        max_transition:
            Maximum number of steps to explorer. Default value is None.
        episode_max_steps:
            Maximum number of steps of an episode.
    """
    env = env_fn()
    policy = policy_fn(env, "Explorer", global_rb.get_buffer_size())
    local_rb = ReplayBuffer(obs_shape=env.observation_space.shape,
                            act_dim=env.action_space.low.size,
                            size=buffer_size)

    s = env.reset()
    episode_steps = 0
    total_reward = 0.
    total_rewards = []
    start = time.time()
    sample_at_start = 0

    while not is_training_done.is_set():
        # Periodically copy weights of explorer
        if not queue.empty():
            actor_weights, critic_weights, critic_target_weights = queue.get()
            update_target_variables(policy.actor.weights, actor_weights, tau=1.)
            update_target_variables(policy.critic.weights, critic_weights, tau=1.)
            update_target_variables(policy.critic_target.weights,
                                    critic_target_weights, tau=1.)

        n_transition.value += 1
        episode_steps += 1
        a = policy.get_action(s)
        s_, r, done, _ = env.step(a)
        done_flag = done
        if episode_steps == env._max_episode_steps:
            done_flag = False
        total_reward += r
        local_rb.add(s, a, r, s_, done_flag)

        s = s_
        if done or episode_steps == episode_max_steps:
            s = env.reset()
            total_rewards.append(total_reward)
            total_reward = 0
            episode_steps = 0

        # Add collected experiences to global replay buffer
        if local_rb.get_stored_size() == buffer_size - 1:
            temp_n_transition = n_transition.value
            samples = local_rb.sample(local_rb.get_stored_size())
            states, next_states, actions, rewards, done = \
                samples["obs"], samples["next_obs"], samples["act"], \
                samples["rew"], samples["done"]
            done = np.array(done, dtype=np.float64)
            td_errors = policy.compute_td_error(
                states, actions, next_states, rewards, done)
            print("Grad: {0: 6d}\tSamples: {1: 7d}\tTDErr: {2:.5f}\tAveEpiRew: {3:.3f}\tFPS: {4:.2f}".format(
                trained_steps.value, n_transition.value,
                np.average(np.abs(td_errors).flatten()),
                sum(total_rewards) / len(total_rewards),
                (temp_n_transition - sample_at_start) / (time.time() - start)))
            total_rewards = []
            lock.acquire()
            global_rb.add(
                states, actions, rewards, next_states, done,
                priorities=np.abs(td_errors) + 1e-6)
            lock.release()
            local_rb.clear()
            start = time.time()
            sample_at_start = n_transition.value

        if max_transition is not None and n_transition.value >= max_transition:
            is_training_done.set()
    if use_prioritized_rb and use_nstep_rb:
        kwargs["n_step"] = n_step
        kwargs["discount"] = policy.discount
        return NstepPrioritizedReplayBuffer(**kwargs)

    # prioritized
    if use_prioritized_rb:
        return PrioritizedReplayBuffer(**kwargs)

    # N-step
    if use_nstep_rb:
        kwargs["n_step"] = n_step
        kwargs["discount"] = policy.discount
        return NstepReplayBuffer(**kwargs)

    if isinstance(kwargs["act_dim"], tuple):
        kwargs["act_dim"] = kwargs["act_dim"][0]
    return ReplayBuffer(**kwargs)


if __name__ == '__main__':
    from cpprb import ReplayBuffer
    import numpy as np

    rb = ReplayBuffer(obs_dim=3, act_dim=3, size=10)
    for i in range(10):
        obs_act = np.array([i for _ in range(3)], dtype=np.float64)
        print(obs_act)
        rb.add(obs=obs_act, act=obs_act, next_obs=obs_act,
               rew=float(i), done=False)
    print(rb.sample(10))
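# For comparison, a hedged sketch (not from the original project) of the same
# loop written against the dictionary-based ReplayBuffer constructor used in
# the other snippets above; shapes and sizes mirror the legacy
# obs_dim/act_dim/size call.
from cpprb import ReplayBuffer
import numpy as np

rb = ReplayBuffer(10, {"obs": {"shape": 3}, "act": {"shape": 3},
                       "rew": {}, "next_obs": {"shape": 3}, "done": {}})

for i in range(10):
    obs_act = np.full(3, i, dtype=np.float64)
    rb.add(obs=obs_act, act=obs_act, next_obs=obs_act, rew=float(i), done=False)

print(rb.sample(10))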