def doTestDDPG(self):
    np.random.seed(0)
    env = gym.make("Pendulum-v0")
    env.seed(0)
    ddpg_g = tf.Graph()
    with ddpg_g.as_default():
        tf.set_random_seed(123)
        agent = agents[DDPG_AGENT_CONFIG["type"]](
            env.observation_space,
            env.action_space,
            DDPG_AGENT_CONFIG,
            DDPG_MODEL_CONFIG,
            distributed_spec={})

        reward_window = WindowStat("reward", 25)
        obs, actions, rewards, next_obs, dones = \
            list(), list(), list(), list(), list()
        act_count = 0

        for i in range(200):
            ob = env.reset()
            done = False
            episode_reward = .0

            while not done:
                action, results = agent.act(
                    [ob], False, use_perturbed_action=False)
                act_count += 1
                next_ob, reward, done, info = env.step(action[0])
                obs.append(ob)
                actions.append(action[0])
                rewards.append(0.1 * reward)
                next_obs.append(next_ob)
                dones.append(done)

                if agent.ready_to_send:
                    agent.send_experience(
                        obs=obs,
                        actions=actions,
                        rewards=rewards,
                        dones=dones,
                        next_obs=next_obs)
                if agent.ready_to_receive:
                    batch_data = agent.receive_experience()
                    res = agent.learn(batch_data)
                    if DDPG_AGENT_CONFIG.get("prioritized_replay", False):
                        agent.update_priorities(
                            indexes=batch_data["indexes"],
                            td_error=res["td_error"])

                ob = next_ob
                episode_reward += reward
                if act_count % 1024 == 0:
                    print("timestep:", act_count, reward_window)

            agent.add_episode(1)
            reward_window.push(episode_reward)

    return reward_window.stats()["reward_mean"]
def doTestPPO(self):
    env = gym.make("CartPole-v0")
    env.seed(0)
    ppo_g = tf.Graph()
    with ppo_g.as_default():
        tf.set_random_seed(123)
        agent = agents[PPO_AGENT_CONFIG["type"]](
            env.observation_space,
            env.action_space,
            PPO_AGENT_CONFIG,
            PPO_MODEL_CONFIG,
            distributed_spec={})

        reward_window = WindowStat("reward", 25)
        obs, actions, rewards, next_obs, dones, value_preds, logits = \
            list(), list(), list(), list(), list(), list(), list()
        act_count = 0

        for i in range(300):
            ob = env.reset()
            done = False
            episode_reward = .0

            while not done:
                action, results = agent.act([ob], False)
                next_ob, reward, done, info = env.step(action[0])
                act_count += 1
                obs.append(ob)
                actions.append(action[0])
                rewards.append(0.1 * reward)
                next_obs.append(next_ob)
                dones.append(done)
                logits.append(results["logits"][0])
                value_preds.append(results["value_preds"][0])

                if agent.ready_to_send:
                    agent.send_experience(
                        obs=obs,
                        actions=actions,
                        rewards=rewards,
                        dones=dones,
                        next_obs=next_obs,
                        value_preds=value_preds,
                        logits=logits)
                if agent.ready_to_receive:
                    batch_data = agent.receive_experience()
                    res = agent.learn(batch_data)

                ob = next_ob
                episode_reward += reward
                if act_count % 1024 == 0:
                    print("timestep:", act_count, reward_window)

            reward_window.push(episode_reward)

    return reward_window.stats()["reward_mean"]
class PrioritizedReplayBuffer(ReplayBuffer):
    def __init__(self, size, alpha):
        """Create Prioritized Replay buffer.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows, the oldest memories are dropped.
        alpha: float
            How much prioritization is used
            (0 - no prioritization, 1 - full prioritization).

        See Also
        --------
        ReplayBuffer.__init__
        """
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha > 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
        self._prio_change_stats = WindowStat("reprio", 1000)
        self._debug_cost = 0

    def add(self, obs, actions, rewards, dones, next_obs, weights, **kwargs):
        """See ReplayBuffer.store_effect."""
        super(PrioritizedReplayBuffer, self).add(
            obs=obs,
            actions=actions,
            rewards=rewards,
            dones=dones,
            next_obs=next_obs,
            **{})
        if weights is None:
            weights = self._max_priority
            constant_weight = weights**self._alpha
            for idx in self._cover_indices:
                self._it_sum[idx] = constant_weight
                self._it_min[idx] = constant_weight
        else:
            weights = np.power(weights, self._alpha)
            for n, idx in enumerate(self._cover_indices):
                self._it_sum[idx] = weights[n]
                self._it_min[idx] = weights[n]

    def _sample_proportional(self, batch_size):
        res = []
        sum_value = self._it_sum.sum(0, len(self))
        mass = np.random.random(size=batch_size) * sum_value
        for i in range(batch_size):
            # TODO(szymon): should we ensure no repeats?
            idx = self._it_sum.find_prefixsum_idx(mass[i])
            res.append(idx)
        return res

    def sample(self, batch_size, beta):
        """Sample a batch of experiences.

        Compared to ReplayBuffer.sample, it also returns importance weights
        and the indexes of the sampled experiences.

        Parameters
        ----------
        batch_size: int
            How many transitions to sample.
        beta: float
            To what degree to use importance weights
            (0 - no corrections, 1 - full correction).

        Returns
        -------
        obs_batch: np.array
            batch of observations
        act_batch: np.array
            batch of actions executed given obs_batch
        rew_batch: np.array
            rewards received as results of executing act_batch
        next_obs_batch: np.array
            next set of observations seen after executing act_batch
        done_mask: np.array
            done_mask[i] = 1 if executing act_batch[i] resulted in the end
            of an episode and 0 otherwise.
        weights: np.array
            Array of shape (batch_size,) and dtype np.float32 denoting the
            importance weight of each sampled transition.
        idxes: np.array
            Array of shape (batch_size,) and dtype np.int32 giving the
            indexes in the buffer of the sampled experiences.
        """
        assert beta > 0
        self._num_sampled += batch_size

        start = time.time()
        idxes = self._sample_proportional(batch_size)
        self._debug_cost += time.time() - start

        sum_value = self._it_sum.sum()
        weights = []
        p_min = self._it_min.min() / sum_value
        max_weight = (p_min * len(self))**(-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / sum_value
            weight = (p_sample * len(self))**(-beta)
            weights.append(weight / max_weight)
        weights = np.asarray(weights)
        encoded_sample = self._encode_sample(idxes)
        encoded_sample["weights"] = weights
        encoded_sample["indexes"] = idxes
        return encoded_sample

    def update_priorities(self, indexes, priorities):
        """Update priorities of sampled transitions.

        Sets the priority of the transition at index indexes[i] in the buffer
        to priorities[i].

        Parameters
        ----------
        indexes: [int]
            List of indexes of sampled transitions.
        priorities: [float]
            List of updated priorities corresponding to the transitions at
            the sampled indexes denoted by `indexes`.
        """
        assert len(indexes) == len(priorities)
        pvs = np.power(priorities, self._alpha).astype(np.float64)
        for idx, priority, pv in zip(indexes, priorities, pvs):
            assert priority > 0
            assert 0 <= idx < len(self)
            delta = pv - self._it_sum[idx]
            self._prio_change_stats.push(delta)
            self._it_sum[idx] = pv
            self._it_min[idx] = pv

        self._max_priority = max(self._max_priority, np.max(priorities))

    def stats(self, debug=False):
        parent = ReplayBuffer.stats(self, debug)
        if debug:
            parent.update(self._prio_change_stats.stats())
        return parent
class ReplayBuffer(object):
    """Basic replay buffer.

    Supports O(1) `add` and O(1) `sample` operations (w.r.t. each transition).
    The buffer is implemented as a fixed-length list whose insertion index
    wraps back to zero once the end of the list is reached.
    """

    def __init__(self, size):
        """Create the replay buffer.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows, the oldest memories are dropped.
        """
        self._maxsize = size
        self._next_idx = 0
        self._hit_count = np.zeros(size)
        self._eviction_started = False
        self._num_added = 0
        self._num_sampled = 0
        self._evicted_hit_stats = WindowStat("evicted_hit", 1000)
        self._est_size_bytes = 0
        self._extra_fields = None
        self._first_add = True

    def __len__(self):
        return min(self._num_added, self._maxsize)

    def add(self,
            obs,
            actions,
            rewards,
            dones,
            next_obs=None,
            weights=None,
            **kwargs):
        batch_size = np.shape(rewards)[0]
        assert batch_size < self._maxsize, \
            "size of data added in buffer is too big at once"
        truncated_size = min(batch_size, self._maxsize - self._next_idx)
        extra_size = max(0, batch_size - (self._maxsize - self._next_idx))

        if self._extra_fields is None:
            self._extra_fields = list(kwargs.keys())
        if self._first_add:
            self._obs = np.zeros(
                shape=((self._maxsize, ) + np.shape(obs)[1:]),
                dtype=obs.dtype)
            self._actions = np.zeros(
                shape=((self._maxsize, ) + np.shape(actions)[1:]),
                dtype=actions.dtype)
            self._rewards = np.zeros(
                shape=(self._maxsize, ), dtype=np.float32)
            if next_obs is not None:
                self._next_obs = np.zeros(
                    shape=((self._maxsize, ) + np.shape(next_obs)[1:]),
                    dtype=next_obs.dtype)
            if weights is not None:
                self._weights = np.zeros(
                    shape=((self._maxsize, )), dtype=np.float32)
            self._dones = np.zeros(shape=(self._maxsize, ), dtype=np.float32)
            self._extras = {
                name: np.zeros(
                    shape=((self._maxsize, ) + np.shape(kwargs[name])[1:]),
                    dtype=kwargs[name].dtype)
                for name in self._extra_fields
            }
            self._first_add = False

        self._num_added += batch_size
        #if self._num_added <= self._maxsize:
        #self._est_size_bytes += sum(sys.getsizeof(d) for d in data)

        self._obs[self._next_idx:self._next_idx +
                  truncated_size] = obs[:truncated_size]
        self._actions[self._next_idx:self._next_idx +
                      truncated_size] = actions[:truncated_size]
        self._rewards[self._next_idx:self._next_idx +
                      truncated_size] = rewards[:truncated_size]
        self._dones[self._next_idx:self._next_idx +
                    truncated_size] = dones[:truncated_size]
        if next_obs is not None:
            self._next_obs[self._next_idx:self._next_idx +
                           truncated_size] = next_obs[:truncated_size]
        if weights is not None:
            self._weights[self._next_idx:self._next_idx +
                          truncated_size] = weights[:truncated_size]
        for name in self._extras.keys():
            self._extras[name][self._next_idx:self._next_idx +
                               truncated_size] = kwargs[name][:truncated_size]

        if extra_size > 0:
            self._obs[:extra_size] = obs[truncated_size:]
            self._actions[:extra_size] = actions[truncated_size:]
            self._rewards[:extra_size] = rewards[truncated_size:]
            self._dones[:extra_size] = dones[truncated_size:]
            if next_obs is not None:
                self._next_obs[:extra_size] = next_obs[truncated_size:]
            if weights is not None:
                self._weights[:extra_size] = weights[truncated_size:]
            for name in self._extras.keys():
                self._extras[name][:extra_size] = kwargs[name][truncated_size:]

        if self._next_idx + batch_size >= self._maxsize:
            self._eviction_started = True
        self._cover_indices = [
            self._next_idx + i for i in range(truncated_size)
        ]
        if extra_size > 0:
            self._cover_indices += [i for i in range(extra_size)]
        self._next_idx = (self._next_idx + batch_size) % self._maxsize
        if self._eviction_started:
            for i in self._cover_indices:
                self._evicted_hit_stats.push(self._hit_count[i])
                self._hit_count[i] = 0

    def _encode_sample(self, idxes):
        idxes = np.asarray(idxes)
        obs = np.take(self._obs, indices=idxes, axis=0)
        actions = np.take(self._actions, indices=idxes, axis=0)
        rewards = np.take(self._rewards, indices=idxes, axis=0)
        next_obs = np.take(self._next_obs, indices=idxes, axis=0)
        dones = np.take(self._dones, indices=idxes, axis=0)
        batch_data = dict(
            obs=obs,
            actions=actions,
            rewards=rewards,
            dones=dones,
            next_obs=next_obs)
        return batch_data

    def sample(self, batch_size):
        """Sample a batch of experiences.

        Parameters
        ----------
        batch_size: int
            How many transitions to sample.

        Returns
        -------
        obs_batch: np.array
            batch of observations
        act_batch: np.array
            batch of actions executed given obs_batch
        rew_batch: np.array
            rewards received as results of executing act_batch
        next_obs_batch: np.array
            next set of observations seen after executing act_batch
        done_mask: np.array
            done_mask[i] = 1 if executing act_batch[i] resulted in the end
            of an episode and 0 otherwise.
        """
        idxes = np.random.randint(
            0,
            min(self._num_added, self._maxsize) - 1,
            size=(batch_size, ))
        self._num_sampled += batch_size
        return self._encode_sample(idxes)

    def stats(self, debug=False):
        data = {
            "added_count": self._num_added,
            "sampled_count": self._num_sampled,
            "est_size_bytes": self._est_size_bytes,
            "num_entries": len(self),
        }
        if debug:
            data.update(self._evicted_hit_stats.stats())
        return data
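# Usage sketch (illustrative, not part of the original module): exercises the
# ReplayBuffer defined above with randomly generated CartPole-like batches.
# The observation/action shapes are assumptions; the buffer only requires
# that the leading dimension of every array is the batch size.
def _replay_buffer_usage_sketch():
    buf = ReplayBuffer(size=1000)
    batch = 32
    obs = np.random.rand(batch, 4).astype(np.float32)
    actions = np.random.randint(0, 2, size=(batch, )).astype(np.int32)
    rewards = np.ones(shape=(batch, ), dtype=np.float32)
    next_obs = np.random.rand(batch, 4).astype(np.float32)
    dones = np.zeros(shape=(batch, ), dtype=np.float32)

    # `add` stores a whole batch at once; once the buffer is full, writes
    # wrap around and overwrite the oldest transitions.
    buf.add(obs=obs, actions=actions, rewards=rewards, dones=dones,
            next_obs=next_obs)

    # `sample` returns a dict keyed by obs/actions/rewards/dones/next_obs.
    batch_data = buf.sample(batch_size=8)
    print(batch_data["obs"].shape, buf.stats())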
def doTestCkpt(self):
    trial_timestamp = time.strftime("%Y%m%d-%H%M%S")
    np.random.seed(0)
    env = gym.make("CartPole-v0")
    env.seed(0)
    dqn_g = tf.Graph()
    with dqn_g.as_default():
        tf.set_random_seed(123)
        agent = agents[DQN_AGENT_CONFIG["type"]](
            env.observation_space,
            env.action_space,
            DQN_AGENT_CONFIG,
            DQN_MODEL_CONFIG,
            checkpoint_dir="ckpt_dir_{}".format(trial_timestamp),
            distributed_spec={})

        reward_window = WindowStat("reward", 50)
        obs, actions, rewards, next_obs, dones = \
            list(), list(), list(), list(), list()
        act_count = 0

        for i in range(500):
            ob = env.reset()
            done = False
            episode_reward = .0

            while not done:
                action, results = agent.act(
                    [ob], deterministic=False, use_perturbed_action=False)
                next_ob, reward, done, info = env.step(action[0])
                act_count += 1
                obs.append(ob)
                actions.append(action[0])
                rewards.append(reward)
                next_obs.append(next_ob)
                dones.append(done)

                if agent.ready_to_send:
                    agent.send_experience(
                        obs=obs,
                        actions=actions,
                        rewards=rewards,
                        next_obs=next_obs,
                        dones=dones)
                if agent.ready_to_receive:
                    batch_data = agent.receive_experience()
                    res = agent.learn(batch_data)
                    if DQN_AGENT_CONFIG.get("prioritized_replay", False):
                        agent.update_priorities(
                            indexes=batch_data["indexes"],
                            td_error=res["td_error"])

                ob = next_ob
                episode_reward += reward
                if act_count % 1024 == 0:
                    print("timestep:", act_count, reward_window)

            agent.add_episode(1)
            reward_window.push(episode_reward)

        prev_perf = reward_window.stats()["reward_mean"]
        print("Performance before saving is {}".format(prev_perf))

    new_dqn_g = tf.Graph()
    with new_dqn_g.as_default():
        agent = agents[DQN_AGENT_CONFIG["type"]](
            env.observation_space,
            env.action_space,
            DQN_AGENT_CONFIG,
            DQN_MODEL_CONFIG,
            checkpoint_dir="ckpt_dir_{}".format(trial_timestamp),
            distributed_spec={})

        reward_window = WindowStat("reward", 10)
        ob = env.reset()
        for i in range(10):
            ob = env.reset()
            done = False
            episode_reward = .0

            while not done:
                action, results = agent.act(
                    [ob], deterministic=True, use_perturbed_action=False)
                next_ob, reward, done, info = env.step(action[0])
                act_count += 1
                ob = next_ob
                episode_reward += reward

            agent.add_episode(1)
            reward_window.push(episode_reward)

        cur_perf = reward_window.stats()["reward_mean"]
        print("Performance after restore is {}".format(cur_perf))

    return prev_perf - cur_perf
def doTestSavedModel(self):
    trial_timestamp = time.strftime("%Y%m%d-%H%M%S")
    model_dir = "model_dir_{}".format(trial_timestamp)
    os.system("mkdir {}".format(model_dir))
    np.random.seed(0)
    env = gym.make("CartPole-v0")
    env.seed(0)
    dqn_g = tf.Graph()
    with dqn_g.as_default():
        tf.set_random_seed(123)
        agent = agents[DQN_AGENT_CONFIG["type"]](
            env.observation_space,
            env.action_space,
            DQN_AGENT_CONFIG,
            DQN_MODEL_CONFIG,
            export_dir=model_dir,
            distributed_spec={})

        reward_window = WindowStat("reward", 50)
        obs, actions, rewards, next_obs, dones = \
            list(), list(), list(), list(), list()
        act_count = 0

        for i in range(500):
            ob = env.reset()
            done = False
            episode_reward = .0

            while not done:
                action, results = agent.act(
                    [ob], deterministic=False, use_perturbed_action=False)
                next_ob, reward, done, info = env.step(action[0])
                act_count += 1
                obs.append(ob)
                actions.append(action[0])
                rewards.append(reward)
                next_obs.append(next_ob)
                dones.append(done)

                if agent.ready_to_send:
                    agent.send_experience(
                        obs=obs,
                        actions=actions,
                        rewards=rewards,
                        next_obs=next_obs,
                        dones=dones)
                if agent.ready_to_receive:
                    batch_data = agent.receive_experience()
                    res = agent.learn(batch_data)
                    if DQN_AGENT_CONFIG.get("prioritized_replay", False):
                        agent.update_priorities(
                            indexes=batch_data["indexes"],
                            td_error=res["td_error"])

                ob = next_ob
                episode_reward += reward
                if act_count % 1024 == 0:
                    print("timestep:", act_count, reward_window)

            agent.add_episode(1)
            reward_window.push(episode_reward)

        prev_perf = reward_window.stats()["reward_mean"]
        print("Performance before saving is {}".format(prev_perf))

    with tf.Session() as sess:
        path = model_dir
        MetaGraphDef = tf.saved_model.loader.load(
            sess, tags=[sm.tag_constants.SERVING], export_dir=path)

        # get SignatureDef protobuf
        SignatureDef_d = MetaGraphDef.signature_def
        SignatureDef = SignatureDef_d["predict_results"]

        # get inputs/outputs TensorInfo protobuf
        ph_inputs = {}
        for name, ts_info in SignatureDef.inputs.items():
            ph_inputs[name] = sm.utils.get_tensor_from_tensor_info(
                ts_info, sess.graph)
        outputs = {}
        for name, ts_info in SignatureDef.outputs.items():
            outputs[name] = sm.utils.get_tensor_from_tensor_info(
                ts_info, sess.graph)

        for name, ph in ph_inputs.items():
            print(name, ph)
        for name, ts in outputs.items():
            print(name, ts)

        reward_window = WindowStat("reward", 10)
        for i in range(10):
            ob = env.reset()
            done = False
            episode_reward = .0

            while not done:
                action = sess.run(
                    outputs["output_actions"],
                    feed_dict={
                        ph_inputs["obs_ph"]: [np.asarray(ob)],
                        ph_inputs["deterministic_ph"]: True
                    })
                next_ob, reward, done, info = env.step(action[0])
                episode_reward += reward
                ob = next_ob

            reward_window.push(episode_reward)

        cur_perf = reward_window.stats()["reward_mean"]
        print("Performance after restore is {}".format(cur_perf))

    return prev_perf - cur_perf