def doTestDDPG(self): np.random.seed(0) env = gym.make("Pendulum-v0") env.seed(0) ddpg_g = tf.Graph() with ddpg_g.as_default(): tf.set_random_seed(123) agent = agents[DDPG_AGENT_CONFIG["type"]]( env.observation_space, env.action_space, DDPG_AGENT_CONFIG, DDPG_MODEL_CONFIG, distributed_spec={}) reward_window = WindowStat("reward", 25) obs, actions, rewards, next_obs, dones = list(), list(), list(), list( ), list() act_count = 0 for i in range(200): ob = env.reset() done = False episode_reward = .0 while not done: action, results = agent.act( [ob], False, use_perturbed_action=False) act_count += 1 next_ob, reward, done, info = env.step(action[0]) obs.append(ob) actions.append(action[0]) rewards.append(0.1 * reward) next_obs.append(next_ob) dones.append(done) if agent.ready_to_send: agent.send_experience( obs=obs, actions=actions, rewards=rewards, dones=dones, next_obs=next_obs) if agent.ready_to_receive: batch_data = agent.receive_experience() res = agent.learn(batch_data) if DDPG_AGENT_CONFIG.get("prioritized_replay", False): agent.update_priorities( indexes=batch_data["indexes"], td_error=res["td_error"]) ob = next_ob episode_reward += reward if act_count % 1024 == 0: print("timestep:", act_count, reward_window) agent.add_episode(1) reward_window.push(episode_reward) return reward_window.stats()["reward_mean"]
def doTestPPO(self): env = gym.make("CartPole-v0") env.seed(0) ppo_g = tf.Graph() with ppo_g.as_default(): tf.set_random_seed(123) agent = agents[PPO_AGENT_CONFIG["type"]]( env.observation_space, env.action_space, PPO_AGENT_CONFIG, PPO_MODEL_CONFIG, distributed_spec={}) reward_window = WindowStat("reward", 25) obs, actions, rewards, next_obs, dones, value_preds, logits = list( ), list(), list(), list(), list(), list(), list() act_count = 0 for i in range(300): ob = env.reset() done = False episode_reward = .0 while not done: action, results = agent.act([ob], False) next_ob, reward, done, info = env.step(action[0]) act_count += 1 obs.append(ob) actions.append(action[0]) rewards.append(0.1 * reward) next_obs.append(next_ob) dones.append(done) logits.append(results["logits"][0]) value_preds.append(results["value_preds"][0]) if agent.ready_to_send: agent.send_experience( obs=obs, actions=actions, rewards=rewards, dones=dones, next_obs=next_obs, value_preds=value_preds, logits=logits) if agent.ready_to_receive: batch_data = agent.receive_experience() res = agent.learn(batch_data) ob = next_ob episode_reward += reward if act_count % 1024 == 0: print("timestep:", act_count, reward_window) reward_window.push(episode_reward) return reward_window.stats()["reward_mean"]
def main(): env = MaxComponentEnv(num_arms=6) agent_class = agents[AGENT_CONFIG["type"]] agent = agent_class(env.observation_space, env.action_space, AGENT_CONFIG, MODEL_CONFIG, distributed_spec={}, export_dir="hook_dump_dir") reward_window = WindowStat("reward", 50) length_window = WindowStat("length", 50) loss_window = WindowStat("loss", 50) obs, actions, rewards, next_obs, dones = list(), list(), list(), list( ), list() act_count = 0 for i in range(100): ob = env.reset() done = False episode_reward = .0 episode_len = 0 while not done: action, results = agent.act([ob], deterministic=False, use_perturbed_action=False) next_ob, reward, done, info = env.step(action[0]) act_count += 1 obs.append(ob) actions.append(action[0]) rewards.append(reward) next_obs.append(next_ob) dones.append(done) if agent.ready_to_send: agent.send_experience(obs=obs, actions=actions, rewards=rewards, next_obs=next_obs, dones=dones) if agent.ready_to_receive: batch_data = agent.receive_experience() res = agent.learn(batch_data) loss_window.push(res['loss']) if AGENT_CONFIG.get("prioritized_replay", False): agent.update_priorities(indexes=batch_data["indexes"], td_error=res["td_error"]) ob = next_ob episode_reward += reward episode_len += 1 if act_count % 5 == 0: print("timestep:", act_count, reward_window, length_window) agent.add_episode(1) reward_window.push(episode_reward) length_window.push(episode_len) agent.export_saved_model() print("Done.")
def main(): with open(FLAGS.config, 'r') as ips: config = json.load(ips) print(config) job_name = FLAGS.job_name env = make_atari("PongNoFrameskip-v4") env = wrap_deepmind( env=env, frame_stack=True, clip_rewards=False, episode_life=True, wrap_frame=True, frame_resize=42) agent_class = agents[config["agent"]["type"]] agent = agent_class( env.observation_space, env.action_space, config["agent"], config["model"], distributed_spec={ "ps_hosts": FLAGS.ps_hosts, "memory_hosts": FLAGS.memory_hosts, "actor_hosts": FLAGS.actor_hosts, "learner_hosts": FLAGS.learner_hosts, "job_name": FLAGS.job_name, "task_index": FLAGS.task_index }, custom_model=MyVTmodel, checkpoint_dir=None) all_cost = time.time() if job_name == "ps": print("ps starts===>") agent.join() elif job_name == "memory": start_tt = time.time() log_count = 0 print("memory starts===>") while not agent.should_stop(): agent.communicate() if time.time() - start_tt > log_count: log_count += 1 print(agent._receive_count, "actor2mem_q:", agent._actor2mem_q.qsize(), "mem2learner_2:", agent._mem2learner_q.qsize()) sys.stdout.flush() elif job_name == "actor": print("actor starts===>") start_tt = time.time() log_count = 0 act_log_count = 0 # create vectorized env def make_env(rank): def make_atari_env(): env = make_atari("PongNoFrameskip-v4") env = wrap_deepmind( env=env, frame_stack=True, clip_rewards=False, episode_life=True, wrap_frame=True, frame_resize=42) env.seed(rank) return env return make_atari_env num_env = config["agent"].get("num_env", 1) vec_env = VectorizedEnvironment( make_env=make_env, num_env=num_env, seed=100 * FLAGS.task_index) act_count = 0 reward_window = WindowStat("reward", 10) length_window = WindowStat("length", 10) obs, actions, rewards, dones, logits = list(), list(), list(), list( ), list() agent.sync_vars() while not agent.should_stop(): ob = vec_env.reset() episode_reward = np.zeros(num_env, ) episode_len = np.zeros(num_env, ) while not agent.should_stop(): action, results = agent.act(ob, False) act_count += 1 new_ob, reward, done, info = vec_env.step(action) obs.append(ob) actions.append(action) rewards.append(reward) dones.append(done) logits.append(results["logits"]) if agent.ready_to_send: agent.send_experience( obs=obs, actions=actions, rewards=rewards, dones=dones, logits=logits, vec_env=True, num_env=num_env) agent.sync_vars() ob = new_ob episode_reward += np.asarray(reward) episode_len += 1 for i in range(num_env): if done[i]: reward_window.push(episode_reward[i]) length_window.push(episode_len[i]) episode_reward[i] = .0 episode_len[i] = 0 total_cost = time.time() - start_tt if int(total_cost / 5) > log_count: log_count += 1 print("act_count:", act_count, "actor2mem_q:", agent._actor2mem_q.qsize(), "total:", total_cost) print('total_cost:', total_cost, reward_window) print(length_window) sys.stdout.flush() if int((act_count * num_env) / 10000) > act_log_count: act_log_count += 1 print('timestep:', act_log_count * 10000, reward_window) elif job_name == "learner": print("learner starts===>") start_tt = time.time() train_count = 0 try: while not agent.should_stop(): batch_data = agent.receive_experience() if batch_data: extra_data = agent.learn(batch_data) train_count += 1 print("learning {}".format(extra_data), "receive_q:", agent._receive_q.qsize()) print("train_count:", train_count, "total:", time.time() - start_tt) sys.stdout.flush() except tf.errors.OutOfRangeError as e: print("memory has stopped.") else: raise ValueError("Invalid job_name.") all_cost = time.time() - all_cost print("done. all_cost:", all_cost)
def main(): env = gym.make("CartPole-v0") env.seed(0) agent_class = agents[AGENT_CONFIG["type"]] agent = agent_class(env.observation_space, env.action_space, AGENT_CONFIG, MODEL_CONFIG, distributed_spec={}, checkpoint_dir="ckpt_dir", custom_model=MyESmodel) reward_window = WindowStat("reward", 50) length_window = WindowStat("length", 50) init_perturbation_scale = 0.1 seeds, rewards, perturbation_scales = list(), list(), list() is_positive_direction = list() # how many episodes needed for one trial episode_per_perturbation = 1 returns = list() for i in range(4000): ob = env.reset() done = False episode_reward = .0 episode_len = 0 if i % episode_per_perturbation == 0: # perturb parameters every `episode_per_seed` episodes is_positive = True if len( is_positive_direction ) == 0 else is_positive_direction[-1] != True # each seed twice seed = np.random.randint(1000000) if is_positive else seeds[-1] perturbation_scale = max( init_perturbation_scale * (1 - i / 2000.0), 0.02) feed = agent.model.perturbation_feed fetch = [agent.model.reset_perturbation_op] agent.executor.run(fetches=fetch, feed_dict={ feed['perturbation_seeds']: [seed], feed['perturbation_scales']: [perturbation_scale], feed['positive_perturbation']: is_positive }) if is_positive: seeds.append(seed) perturbation_scales.append(perturbation_scale) is_positive_direction.append(is_positive) while not done: action, result = agent.act([ob], True, use_perturbed_action=True) next_ob, reward, done, info = env.step(action[0]) ob = next_ob episode_reward += reward episode_len += 1 rewards.append(episode_reward) reward_window.push(episode_reward) length_window.push(episode_len) if len(rewards) == episode_per_perturbation: returns.append(np.mean(rewards)) rewards = [] if len(returns) == 2 * agent.config.get('sample_batch_size', 100): print(reward_window) assert len(seeds) == (len(returns) / 2) assert len(perturbation_scales) == (len(returns) / 2) agent.learn( batch_data=dict(perturbation_seeds=seeds, perturbation_scales=perturbation_scales, returns=np.reshape(returns, [-1, 2]))) seeds = [] perturbation_scales = [] returns = [] is_positive_direction = [] # evaluation 20 episodes test_rewards = list() for j in range(20): done = False ob = env.reset() episode_reward = 0 episode_len = 0 while not done: action, result = agent.act([ob], True, use_perturbed_action=False) next_ob, reward, done, info = env.step(action[0]) ob = next_ob episode_reward += reward episode_len += 1 test_rewards.append(episode_reward) print("[evaluation] average reward of 20 episodes:", np.mean(test_rewards)) print('train at ', i) agent.export_saved_model(export_dir="dump_dir") print("Done.")
def main(_): with open(FLAGS.config, 'r') as ips: config = json.load(ips) print(config) env = gym.make("CartPole-v0") env.seed(0) agent_class = agents[config["agent"]["type"]] agent = agent_class(env.observation_space, env.action_space, config["agent"], config["model"], distributed_spec={ "ps_hosts": FLAGS.ps_hosts, "memory_hosts": FLAGS.memory_hosts, "actor_hosts": FLAGS.actor_hosts, "learner_hosts": FLAGS.learner_hosts, "job_name": FLAGS.job_name, "task_index": FLAGS.task_index }, custom_model=MyDQN) if FLAGS.job_name == "ps": print("ps starts===>") agent.join() elif FLAGS.job_name == "memory": print("memory starts===>") while not agent.should_stop(): agent.communicate() sys.stdout.flush() elif FLAGS.job_name == "actor": print("actor starts===>") reward_window = WindowStat("reward", 50) length_window = WindowStat("length", 50) obs, actions, rewards, new_obs, dones = list(), list(), list(), list( ), list() agent.sync_vars() while not agent.should_stop(): ob = env.reset() done = False episode_reward = .0 episode_len = 0 while not done and not agent.should_stop(): action, results = agent.act([ob], False) new_ob, reward, done, info = env.step(action[0]) obs.append(ob) actions.append(action[0]) rewards.append(reward) new_obs.append(new_ob) dones.append(done) if agent.ready_to_send: agent.send_experience(obs=obs, actions=actions, rewards=rewards, next_obs=new_obs, dones=dones) agent.sync_vars() ob = new_ob episode_reward += reward episode_len += 1 reward_window.push(episode_reward) length_window.push(episode_len) print(reward_window) print(length_window) sys.stdout.flush() elif FLAGS.job_name == "learner": print("learner starts===>") while not agent.should_stop(): batch_data = agent.receive_experience() if batch_data: extra_data = agent.learn(batch_data) print("learning {}".format(extra_data)) sys.stdout.flush() else: raise ValueError("Invalid job_name.") print("done.")
def main(): env = make_atari("PongNoFrameskip-v4") env = wrap_deepmind(env=env, frame_stack=True, clip_rewards=False, episode_life=True, wrap_frame=True, frame_resize=42) agent_class = agents[AGENT_CONFIG["type"]] agent = agent_class(env.observation_space, env.action_space, AGENT_CONFIG, MODEL_CONFIG, distributed_spec={}, custom_model=MyDQNModel) reward_window = WindowStat("reward", 10) length_window = WindowStat("length", 10) loss_window = WindowStat("loss", 10) obs, actions, rewards, next_obs, dones = list(), list(), list(), list( ), list() for i in range(2000): ob = env.reset() ob = np.asarray(ob) done = False episode_reward = .0 episode_len = 0 while not done: action, results = agent.act([ob], deterministic=False, use_perturbed_action=False) next_ob, reward, done, info = env.step(action[0]) next_ob = np.asarray(next_ob) obs.append(ob) actions.append(action[0]) rewards.append(reward) next_obs.append(next_ob) dones.append(done) if agent.ready_to_send: agent.send_experience(obs=obs, actions=actions, rewards=rewards, next_obs=next_obs, dones=dones) if agent.ready_to_receive: batch_data = agent.receive_experience() res = agent.learn(batch_data) loss_window.push(res['loss']) if AGENT_CONFIG.get("prioritized_replay", False): agent.update_priorities(indexes=batch_data["indexes"], td_error=res["td_error"]) ob = next_ob episode_reward += reward episode_len += 1 agent.add_episode(1) reward_window.push(episode_reward) length_window.push(episode_len) if i % 10 == 0: print('episode at', i) print(reward_window) print(length_window) print(loss_window) print("Done.")
def main(): with open(FLAGS.config, 'r') as ips: config = json.load(ips) print(config) env = gym.make("CartPole-v0") env.seed(0) agent_class = agents[config["agent"]["type"]] agent = agent_class(env.observation_space, env.action_space, agent_config=config["agent"], model_config=config["model"], distributed_spec={ "ps_hosts": FLAGS.ps_hosts, "worker_hosts": FLAGS.worker_hosts, "job_name": FLAGS.job_name, "task_index": FLAGS.task_index }, custom_model=MyPGModel, checkpoint_dir="") all_cost = time.time() if FLAGS.job_name == "ps": print("ps starts===>") agent.join() elif FLAGS.job_name == "worker": print("actor starts===>") act_count = 0 train_count = 0 reward_window = WindowStat("reward", 50) length_window = WindowStat("length", 50) obs, actions, rewards, dones, value_preds = list(), list(), list( ), list(), list() while not agent.should_stop(): ob = env.reset() done = False episode_reward = 0.0 episode_len = 0.0 while not done and not agent.should_stop(): action, results = agent.act([ob], False) act_count += 1 new_ob, reward, done, info = env.step(action[0]) obs.append(ob) actions.append(action[0]) rewards.append(0.1 * reward) dones.append(done) value_preds.append(results["value_preds"][0]) if agent.ready_to_send: agent.send_experience(obs=obs, actions=actions, rewards=rewards, dones=dones, value_preds=value_preds) batch_data = agent.receive_experience() if batch_data: extra_data = agent.learn(batch_data) train_count += 1 print("learning {}".format(extra_data)) ob = new_ob episode_reward += reward episode_len += 1 print("act_count:", act_count) reward_window.push(episode_reward) length_window.push(episode_len) print(reward_window) print(length_window) sys.stdout.flush() if FLAGS.task_index == 0: agent.export_saved_model(export_dir="a3c_export_dir") print("export savedmodel finish") else: raise ValueError("Invalid job_name.") all_cost = time.time() - all_cost print("done. all_cost:", all_cost)
def doTestCkpt(self): trial_timestamp = time.strftime("%Y%m%d-%H%M%S") np.random.seed(0) env = gym.make("CartPole-v0") env.seed(0) dqn_g = tf.Graph() with dqn_g.as_default(): tf.set_random_seed(123) agent = agents[DQN_AGENT_CONFIG["type"]]( env.observation_space, env.action_space, DQN_AGENT_CONFIG, DQN_MODEL_CONFIG, checkpoint_dir="ckpt_dir_{}".format(trial_timestamp), distributed_spec={}) reward_window = WindowStat("reward", 50) obs, actions, rewards, next_obs, dones = list(), list(), list(), list( ), list() act_count = 0 for i in range(500): ob = env.reset() done = False episode_reward = .0 while not done: action, results = agent.act([ob], deterministic=False, use_perturbed_action=False) next_ob, reward, done, info = env.step(action[0]) act_count += 1 obs.append(ob) actions.append(action[0]) rewards.append(reward) next_obs.append(next_ob) dones.append(done) if agent.ready_to_send: agent.send_experience(obs=obs, actions=actions, rewards=rewards, next_obs=next_obs, dones=dones) if agent.ready_to_receive: batch_data = agent.receive_experience() res = agent.learn(batch_data) if DQN_AGENT_CONFIG.get("prioritized_replay", False): agent.update_priorities(indexes=batch_data["indexes"], td_error=res["td_error"]) ob = next_ob episode_reward += reward if act_count % 1024 == 0: print("timestep:", act_count, reward_window) agent.add_episode(1) reward_window.push(episode_reward) prev_perf = reward_window.stats()["reward_mean"] print("Performance before saving is {}".format(prev_perf)) new_dqn_g = tf.Graph() with new_dqn_g.as_default(): agent = agents[DQN_AGENT_CONFIG["type"]]( env.observation_space, env.action_space, DQN_AGENT_CONFIG, DQN_MODEL_CONFIG, checkpoint_dir="ckpt_dir_{}".format(trial_timestamp), distributed_spec={}) reward_window = WindowStat("reward", 10) ob = env.reset() for i in range(10): ob = env.reset() done = False episode_reward = .0 while not done: action, results = agent.act([ob], deterministic=True, use_perturbed_action=False) next_ob, reward, done, info = env.step(action[0]) act_count += 1 ob = next_ob episode_reward += reward agent.add_episode(1) reward_window.push(episode_reward) cur_perf = reward_window.stats()["reward_mean"] print("Performance after restore is {}".format(cur_perf)) return prev_perf - cur_perf
def main():
    # create offline envs backed by the logged data
    env = offline_env(
        FLAGS.train_data_file,
        batch_size=128,
        n_step=MODEL_CONFIG.get("n_step", 1))
    eval_env = offline_env(
        FLAGS.eval_data_file,
        batch_size=128,
        n_step=MODEL_CONFIG.get("n_step", 1))

    agent_class = agents[AGENT_CONFIG["type"]]
    agent = agent_class(
        env.observation_space,
        env.action_space,
        AGENT_CONFIG,
        MODEL_CONFIG,
        distributed_spec={},
        export_dir="bcq_tmp",
        checkpoint_dir="bcq_tmp",
        custom_model=MyBCQModel)

    clone_loss_window = WindowStat("clone_loss", 50)
    clone_reg_loss_window = WindowStat("clone_reg_loss", 50)
    loss_window = WindowStat("loss", 50)
    total_cost = time.time()
    clone_learn_count = 0

    # first, train a generative model by behavior cloning
    for i in range(1000):
        table_data = env.reset()
        # store raw data in the replay buffer
        agent.send_experience(
            obs=table_data["obs"],
            actions=table_data["actions"],
            rewards=table_data["rewards"],
            dones=table_data["dones"],
            next_obs=table_data["next_obs"])
        # sample from the replay buffer
        # the size of sampled data is equal to `AGENT_CONFIG["batch_size"]`
        batch_data = agent.receive_experience()
        clone_loss, clone_reg_loss = agent.behavior_learn(
            batch_data=batch_data)
        clone_learn_count += 1
        clone_loss_window.push(clone_loss)
        clone_reg_loss_window.push(clone_reg_loss)
        if i % 50 == 0:
            print(clone_loss_window)
            print(clone_reg_loss_window)

    # evaluate the generative model
    all_clone_act, gd_act = [], []
    for i in range(100):
        table_data = env.reset()
        clone_act = agent.behavior_act(table_data["obs"])
        all_clone_act.extend(np.argsort(-1.0 * clone_act, axis=1).tolist())
        gd_act.extend(table_data["actions"])
    acc1 = np.sum(
        np.array(all_clone_act)[:, 0] == np.array(gd_act)) * 1.0 / len(gd_act)
    print("acc @top1:", acc1)

    # second, train BCQ
    agent.reset_global_step()
    epochs_to_end = 10
    max_global_steps_to_end = 10000
    learn_count = 0
    env.num_epoch = 0
    while (env.num_epoch < epochs_to_end
           and learn_count < max_global_steps_to_end):
        table_data = env.reset()
        # store raw data in the replay buffer
        agent.send_experience(
            obs=table_data["obs"],
            actions=table_data["actions"],
            rewards=table_data["rewards"],
            dones=table_data["dones"],
            next_obs=table_data["next_obs"])
        # sample from the replay buffer
        # the size of sampled data is equal to `AGENT_CONFIG["batch_size"]`
        batch_data = agent.receive_experience()
        # update the model
        res = agent.learn(batch_data)
        # record the loss
        loss_window.push(res["loss"])
        learn_count += 1
        if AGENT_CONFIG.get("prioritized_replay", False):
            # update priorities
            agent.update_priorities(
                indexes=batch_data["indexes"], td_error=res["td_error"])

        if learn_count % 50 == 0:
            print("learn_count:", learn_count)
            print(loss_window)

            # offline evaluation
            batch_weights = []
            batch_rewards = []
            eval_num = 50
            for _ in range(eval_num):
                batch_data = eval_env.reset()
                importance_ratio = agent.importance_ratio(batch_data)
                batch_weights.append(importance_ratio)
                batch_rewards.append(batch_data["rewards"])
            ips, ips_sw, wips, wips_sw, wips_sw_mean = ips_eval(
                batch_weights=batch_weights,
                batch_rewards=batch_rewards,
                gamma=MODEL_CONFIG.get("gamma", 0.95))
            agent.add_extra_summary({
                agent.model.ips_score_op: ips,
                agent.model.ips_score_stepwise_op: ips_sw,
                agent.model.wnorm_ips_score_op: wips,
                agent.model.wnorm_ips_score_stepwise_op: wips_sw,
                agent.model.wnorm_ips_score_stepwise_mean_op: wips_sw_mean
            })
            print(
                "[IPS Policy Evaluation @learn_count={}] ips={}, "
                "ips_stepwise={}, wnorm_ips={}, wnorm_ips_stepwise={}, "
                "wnorm_ips_stepwise_mean={}".format(
                    learn_count, ips, ips_sw, wips, wips_sw, wips_sw_mean))

        if learn_count % 2000 == 0:
            # export the saved model at any time
            # (an AssertionError will occur if the export_dir already exists)
            agent.export_saved_model("bcq_export_dir{}".format(learn_count))

        if learn_count % 200 == 0:
            # test with simulator
            gym_env = gym.make("CartPole-v0")
            for ix in range(10):
                ob = gym_env.reset()
                done = False
                episode_reward = .0
                while not done:
                    action, results = agent.act(
                        [ob], deterministic=False, use_perturbed_action=False)
                    next_ob, reward, done, info = gym_env.step(action[0])
                    episode_reward += reward
                    ob = next_ob
                print("train@", learn_count, "test@", ix, "reward:",
                      episode_reward)

    print("Done.", "num_epoch:", env.num_epoch, "learn_count:", learn_count,
          "total_cost:", time.time() - total_cost)
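# `ips_eval` above is provided elsewhere in the code base and is not shown
# in this file. As a rough illustration of the kind of quantities it reports,
# the helper below computes generic trajectory-level IPS and weighted
# (self-normalized) IPS estimates from per-step importance ratios and
# rewards. It is an assumption about the general technique, not the
# library's implementation.
import numpy as np


def ips_estimates_sketch(batch_weights, batch_rewards, gamma=0.95):
    """Generic IPS / weighted-IPS off-policy value estimates (illustrative)."""
    returns, traj_weights = [], []
    for w, r in zip(batch_weights, batch_rewards):
        w = np.asarray(w, dtype=np.float64)
        r = np.asarray(r, dtype=np.float64)
        discounts = gamma**np.arange(len(r))
        returns.append(np.sum(discounts * r))   # discounted return
        traj_weights.append(np.prod(w))         # product of per-step ratios
    returns = np.asarray(returns)
    traj_weights = np.asarray(traj_weights)
    ips = np.mean(traj_weights * returns)
    wips = np.sum(traj_weights * returns) / (np.sum(traj_weights) + 1e-8)
    return ips, wips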
def main(): env = gym.make("CartPole-v0") agent_class = agents[AGENT_CONFIG["type"]] agent = agent_class(env.observation_space, env.action_space, AGENT_CONFIG, MODEL_CONFIG, distributed_spec={}, checkpoint_dir="ckpt_dir", export_dir="dump_dir") reward_window = WindowStat("reward", 50) length_window = WindowStat("length", 50) loss_window = WindowStat("loss", 50) obs, actions, rewards, dones, next_obs, logits = list(), list(), list( ), list(), list(), list() act_count = 0 for i in range(500): ob = env.reset() done = False episode_reward = .0 episode_len = 0 while not done: action, results = agent.act([ob], False) next_ob, reward, done, info = env.step(action[0]) act_count += 1 obs.append(ob) actions.append(action[0]) rewards.append(reward) dones.append(done) next_obs.append(next_ob) logits.append(results["logits"][0]) if agent.ready_to_send: agent.send_experience(obs=obs, actions=actions, rewards=rewards, next_obs=next_obs, dones=dones, logits=logits) if agent.ready_to_receive: batch_data = agent.receive_experience() res = agent.learn(batch_data) loss_window.push(res["loss"]) ob = next_ob episode_reward += reward episode_len += 1 if act_count % 1000 == 0: print("timestep:", act_count, reward_window, length_window) reward_window.push(episode_reward) length_window.push(episode_len) agent.export_saved_model() print("Done.")
def main():
    with open(FLAGS.config, 'r') as ips:
        config = json.load(ips)
    print(config)

    env = gym.make("CartPole-v0")
    env.seed(0)

    agent_class = agents[config["agent"]["type"]]
    agent = agent_class(
        env.observation_space,
        env.action_space,
        config["agent"],
        config["model"],
        distributed_spec={
            "ps_hosts": FLAGS.ps_hosts,
            "memory_hosts": FLAGS.memory_hosts,
            "actor_hosts": FLAGS.actor_hosts,
            "learner_hosts": FLAGS.learner_hosts,
            "job_name": FLAGS.job_name,
            "task_index": FLAGS.task_index
        },
        custom_model=MyESmodel,
        checkpoint_dir=None)

    all_cost = time.time()
    if FLAGS.job_name == "ps":
        print("ps starts===>")
        agent.join()
    elif FLAGS.job_name == "memory":
        print("memory starts===>")
        while not agent.should_stop():
            agent.communicate()
            print("communicating")
            time.sleep(0.1)
    elif FLAGS.job_name == "actor":
        print("actor starts===>")
        reward_window = WindowStat("reward", 50)
        length_window = WindowStat("length", 50)
        perturbation_scale = 0.1
        run_episode_per_perturbation = config["agent"].get(
            "run_episode_per_perturbation", 1)
        seeds, rewards, perturbation_scales = list(), list(), list()
        is_positive = False
        returns = list()

        agent.sync_vars()
        episode_count = 0
        try:
            while not agent.should_stop():
                # do perturbation
                is_positive = False if is_positive else True
                # each seed will be used twice
                seed = np.random.randint(
                    1000000) if is_positive else seeds[-1]
                perturbation_scale = max(
                    perturbation_scale * (1 - episode_count / 2000.0), 0.02)
                feed = agent.behavior_model.perturbation_feed
                fetch = [agent.behavior_model.reset_perturbation_op]
                agent.executor.run(
                    fetches=fetch,
                    feed_dict={
                        feed['perturbation_seeds']: [seed],
                        feed['perturbation_scales']: [perturbation_scale],
                        feed['positive_perturbation']: is_positive
                    })
                if is_positive:
                    seeds.append(seed)
                    perturbation_scales.append(perturbation_scale)

                rewards, episode_lens = rollout(
                    agent,
                    env,
                    episode_num=run_episode_per_perturbation,
                    use_perturbed_action=True)
                episode_count += run_episode_per_perturbation

                # calculate the average reward from a specific perturbation
                # with one direction
                if len(returns) == 0:
                    returns.append([np.mean(rewards)])
                elif len(returns[-1]) < 2:
                    returns[-1].append(np.mean(rewards))
                else:
                    returns.append([np.mean(rewards)])

                if len(returns) == agent.config.get(
                        'sample_batch_size', 100) and len(returns[-1]) == 2:
                    # send out the results for the latest
                    # `sample_batch_size` * 2 trials
                    print(reward_window)
                    assert len(seeds) == len(returns)
                    assert len(perturbation_scales) == len(returns)
                    agent.send_experience(
                        **dict(
                            perturbation_seeds=seeds,
                            perturbation_scales=perturbation_scales,
                            returns=returns))
                    # reset the direction
                    is_positive = False
                    # synchronize the weights from parameter server
                    # to local behavior_model
                    agent.sync_vars()

                    # do evaluation for 20 episode
                    evaluation_num = 20
                    evl_returns, _ = rollout(
                        agent,
                        env,
                        episode_num=evaluation_num,
                        use_perturbed_action=False)
                    print(
                        "evaluation at episode:", episode_count,
                        ",avg episode reward of {} evaluation:".format(
                            evaluation_num), np.mean(evl_returns))

                reward_window.push(rewards)
                length_window.push(episode_lens)
                if episode_count % 50 == 0:
                    print(reward_window)
                    print(length_window)
                    sys.stdout.flush()
        except tf.errors.OutOfRangeError as e:
            print("memory has stopped.")
    elif FLAGS.job_name == "learner":
        print("learner starts===>")
        train_count = 0
        try:
            while not agent.should_stop():
                batch_data = agent.receive_experience()
                if batch_data:
                    extra_data = agent.learn(batch_data)
                    train_count += 1
                    print("learning {}".format(extra_data))
                sys.stdout.flush()
        except tf.errors.OutOfRangeError as e:
            print("memory has stopped.")
    else:
        raise ValueError("Invalid job_name.")
    all_cost = time.time() - all_cost
    print("done. all_cost:", all_cost)
class PrioritizedReplayBuffer(ReplayBuffer):
    def __init__(self, size, alpha):
        """Create Prioritized Replay buffer.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        alpha: float
            how much prioritization is used
            (0 - no prioritization, 1 - full prioritization)

        See Also
        --------
        ReplayBuffer.__init__
        """
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha > 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
        self._prio_change_stats = WindowStat("reprio", 1000)
        self._debug_cost = 0

    def add(self, obs, actions, rewards, dones, next_obs, weights, **kwargs):
        """See ReplayBuffer.add"""
        super(PrioritizedReplayBuffer, self).add(
            obs=obs,
            actions=actions,
            rewards=rewards,
            dones=dones,
            next_obs=next_obs,
            **{})
        if weights is None:
            weights = self._max_priority
            constant_weight = weights**self._alpha
            for idx in self._cover_indices:
                self._it_sum[idx] = constant_weight
                self._it_min[idx] = constant_weight
        else:
            weights = np.power(weights, self._alpha)
            for n, idx in enumerate(self._cover_indices):
                self._it_sum[idx] = weights[n]
                self._it_min[idx] = weights[n]

    def _sample_proportional(self, batch_size):
        res = []
        sum_value = self._it_sum.sum(0, len(self))
        mass = np.random.random(size=batch_size) * sum_value
        for i in range(batch_size):
            # TODO(szymon): should we ensure no repeats?
            idx = self._it_sum.find_prefixsum_idx(mass[i])
            res.append(idx)
        return res

    def sample(self, batch_size, beta):
        """Sample a batch of experiences.

        Compared to ReplayBuffer.sample, it also returns importance weights
        and the indexes of the sampled experiences.

        Parameters
        ----------
        batch_size: int
            How many transitions to sample.
        beta: float
            To what degree to use importance weights
            (0 - no corrections, 1 - full correction)

        Returns
        -------
        obs_batch: np.array
            batch of observations
        act_batch: np.array
            batch of actions executed given obs_batch
        rew_batch: np.array
            rewards received as results of executing act_batch
        next_obs_batch: np.array
            next set of observations seen after executing act_batch
        done_mask: np.array
            done_mask[i] = 1 if executing act_batch[i] resulted in the end
            of an episode and 0 otherwise.
        weights: np.array
            Array of shape (batch_size,) and dtype np.float32 denoting the
            importance weight of each sampled transition
        idxes: np.array
            Array of shape (batch_size,) and dtype np.int32 giving the
            indexes in the buffer of the sampled experiences
        """
        assert beta > 0
        self._num_sampled += batch_size

        start = time.time()
        idxes = self._sample_proportional(batch_size)
        self._debug_cost += time.time() - start

        sum_value = self._it_sum.sum()
        weights = []
        p_min = self._it_min.min() / sum_value
        max_weight = (p_min * len(self))**(-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / sum_value
            weight = (p_sample * len(self))**(-beta)
            weights.append(weight / max_weight)
        weights = np.asarray(weights)
        encoded_sample = self._encode_sample(idxes)
        encoded_sample["weights"] = weights
        encoded_sample["indexes"] = idxes
        return encoded_sample

    def update_priorities(self, indexes, priorities):
        """Update priorities of sampled transitions.

        Sets the priority of the transition at index indexes[i] in the
        buffer to priorities[i].

        Parameters
        ----------
        indexes: [int]
            List of indexes of sampled transitions
        priorities: [float]
            List of updated priorities corresponding to the transitions at
            the sampled indexes denoted by variable `indexes`.
        """
        assert len(indexes) == len(priorities)
        pvs = np.power(priorities, self._alpha).astype(np.float64)
        for idx, priority, pv in zip(indexes, priorities, pvs):
            assert priority > 0
            assert 0 <= idx < len(self)
            delta = pv - self._it_sum[idx]
            self._prio_change_stats.push(delta)
            self._it_sum[idx] = pv
            self._it_min[idx] = pv

        self._max_priority = max(self._max_priority, np.max(priorities))

    def stats(self, debug=False):
        parent = ReplayBuffer.stats(self, debug)
        if debug:
            parent.update(self._prio_change_stats.stats())
        return parent
class ReplayBuffer(object):
    """Basic replay buffer.

    Support O(1) `add` and O(1) `sample` operations (w.r.t. each transition).
    The buffer is implemented as a fixed-length list where the index of
    insertion is reset to zero, once the list length is reached.
    """

    def __init__(self, size):
        """Create the replay buffer.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        """
        self._maxsize = size
        self._next_idx = 0
        self._hit_count = np.zeros(size)
        self._eviction_started = False
        self._num_added = 0
        self._num_sampled = 0
        self._evicted_hit_stats = WindowStat("evicted_hit", 1000)
        self._est_size_bytes = 0
        self._extra_fields = None
        self._first_add = True

    def __len__(self):
        return min(self._num_added, self._maxsize)

    def add(self,
            obs,
            actions,
            rewards,
            dones,
            next_obs=None,
            weights=None,
            **kwargs):
        batch_size = np.shape(rewards)[0]
        assert batch_size < self._maxsize, \
            "size of data added in buffer is too big at once"
        truncated_size = min(batch_size, self._maxsize - self._next_idx)
        extra_size = max(0, batch_size - (self._maxsize - self._next_idx))

        if self._extra_fields is None:
            self._extra_fields = list(kwargs.keys())
        if self._first_add:
            self._obs = np.zeros(
                shape=((self._maxsize, ) + np.shape(obs)[1:]),
                dtype=obs.dtype)
            self._actions = np.zeros(
                shape=((self._maxsize, ) + np.shape(actions)[1:]),
                dtype=actions.dtype)
            self._rewards = np.zeros(
                shape=(self._maxsize, ), dtype=np.float32)
            if next_obs is not None:
                self._next_obs = np.zeros(
                    shape=((self._maxsize, ) + np.shape(next_obs)[1:]),
                    dtype=next_obs.dtype)
            if weights is not None:
                self._weights = np.zeros(
                    shape=((self._maxsize, )), dtype=np.float32)
            self._dones = np.zeros(shape=(self._maxsize, ), dtype=np.float32)
            self._extras = {
                name: np.zeros(
                    shape=((self._maxsize, ) + np.shape(kwargs[name])[1:]),
                    dtype=kwargs[name].dtype)
                for name in self._extra_fields
            }
            self._first_add = False

        self._num_added += batch_size
        #if self._num_added <= self._maxsize:
        #self._est_size_bytes += sum(sys.getsizeof(d) for d in data)

        self._obs[self._next_idx:self._next_idx +
                  truncated_size] = obs[:truncated_size]
        self._actions[self._next_idx:self._next_idx +
                      truncated_size] = actions[:truncated_size]
        self._rewards[self._next_idx:self._next_idx +
                      truncated_size] = rewards[:truncated_size]
        self._dones[self._next_idx:self._next_idx +
                    truncated_size] = dones[:truncated_size]
        if next_obs is not None:
            self._next_obs[self._next_idx:self._next_idx +
                           truncated_size] = next_obs[:truncated_size]
        if weights is not None:
            self._weights[self._next_idx:self._next_idx +
                          truncated_size] = weights[:truncated_size]
        for name in self._extras.keys():
            self._extras[name][self._next_idx:self._next_idx +
                               truncated_size] = kwargs[name][:truncated_size]

        if extra_size > 0:
            self._obs[:extra_size] = obs[truncated_size:]
            self._actions[:extra_size] = actions[truncated_size:]
            self._rewards[:extra_size] = rewards[truncated_size:]
            self._dones[:extra_size] = dones[truncated_size:]
            if next_obs is not None:
                self._next_obs[:extra_size] = next_obs[truncated_size:]
            if weights is not None:
                self._weights[:extra_size] = weights[truncated_size:]
            for name in self._extras.keys():
                self._extras[name][:extra_size] = kwargs[name][truncated_size:]

        if self._next_idx + batch_size >= self._maxsize:
            self._eviction_started = True
        self._cover_indices = [
            self._next_idx + i for i in range(truncated_size)
        ]
        if extra_size > 0:
            self._cover_indices += [i for i in range(extra_size)]
        self._next_idx = (self._next_idx + batch_size) % self._maxsize
        if self._eviction_started:
            for i in self._cover_indices:
                self._evicted_hit_stats.push(self._hit_count[i])
                self._hit_count[i] = 0

    def _encode_sample(self, idxes):
        idxes = np.asarray(idxes)
        obs = np.take(self._obs, indices=idxes, axis=0)
        actions = np.take(self._actions, indices=idxes, axis=0)
        rewards = np.take(self._rewards, indices=idxes, axis=0)
        next_obs = np.take(self._next_obs, indices=idxes, axis=0)
        dones = np.take(self._dones, indices=idxes, axis=0)
        batch_data = dict(
            obs=obs,
            actions=actions,
            rewards=rewards,
            dones=dones,
            next_obs=next_obs)
        return batch_data

    def sample(self, batch_size):
        """Sample a batch of experiences.

        Parameters
        ----------
        batch_size: int
            How many transitions to sample.

        Returns
        -------
        obs_batch: np.array
            batch of observations
        act_batch: np.array
            batch of actions executed given obs_batch
        rew_batch: np.array
            rewards received as results of executing act_batch
        next_obs_batch: np.array
            next set of observations seen after executing act_batch
        done_mask: np.array
            done_mask[i] = 1 if executing act_batch[i] resulted in the end
            of an episode and 0 otherwise.
        """
        idxes = np.random.randint(
            0, min(self._num_added, self._maxsize) - 1, size=(batch_size, ))
        self._num_sampled += batch_size
        return self._encode_sample(idxes)

    def stats(self, debug=False):
        data = {
            "added_count": self._num_added,
            "sampled_count": self._num_sampled,
            "est_size_bytes": self._est_size_bytes,
            "num_entries": len(self),
        }
        if debug:
            data.update(self._evicted_hit_stats.stats())
        return data
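# A small sketch of the circular-buffer behavior described in the class
# docstring; sizes are illustrative only.
def _replay_buffer_demo():
    buf = ReplayBuffer(size=100)

    # adding 64 transitions twice overflows the 100-slot buffer: the second
    # call fills the remaining 36 slots, wraps around, and overwrites the
    # 28 oldest entries
    for _ in range(2):
        buf.add(
            obs=np.random.rand(64, 4).astype(np.float32),
            actions=np.random.randint(0, 2, size=(64, )),
            rewards=np.random.rand(64).astype(np.float32),
            dones=np.zeros(64, dtype=np.float32),
            next_obs=np.random.rand(64, 4).astype(np.float32))

    print(len(buf))            # 100: capped at the buffer size
    print(buf.stats())         # added/sampled counters
    batch = buf.sample(batch_size=32)
    print(batch["obs"].shape)  # (32, 4)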
def main(): gym_env = gym.make("CartPole-v0") atari_env = make_atari("PongNoFrameskip-v4") atari_env = wrap_deepmind(env=atari_env, frame_stack=True, clip_rewards=False, episode_life=True, wrap_frame=True, frame_resize=42) # replace the following env according to your saved_model # env = atari_env env = gym_env with tf.Session() as sess: path = 'dump_dir' MetaGraphDef = tf.saved_model.loader.load( sess, tags=[sm.tag_constants.SERVING], export_dir=path) # get SignatureDef protobuf SignatureDef_d = MetaGraphDef.signature_def SignatureDef = SignatureDef_d["predict_results"] # get inputs/outputs TensorInfo protobuf ph_inputs = {} for name, ts_info in SignatureDef.inputs.items(): ph_inputs[name] = sm.utils.get_tensor_from_tensor_info( ts_info, sess.graph) outputs = {} for name, ts_info in SignatureDef.outputs.items(): outputs[name] = sm.utils.get_tensor_from_tensor_info( ts_info, sess.graph) for name, ph in ph_inputs.items(): print(name, ph) for name, ts in outputs.items(): print(name, ts) len_window = WindowStat("length", 50) reward_window = WindowStat("reward", 50) for i in range(100): ob = env.reset() env.render() time.sleep(0.2) done = False episode_len = 0 episode_reward = .0 while not done: action = sess.run(outputs["output_actions"], feed_dict={ ph_inputs["obs_ph"]: [np.asarray(ob)], ph_inputs["deterministic_ph"]: True }) next_ob, reward, done, info = env.step(action[0]) env.render() time.sleep(0.1) episode_reward += reward episode_len += 1 ob = next_ob len_window.push(episode_len) reward_window.push(episode_reward) print(reward_window) print(len_window)
def main(): env = gym.make("Pendulum-v0") env.seed(0) agent_class = agents[AGENT_CONFIG["type"]] agent = agent_class(env.observation_space, env.action_space, AGENT_CONFIG, MODEL_CONFIG, distributed_spec={}, checkpoint_dir="ckpt_dir", export_dir="dump_dir", custom_model=MyDDPG) reward_window = WindowStat("reward", 50) length_window = WindowStat("length", 50) loss_window = WindowStat("loss", 50) actor_loss = WindowStat("actor_loss", 50) obs, actions, rewards, next_obs, dones = list(), list(), list(), list( ), list() act_count = 0 train_count = 0 total_cost = time.time() for i in range(500): ob = env.reset() done = False episode_reward = .0 episode_len = 0 while not done: action, results = agent.act([ob], False, use_perturbed_action=False) act_count += 1 next_ob, reward, done, info = env.step(action[0]) obs.append(ob) actions.append(action[0]) rewards.append(0.1 * reward) next_obs.append(next_ob) dones.append(done) if agent.ready_to_send: agent.send_experience(obs=obs, actions=actions, rewards=rewards, dones=dones, next_obs=next_obs) if agent.ready_to_receive: batch_data = agent.receive_experience() res = agent.learn(batch_data) loss_window.push(res["critic_loss"]) actor_loss.push(res["actor_loss"]) train_count += 1 if AGENT_CONFIG.get("prioritized_replay", False): agent.update_priorities(indexes=batch_data["indexes"], td_error=res["td_error"]) ob = next_ob episode_reward += reward episode_len += 1 agent.add_episode(1) reward_window.push(episode_reward) length_window.push(episode_len) if act_count % 200 == 0: print("timestep:", act_count, reward_window, loss_window, actor_loss) agent.export_saved_model() print("Done.", "act_count:", act_count, "train_count:", train_count, "total_cost:", time.time() - total_cost)
def doTestES(self): np.random.seed(0) env = gym.make("CartPole-v0") env.seed(0) es_g = tf.Graph() with es_g.as_default(): tf.set_random_seed(123) agent = agents[ES_AGENT_CONFIG["type"]]( env.observation_space, env.action_space, ES_AGENT_CONFIG, ES_MODEL_CONFIG, distributed_spec={}, custom_model=MyESmodel) reward_window = WindowStat("reward", 25) perturbation_scale = 0.1 seeds, rewards, perturbation_scales = list(), list(), list() is_positive_direction = list() episode_per_perturbation = 1 returns = list() for i in range(5000): ob = env.reset() done = False episode_reward = .0 if i % episode_per_perturbation == 0: # perturb parameters every `episode_per_seed` episodes is_positive = True if len( is_positive_direction ) == 0 else is_positive_direction[-1] != True # each seed twice seed = np.random.randint(1000000) if is_positive else seeds[-1] perturbation_scale = max(perturbation_scale * (1 - i / 2000.0), 0.02) feed = agent.model.perturbation_feed fetch = [agent.model.reset_perturbation_op] agent.executor.run( fetches=fetch, feed_dict={ feed['perturbation_seeds']: [seed], feed['perturbation_scales']: [perturbation_scale], feed['positive_perturbation']: is_positive }) if is_positive: seeds.append(seed) perturbation_scales.append(perturbation_scale) is_positive_direction.append(is_positive) while not done: action, result = agent.act( [ob], True, use_perturbed_action=True) next_ob, reward, done, info = env.step(action[0]) ob = next_ob episode_reward += reward rewards.append(episode_reward) reward_window.push(episode_reward) if len(rewards) == episode_per_perturbation: returns.append(np.mean(rewards)) rewards = [] if len(returns) == 2 * agent.config.get( 'sample_batch_size', 100): print(reward_window) assert len(seeds) == (len(returns) / 2) assert len(perturbation_scales) == (len(returns) / 2) agent.learn( batch_data=dict( perturbation_seeds=seeds, perturbation_scales=perturbation_scales, returns=np.reshape(returns, [-1, 2]))) seeds = [] perturbation_scales = [] returns = [] is_positive_direction = [] # evaluation 20 episodes test_rewards = list() for j in range(10): done = False ob = env.reset() episode_reward = 0 while not done: action, result = agent.act( [ob], True, use_perturbed_action=False) next_ob, reward, done, info = env.step(action[0]) ob = next_ob episode_reward += reward test_rewards.append(episode_reward) print("[evaluation] average reward of 20 episodes:", np.mean(test_rewards)) print('train at ', i) return np.mean(test_rewards)
def main(): with open(FLAGS.config, 'r') as ips: config = json.load(ips) print(config) env = gym.make("CartPole-v0") env.seed(0) agent_class = agents[config["agent"]["type"]] agent = agent_class(env.observation_space, env.action_space, agent_config=config["agent"], model_config=config["model"], distributed_spec={ "ps_hosts": FLAGS.ps_hosts, "memory_hosts": FLAGS.memory_hosts, "actor_hosts": FLAGS.actor_hosts, "learner_hosts": FLAGS.learner_hosts, "job_name": FLAGS.job_name, "task_index": FLAGS.task_index }, custom_model=MyPPOModel, checkpoint_dir=None) all_cost = time.time() if FLAGS.job_name == "ps": print("ps starts===>") agent.join() elif FLAGS.job_name == "memory": print("memory starts===>") while not agent.should_stop(): agent.communicate() print(agent._receive_count, "actor2mem_q:", agent._actor2mem_q.qsize(), "mem2learner_q:", agent._mem2learner_q.qsize()) sys.stdout.flush() time.sleep(0.1) elif FLAGS.job_name == "actor": print("actor starts===>") act_count = 0 reward_window = WindowStat("reward", 50) length_window = WindowStat("length", 50) obs, actions, rewards, dones, value_preds, logits = list(), list( ), list(), list(), list(), list() agent.sync_vars() while not agent.should_stop(): ob = env.reset() done = False episode_reward = .0 episode_len = 0 print("begin an episode.") sys.stdout.flush() while not done and not agent.should_stop(): action, results = agent.act([ob], False) act_count += 1 new_ob, reward, done, info = env.step(action[0]) obs.append(ob) actions.append(action[0]) rewards.append(0.1 * reward) dones.append(done) logits.append(results["logits"][0]) value_preds.append(results["value_preds"][0]) if agent.ready_to_send: print("to send exp.") sys.stdout.flush() agent.send_experience(obs=obs, actions=actions, rewards=rewards, dones=dones, logits=logits, value_preds=value_preds) agent.sync_vars() print("sent") sys.stdout.flush() ob = new_ob episode_reward += reward episode_len += 1 print("act_count:", act_count) reward_window.push(episode_reward) length_window.push(episode_len) print(reward_window) print(length_window) sys.stdout.flush() elif FLAGS.job_name == "learner": print("learner starts===>") train_count = 0 while not agent.should_stop(): batch_data = agent.receive_experience() if batch_data: extra_data = agent.learn(batch_data) train_count += 1 print("learning {}".format(extra_data)) sys.stdout.flush() else: raise ValueError("Invalid job_name.") all_cost = time.time() - all_cost print("done. all_cost:", all_cost)
def doTestSavedModel(self): trial_timestamp = time.strftime("%Y%m%d-%H%M%S") model_dir = "model_dir_{}".format(trial_timestamp) os.system("mkdir {}".format(model_dir)) np.random.seed(0) env = gym.make("CartPole-v0") env.seed(0) dqn_g = tf.Graph() with dqn_g.as_default(): tf.set_random_seed(123) agent = agents[DQN_AGENT_CONFIG["type"]](env.observation_space, env.action_space, DQN_AGENT_CONFIG, DQN_MODEL_CONFIG, export_dir=model_dir, distributed_spec={}) reward_window = WindowStat("reward", 50) obs, actions, rewards, next_obs, dones = list(), list(), list(), list( ), list() act_count = 0 for i in range(500): ob = env.reset() done = False episode_reward = .0 while not done: action, results = agent.act([ob], deterministic=False, use_perturbed_action=False) next_ob, reward, done, info = env.step(action[0]) act_count += 1 obs.append(ob) actions.append(action[0]) rewards.append(reward) next_obs.append(next_ob) dones.append(done) if agent.ready_to_send: agent.send_experience(obs=obs, actions=actions, rewards=rewards, next_obs=next_obs, dones=dones) if agent.ready_to_receive: batch_data = agent.receive_experience() res = agent.learn(batch_data) if DQN_AGENT_CONFIG.get("prioritized_replay", False): agent.update_priorities(indexes=batch_data["indexes"], td_error=res["td_error"]) ob = next_ob episode_reward += reward if act_count % 1024 == 0: print("timestep:", act_count, reward_window) agent.add_episode(1) reward_window.push(episode_reward) prev_perf = reward_window.stats()["reward_mean"] print("Performance before saving is {}".format(prev_perf)) with tf.Session() as sess: path = model_dir MetaGraphDef = tf.saved_model.loader.load( sess, tags=[sm.tag_constants.SERVING], export_dir=path) # get SignatureDef protobuf SignatureDef_d = MetaGraphDef.signature_def SignatureDef = SignatureDef_d["predict_results"] # get inputs/outputs TensorInfo protobuf ph_inputs = {} for name, ts_info in SignatureDef.inputs.items(): ph_inputs[name] = sm.utils.get_tensor_from_tensor_info( ts_info, sess.graph) outputs = {} for name, ts_info in SignatureDef.outputs.items(): outputs[name] = sm.utils.get_tensor_from_tensor_info( ts_info, sess.graph) for name, ph in ph_inputs.items(): print(name, ph) for name, ts in outputs.items(): print(name, ts) reward_window = WindowStat("reward", 10) for i in range(10): ob = env.reset() done = False episode_reward = .0 while not done: action = sess.run(outputs["output_actions"], feed_dict={ ph_inputs["obs_ph"]: [np.asarray(ob)], ph_inputs["deterministic_ph"]: True }) next_ob, reward, done, info = env.step(action[0]) episode_reward += reward ob = next_ob reward_window.push(episode_reward) cur_perf = reward_window.stats()["reward_mean"] print("Performance after restore is {}".format(cur_perf)) return prev_perf - cur_perf