def make_atari_env():
    # Pong with standard DeepMind preprocessing: episodic life, 4-frame
    # stacking, and frames resized to 42x42 (rewards left unclipped).
    env = make_atari("PongNoFrameskip-v4")
    env = wrap_deepmind(
        env=env,
        frame_stack=True,
        clip_rewards=False,
        episode_life=True,
        wrap_frame=True,
        frame_resize=42)
    return env
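# A minimal sanity check for the wrapper above (a sketch; assumes numpy is
# imported as np and the easy_rl-style helpers used above are in scope):
def _check_env_shape():
    env = make_atari_env()
    ob = np.asarray(env.reset())
    # With wrap_frame=True, frame_resize=42 and frame_stack=True, the
    # observation should be a 42x42 grayscale image with 4 stacked frames.
    print(ob.shape)  # e.g. (42, 42, 4)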
def main():
    # Single-process DQN training loop on Pong.
    env = make_atari("PongNoFrameskip-v4")
    env = wrap_deepmind(
        env=env,
        frame_stack=True,
        clip_rewards=False,
        episode_life=True,
        wrap_frame=True,
        frame_resize=42)

    agent_class = agents[AGENT_CONFIG["type"]]
    agent = agent_class(
        env.observation_space,
        env.action_space,
        AGENT_CONFIG,
        MODEL_CONFIG,
        distributed_spec={},
        custom_model=MyDQNModel)

    # Moving-window statistics over the last 10 episodes/updates.
    reward_window = WindowStat("reward", 10)
    length_window = WindowStat("length", 10)
    loss_window = WindowStat("loss", 10)

    obs, actions, rewards, next_obs, dones = [], [], [], [], []

    for i in range(2000):
        ob = env.reset()
        ob = np.asarray(ob)
        done = False
        episode_reward = .0
        episode_len = 0

        while not done:
            action, results = agent.act(
                [ob], deterministic=False, use_perturbed_action=False)
            next_ob, reward, done, info = env.step(action[0])
            next_ob = np.asarray(next_ob)

            obs.append(ob)
            actions.append(action[0])
            rewards.append(reward)
            next_obs.append(next_ob)
            dones.append(done)

            if agent.ready_to_send:
                agent.send_experience(
                    obs=obs,
                    actions=actions,
                    rewards=rewards,
                    next_obs=next_obs,
                    dones=dones)
            if agent.ready_to_receive:
                batch_data = agent.receive_experience()
                res = agent.learn(batch_data)
                loss_window.push(res["loss"])

                if AGENT_CONFIG.get("prioritized_replay", False):
                    agent.update_priorities(
                        indexes=batch_data["indexes"],
                        td_error=res["td_error"])

            ob = next_ob
            episode_reward += reward
            episode_len += 1

        agent.add_episode(1)
        reward_window.push(episode_reward)
        length_window.push(episode_len)

        if i % 10 == 0:
            print("episode at", i)
            print(reward_window)
            print(length_window)
            print(loss_window)

    print("Done.")
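# The loop above assumes module-level AGENT_CONFIG / MODEL_CONFIG dicts.
# A hypothetical minimal shape (only "type" and "prioritized_replay" are
# read explicitly by the loop; the registry key "DQN" is an assumption):
#
# AGENT_CONFIG = {
#     "type": "DQN",               # looked up in the `agents` registry
#     "prioritized_replay": True,  # enables agent.update_priorities(...)
# }
# MODEL_CONFIG = {}                # passed through to MyDQNModel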
def main():
    # Distributed V-trace-style training: the process's role is selected by
    # --job_name (ps / memory / actor / learner).
    with open(FLAGS.config, 'r') as ips:
        config = json.load(ips)
    print(config)

    job_name = FLAGS.job_name
    env = make_atari("PongNoFrameskip-v4")
    env = wrap_deepmind(
        env=env,
        frame_stack=True,
        clip_rewards=False,
        episode_life=True,
        wrap_frame=True,
        frame_resize=42)

    agent_class = agents[config["agent"]["type"]]
    agent = agent_class(
        env.observation_space,
        env.action_space,
        config["agent"],
        config["model"],
        distributed_spec={
            "ps_hosts": FLAGS.ps_hosts,
            "memory_hosts": FLAGS.memory_hosts,
            "actor_hosts": FLAGS.actor_hosts,
            "learner_hosts": FLAGS.learner_hosts,
            "job_name": FLAGS.job_name,
            "task_index": FLAGS.task_index
        },
        custom_model=MyVTmodel,
        checkpoint_dir=None)

    all_cost = time.time()
    if job_name == "ps":
        print("ps starts===>")
        agent.join()
    elif job_name == "memory":
        start_tt = time.time()
        log_count = 0
        print("memory starts===>")
        while not agent.should_stop():
            agent.communicate()
            # Log queue depths roughly once per second.
            if time.time() - start_tt > log_count:
                log_count += 1
                print(agent._receive_count,
                      "actor2mem_q:", agent._actor2mem_q.qsize(),
                      "mem2learner_q:", agent._mem2learner_q.qsize())
                sys.stdout.flush()
    elif job_name == "actor":
        print("actor starts===>")
        start_tt = time.time()
        log_count = 0
        act_log_count = 0

        # Create a vectorized environment: one seeded Pong copy per rank.
        def make_env(rank):
            def make_atari_env():
                env = make_atari("PongNoFrameskip-v4")
                env = wrap_deepmind(
                    env=env,
                    frame_stack=True,
                    clip_rewards=False,
                    episode_life=True,
                    wrap_frame=True,
                    frame_resize=42)
                env.seed(rank)
                return env
            return make_atari_env

        num_env = config["agent"].get("num_env", 1)
        vec_env = VectorizedEnvironment(
            make_env=make_env, num_env=num_env, seed=100 * FLAGS.task_index)

        act_count = 0
        reward_window = WindowStat("reward", 10)
        length_window = WindowStat("length", 10)
        obs, actions, rewards, dones, logits = [], [], [], [], []

        agent.sync_vars()
        while not agent.should_stop():
            ob = vec_env.reset()
            episode_reward = np.zeros(num_env, )
            episode_len = np.zeros(num_env, )
            while not agent.should_stop():
                action, results = agent.act(ob, False)
                act_count += 1
                new_ob, reward, done, info = vec_env.step(action)

                obs.append(ob)
                actions.append(action)
                rewards.append(reward)
                dones.append(done)
                logits.append(results["logits"])

                if agent.ready_to_send:
                    agent.send_experience(
                        obs=obs,
                        actions=actions,
                        rewards=rewards,
                        dones=dones,
                        logits=logits,
                        vec_env=True,
                        num_env=num_env)
                    # Pull the latest policy weights after shipping a batch.
                    agent.sync_vars()

                ob = new_ob
                episode_reward += np.asarray(reward)
                episode_len += 1
                # Per-env episode bookkeeping: reset stats on termination.
                for i in range(num_env):
                    if done[i]:
                        reward_window.push(episode_reward[i])
                        length_window.push(episode_len[i])
                        episode_reward[i] = .0
                        episode_len[i] = 0

                total_cost = time.time() - start_tt
                if int(total_cost / 5) > log_count:
                    log_count += 1
                    print("act_count:", act_count,
                          "actor2mem_q:", agent._actor2mem_q.qsize(),
                          "total:", total_cost)
                    print("total_cost:", total_cost, reward_window)
                    print(length_window)
                    sys.stdout.flush()
                if int((act_count * num_env) / 10000) > act_log_count:
                    act_log_count += 1
                    print("timestep:", act_log_count * 10000, reward_window)
    elif job_name == "learner":
        print("learner starts===>")
        start_tt = time.time()
        train_count = 0
        try:
            while not agent.should_stop():
                batch_data = agent.receive_experience()
                if batch_data:
                    extra_data = agent.learn(batch_data)
                    train_count += 1
                    print("learning {}".format(extra_data),
                          "receive_q:", agent._receive_q.qsize())
                    print("train_count:", train_count,
                          "total:", time.time() - start_tt)
                    sys.stdout.flush()
        except tf.errors.OutOfRangeError:
            print("memory has stopped.")
    else:
        raise ValueError("Invalid job_name.")

    all_cost = time.time() - all_cost
    print("done. all_cost:", all_cost)
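# A hypothetical local launch, one process per role (script name and
# host:port values are placeholders; flag names match the FLAGS read above):
#
# python vtrace_pong.py --config=vtrace.json --job_name=ps --task_index=0 \
#     --ps_hosts=localhost:8000 --memory_hosts=localhost:8001 \
#     --actor_hosts=localhost:8002,localhost:8003 \
#     --learner_hosts=localhost:8004
#
# Repeat with --job_name=memory/actor/learner and the matching --task_index
# for each host listed in the corresponding *_hosts flag.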
def main():
    # Evaluate a model exported via tf.saved_model by rolling it out in an env.
    gym_env = gym.make("CartPole-v0")
    atari_env = make_atari("PongNoFrameskip-v4")
    atari_env = wrap_deepmind(
        env=atari_env,
        frame_stack=True,
        clip_rewards=False,
        episode_life=True,
        wrap_frame=True,
        frame_resize=42)

    # Replace the following env according to your saved_model.
    # env = atari_env
    env = gym_env

    with tf.Session() as sess:
        path = 'dump_dir'
        meta_graph_def = tf.saved_model.loader.load(
            sess, tags=[sm.tag_constants.SERVING], export_dir=path)

        # Get the SignatureDef protobuf holding the prediction signature.
        signature_def_map = meta_graph_def.signature_def
        signature_def = signature_def_map["predict_results"]

        # Resolve inputs/outputs TensorInfo protobufs to graph tensors.
        ph_inputs = {}
        for name, ts_info in signature_def.inputs.items():
            ph_inputs[name] = sm.utils.get_tensor_from_tensor_info(
                ts_info, sess.graph)
        outputs = {}
        for name, ts_info in signature_def.outputs.items():
            outputs[name] = sm.utils.get_tensor_from_tensor_info(
                ts_info, sess.graph)

        for name, ph in ph_inputs.items():
            print(name, ph)
        for name, ts in outputs.items():
            print(name, ts)

        len_window = WindowStat("length", 50)
        reward_window = WindowStat("reward", 50)

        for i in range(100):
            ob = env.reset()
            env.render()
            time.sleep(0.2)
            done = False
            episode_len = 0
            episode_reward = .0

            while not done:
                action = sess.run(
                    outputs["output_actions"],
                    feed_dict={
                        ph_inputs["obs_ph"]: [np.asarray(ob)],
                        ph_inputs["deterministic_ph"]: True
                    })
                next_ob, reward, done, info = env.step(action[0])
                env.render()
                time.sleep(0.1)
                episode_reward += reward
                episode_len += 1
                ob = next_ob

            len_window.push(episode_len)
            reward_window.push(episode_reward)
            print(reward_window)
            print(len_window)
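# To check which signatures and tensors the export actually contains before
# wiring feeds, TensorFlow's saved_model_cli can be used from the shell:
#
#   saved_model_cli show --dir dump_dir --all
#
# The "predict_results" signature with "obs_ph"/"deterministic_ph" inputs and
# the "output_actions" output, as used above, should appear in that listing.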