# Run this from the terminal and make sure you are loading the appropriate
# environment variables:
# $ echo $LD_LIBRARY_PATH
import gym

from tf_rl.common.monitor import Monitor
import environments.register as register  # registers the custom envs (e.g. CentipedeSix-v1) with gym

video_dir = "./video/"
temp = 5  # record every `temp`-th episode

env = gym.make("CentipedeSix-v1")
env = Monitor(env, video_dir, force=True)

for ep in range(10):
    if ep % temp == 0:
        print("recording")
        env.record_start()

    env.reset()
    done = False
    while not done:
        # env.render()
        action = env.action_space.sample()
        s, r, done, info = env.step(action)  # take a random action

    if ep % temp == 0:
        env.record_end()
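# --- Sketch only: the real Monitor wrapper ships with tf_rl.common.monitor. ---
# The record_start()/record_end() calls above are assumed to toggle a video
# recorder around the wrapped env. A minimal wrapper exposing the same two
# methods, built on gym's bundled VideoRecorder, could look roughly like this
# (class name, file naming, and frame-capture points are assumptions, not the
# tf_rl implementation):
import os

import gym
from gym.wrappers.monitoring.video_recorder import VideoRecorder


class ToggleRecorder(gym.Wrapper):
    """Hypothetical wrapper that records frames only between record_start() and record_end()."""

    def __init__(self, env, directory):
        super(ToggleRecorder, self).__init__(env)
        os.makedirs(directory, exist_ok=True)
        self.directory = directory
        self.recorder = None
        self.episode_id = 0

    def record_start(self):
        # open a fresh video file for the upcoming episode
        path = os.path.join(self.directory, "episode_{}.mp4".format(self.episode_id))
        self.recorder = VideoRecorder(self.env, path=path)

    def record_end(self):
        # flush and close the video file, then stop capturing
        if self.recorder is not None:
            self.recorder.close()
            self.recorder = None
        self.episode_id += 1

    def reset(self, **kwargs):
        obs = self.env.reset(**kwargs)
        if self.recorder is not None:
            self.recorder.capture_frame()
        return obs

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        if self.recorder is not None:
            self.recorder.capture_frame()
        return obs, reward, done, info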
# --- DDPG evaluation loop (excerpt): roll out the trained agent and record the episodes ---
all_distances, all_rewards, all_actions = list(), list(), list()
distance_func = get_distance(agent.params.env_name)  # create the distance measure func
print("=== Evaluation Mode ===")

for ep in range(params.n_trial):
    env.record_start()
    obs = env.reset()
    state = obs["flat_obs"]
    done = False
    episode_reward = 0

    while not done:
        action = agent.eval_predict(state)
        # action = env.action_space.sample()
        # scale for execution in env (in DDPG, every action is clipped between [-1, 1] in agent.predict)
        obs, reward, done, info = env.step(action * env.action_space.high)
        # print(action, reward)
        next_flat_state, next_graph_state = obs["flat_obs"], obs["graph_obs"]
        distance = distance_func(action, reward, info)
        all_actions.append(action.mean() ** 2)  # squared mean of the action values
        all_distances.append(distance)
        state = next_flat_state
        episode_reward += reward

    all_rewards.append(episode_reward)
    tf.contrib.summary.scalar("Evaluation Score", episode_reward, step=agent.index_timestep)
    print("| Ep: {}/{} | Score: {} |".format(ep + 1, params.n_trial, episode_reward))
    env.record_end()
# --- DDPG training loop (excerpt) ---
# for summary purposes, we keep all of the code inside this context
with tf.contrib.summary.always_record_summaries():
    for i in itertools.count():
        state = env.reset()
        total_reward = 0
        start = time.time()
        done = False
        episode_len = 0

        while not done:
            if global_timestep.numpy() < agent.params.learning_start:
                action = env.action_space.sample()
            else:
                action = agent.predict(state)

            # scale for execution in env (in DDPG, every action is clipped between [-1, 1] in agent.predict)
            next_state, reward, done, info = env.step(action * env.action_space.high)
            replay_buffer.add(state, action, reward, next_state, done)

            # === Update the models ===
            if global_timestep.numpy() > agent.params.learning_start:
                states, actions, rewards, next_states, dones = replay_buffer.sample(agent.params.batch_size)
                loss = agent.update(states, actions, rewards, next_states, dones)
                soft_target_model_update_eager(agent.target_actor, agent.actor, tau=agent.params.soft_update_tau)
                soft_target_model_update_eager(agent.target_critic, agent.critic, tau=agent.params.soft_update_tau)
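# --- Sketch only: soft_target_model_update_eager is imported from tf_rl in the real script. ---
# The standard DDPG soft (Polyak) target update it is expected to perform is
# target <- tau * source + (1 - tau) * target. A minimal eager-mode version,
# assuming the actors/critics are tf.keras models, could look roughly like this
# (name and signature here are illustrative, not the library's):
def polyak_update(target_model, source_model, tau=0.005):
    """Blend source weights into the target weights in-place."""
    for target_var, source_var in zip(target_model.trainable_variables,
                                      source_model.trainable_variables):
        target_var.assign(tau * source_var + (1.0 - tau) * target_var)

# usage (hypothetical): polyak_update(agent.target_actor, agent.actor, tau=agent.params.soft_update_tau)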
# Record one episode with random actions for each MuJoCo environment.
import gym

from tf_rl.common.monitor import Monitor

ENVS = [
    "Ant-v2",
    "HalfCheetah-v2",
    "Hopper-v2",
    "Humanoid-v2",
    # "Reacher-v2",
    # "Swimmer-v2",
    "Walker2d-v2"
]
DEFAULT = 250

for env_name in ENVS:
    env = gym.make(env_name)
    env = Monitor(env, "./video/{}".format(env_name), force=True)
    print(env_name)
    env.record_start()
    env.reset()
    done = False
    while not done:
        # env.render(mode="human", annotation_flg=False)
        s, r, done, i = env.step(env.action_space.sample())
    env.record_end()
    env.close()
import gym

from tf_rl.common.monitor import Monitor

env = gym.make('CartPole-v0')
env = Monitor(env=env, directory="./video/cartpole", force=True)

for ep in range(20):
    if ep == 0:
        env.record_start()

    state = env.reset()
    for t in range(1000):
        action = env.action_space.sample()
        state, reward, done, info = env.step(action)
        if done:
            if ep == 0:
                env.record_end()
            break

env.close()