def test_video_callable_records_videos():
    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')
        env = Monitor(env, temp)
        env.reset()
        env.close()
        results = monitoring.load_results(temp)
        assert len(results['videos']) == 1, "Videos: {}".format(results['videos'])
def test_monitor_filename():
    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')
        env = Monitor(env, directory=temp)
        env.close()
        manifests = glob.glob(os.path.join(temp, '*.manifest.*'))
        assert len(manifests) == 1
def test_video_callable_false_does_not_record():
    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')
        env = Monitor(env, temp, video_callable=False)
        env.reset()
        env.close()
        results = monitoring.load_results(temp)
        assert len(results['videos']) == 0
class OpenAIGym(Environment):

    def __init__(self, gym_id, monitor=None, monitor_safe=False, monitor_video=0):
        """
        Initialize OpenAI Gym.

        Args:
            gym_id: OpenAI Gym environment ID. See https://gym.openai.com/envs
            monitor: Output directory. Setting this to None disables monitoring.
            monitor_safe: Setting this to True prevents existing log files from being overwritten. Default False.
            monitor_video: Save a video every monitor_video steps. Setting this to 0 disables recording of videos.
        """
        self.gym_id = gym_id
        self.gym = gym.make(gym_id)  # Might raise gym.error.UnregisteredEnv or gym.error.DeprecatedEnv

        if monitor:
            if monitor_video == 0:
                video_callable = False
            else:
                video_callable = (lambda x: x % monitor_video == 0)
            self.gym = Monitor(self.gym, monitor, force=not monitor_safe, video_callable=video_callable)

    def __str__(self):
        return 'OpenAIGym({})'.format(self.gym_id)

    def close(self):
        self.gym = None

    def reset(self):
        return self.gym.reset()

    def execute(self, action):
        if isinstance(self.gym.action_space, gym.spaces.Box):
            action = [action]  # some gym environments expect a list (e.g. Pendulum-v0)
        state, reward, terminal, _ = self.gym.step(action)
        return state, reward, terminal

    @property
    def states(self):
        if isinstance(self.gym.observation_space, Discrete):
            return dict(shape=(), type='float')
        else:
            return dict(shape=tuple(self.gym.observation_space.shape), type='float')

    @property
    def actions(self):
        if isinstance(self.gym.action_space, Discrete):
            return dict(continuous=False, num_actions=self.gym.action_space.n)
        elif len(self.gym.action_space.shape) == 1:
            return dict(continuous=True)
        elif len(self.gym.action_space.shape) > 1:
            return {'action' + str(n): dict(continuous=True) for n in range(len(self.gym.action_space.shape))}
        else:
            raise TensorForceError()

    def monitor(self, path):
        self.gym = Monitor(self.gym, path)
class GymEnvironment(VideoCapableEnvironment):
    """
    Wraps an OpenAI Gym environment.
    """

    def __init__(self, env_name, state_builder=ALEStateBuilder(), repeat_action=4, no_op=30, monitoring_path=None):
        assert isinstance(state_builder, StateBuilder), 'state_builder should inherit from StateBuilder'
        assert isinstance(repeat_action, (int, tuple)), 'repeat_action should be int or tuple'
        if isinstance(repeat_action, int):
            assert repeat_action >= 1, "repeat_action should be >= 1"
        elif isinstance(repeat_action, tuple):
            assert len(repeat_action) == 2, 'repeat_action should be a length-2 tuple: (min frameskip, max frameskip)'
            assert repeat_action[0] < repeat_action[1], 'repeat_action[0] should be < repeat_action[1]'

        super(GymEnvironment, self).__init__()

        self._state_builder = state_builder
        self._env = gym.make(env_name)
        self._env.env.frameskip = repeat_action
        self._no_op = max(0, no_op)
        self._done = True

        if monitoring_path is not None:
            self._env = Monitor(self._env, monitoring_path, video_callable=need_record)

    @property
    def available_actions(self):
        return self._env.action_space.n

    @property
    def state(self):
        return None if self._state is None else self._state_builder(self._state)

    @property
    def lives(self):
        return self._env.env.ale.lives()

    @property
    def frame(self):
        return Image.fromarray(self._state)

    def do(self, action):
        self._state, self._reward, self._done, _ = self._env.step(action)
        self._score += self._reward
        return self.state, self._reward, self._done

    def reset(self):
        super(GymEnvironment, self).reset()

        self._state = self._env.reset()

        # Random number of initial no-ops to introduce stochasticity
        if self._no_op > 0:
            for _ in six.moves.range(np.random.randint(1, self._no_op)):
                self._state, _, _, _ = self._env.step(0)

        return self.state
class GymEnvironment(Environment):

    def __init__(self, env_id, directory=None, force=True, monitor_video=0):
        super(GymEnvironment, self).__init__(env_id=env_id)
        self._env = gym.make(env_id)
        if directory:
            if monitor_video == 0:
                video_callable = False
            else:
                video_callable = (lambda x: x % monitor_video == 0)
            self._env = Monitor(self._env, directory, video_callable=video_callable, force=force)

    def __str__(self):
        return 'OpenAIGym({})'.format(self._env_id)

    def close(self):
        if not self._closed:
            self._env.close()
            self._closed = True

    def reset(self, return_spec=True):
        self._reset()
        state = self._env.reset()
        if return_spec:
            return EnvSpec(action=None, state=None, reward=0, done=False, next_state=state)
        return state

    def step(self, action, state, return_spec=True):
        self._step()
        if isinstance(action, (list, np.ndarray)):
            if isinstance(self._env.action_space, Discrete) or isinstance(action, (list, np.ndarray)):
                action = action[0]
        if isinstance(self._env.action_space, Box) and not isinstance(action, (list, np.ndarray)):
            action = list(action)
        next_state, reward, done, _ = self._env.step(action)
        if return_spec:
            return EnvSpec(action=action, state=state, reward=reward, done=done, next_state=next_state)
        return next_state, reward, done

    @property
    def num_states(self):
        return self._env.observation_space.shape[0]

    @property
    def num_actions(self):
        if isinstance(self._env.action_space, Box):
            return self._env.action_space.shape[0]
        else:
            return self._env.action_space.n

    @property
    def is_continuous(self):
        return not isinstance(self._env.action_space, Discrete)
def test_write_upon_reset_false():
    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')
        env = Monitor(env, directory=temp, video_callable=False, write_upon_reset=False)
        env.reset()

        files = glob.glob(os.path.join(temp, '*'))
        assert not files, "Files: {}".format(files)

        env.close()
        files = glob.glob(os.path.join(temp, '*'))
        assert len(files) > 0
def test_only_complete_episodes_written():
    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')
        env = Monitor(env, temp, video_callable=False)
        env.reset()
        d = False
        while not d:
            _, _, d, _ = env.step(env.action_space.sample())

        env.reset()
        env.step(env.action_space.sample())

        env.close()

        # Only 1 episode should be written
        results = monitoring.load_results(temp)
        assert len(results['episode_lengths']) == 1, "Found {} episodes written; expecting 1".format(len(results['episode_lengths']))
def get_new_env(env_name, cmdl):
    """Configure the training environment and return an instance."""
    import logging

    import gym
    import gym_fast_envs  # noqa
    from gym.wrappers import Monitor

    # Undo the default logger and configure a new one.
    gym.undo_logger_setup()
    logger = logging.getLogger()
    logger.setLevel(logging.WARNING)

    # Configure environment
    outdir = '/tmp/nec/%s-results' % cmdl.label
    env = gym.make(env_name)
    env = Monitor(env, directory=outdir, force=True, video_callable=False)
    env.seed(cmdl.seed)
    return env
def test_semisuper_succeeds():
    """Regression test. Ensure that this can write"""
    with helpers.tempdir() as temp:
        env = gym.make('SemisuperPendulumDecay-v0')
        env = Monitor(env, temp)
        env.reset()
        env.step(env.action_space.sample())
        env.close()
def test_env_reuse():
    with helpers.tempdir() as temp:
        env = gym.make('Autoreset-v0')
        env = Monitor(env, temp)

        env.reset()

        _, _, done, _ = env.step(None)
        assert not done
        _, _, done, _ = env.step(None)
        assert done
        _, _, done, _ = env.step(None)
        assert not done
        _, _, done, _ = env.step(None)
        assert done

        env.close()
def cart_pole_with_qlearning():
    from gym.wrappers import Monitor
    env = gym.make('CartPole-v0')
    experiment_filename = './cartpole-experiment-1'
    env = Monitor(env, experiment_filename, force=True)
    observation = env.reset()

    goal_average_steps = 195
    max_number_of_steps = 200
    number_of_iterations_to_average = 100

    number_of_features = env.observation_space.shape[0]
    last_time_steps = np.ndarray(0)

    cart_position_bins = pd.cut([-2.4, 2.4], bins=10, retbins=True)[1][1:-1]
    pole_angle_bins = pd.cut([-2, 2], bins=10, retbins=True)[1][1:-1]
    cart_velocity_bins = pd.cut([-1, 1], bins=10, retbins=True)[1][1:-1]
    angle_rate_bins = pd.cut([-3.5, 3.5], bins=10, retbins=True)[1][1:-1]

    learner = QLearner(state_discretization=Binning([[-2.4, 2.4], [-2, 2], [-1., 1], [-3.5, 3.5]], [10] * 4),
                       discrete_actions=[i for i in range(env.action_space.n)],
                       alpha=0.2,
                       gamma=1,
                       random_action_rate=0.5,
                       random_action_decay_rate=0.99)

    for episode in range(50000):
        action = learner.set_initial_state(observation)

        for step in range(max_number_of_steps - 1):
            observation, reward, done, info = env.step(action)

            if done:
                reward = -200
                observation = env.reset()

            action = learner.move(observation, reward)

            if done:
                last_time_steps = np.append(last_time_steps, [int(step + 1)])
                if len(last_time_steps) > number_of_iterations_to_average:
                    last_time_steps = np.delete(last_time_steps, 0)
                break

        if last_time_steps.mean() > goal_average_steps:
            print("Goal reached!")
            print("Episodes before solve: ", episode + 1)
            print("Best 100-episode performance {} {} {}".format(last_time_steps.max(),
                                                                 chr(177),  # plus-minus sign
                                                                 last_time_steps.std()))
            break

    env.close()
def test_write_upon_reset_true():
    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')
        # TODO: Fix Cartpole to not configure itself automatically
        # assert not env._configured
        env = Monitor(env, directory=temp, video_callable=False, write_upon_reset=True)
        env.configure()
        env.reset()

        files = glob.glob(os.path.join(temp, '*'))
        assert len(files) > 0, "Files: {}".format(files)

        env.close()
        files = glob.glob(os.path.join(temp, '*'))
        assert len(files) > 0
def test_steps_limit_restart():
    with helpers.tempdir() as temp:
        env = gym.make('test.StepsLimitCartpole-v0')
        env = Monitor(env, temp, video_callable=False)
        env.reset()

        # Episode has started
        _, _, done, info = env.step(env.action_space.sample())
        assert done == False

        # Limit reached, now we get a done signal and the env resets itself
        _, _, done, info = env.step(env.action_space.sample())
        assert done == True
        assert env.episode_id == 1

        env.close()
def evaluate(self, n_games=1, save_path="./records", use_monitor=True, record_video=True, verbose=True, t_max=100000): """Plays an entire game start to end, records the logs(and possibly mp4 video), returns reward. :param save_path: where to save the report :param record_video: if True, records mp4 video :return: total reward (scalar) """ env = self.make_env() if not use_monitor and record_video: raise warn("Cannot video without gym monitor. If you still want video, set use_monitor to True") if record_video : env = Monitor(env,save_path,force=True) elif use_monitor: env = Monitor(env, save_path, video_callable=lambda i: False, force=True) game_rewards = [] for _ in range(n_games): # initial observation observation = env.reset() # initial memory prev_memories = [np.zeros((1,) + tuple(mem.output_shape[1:]), dtype=get_layer_dtype(mem)) for mem in self.agent.agent_states] t = 0 total_reward = 0 while True: res = self.agent_step(self.preprocess_observation(observation)[None, ...], *prev_memories) action, new_memories = res[0], res[1:] observation, reward, done, info = env.step(action[0]) total_reward += reward prev_memories = new_memories if done or t >= t_max: if verbose: print("Episode finished after {} timesteps with reward={}".format(t + 1, total_reward)) break t += 1 game_rewards.append(total_reward) env.close() del env return game_rewards
    updateTargetNetwork = 10000
    explorationRate = 1
    minibatch_size = 64
    learnStart = 64
    learningRate = 0.00025
    discountFactor = 0.99
    memorySize = 1000000
    network_inputs = 100
    network_outputs = 21
    network_structure = [300, 300]
    current_epoch = 0

    deepQ = DeepQ(network_inputs, network_outputs, memorySize, discountFactor, learningRate, learnStart)
    deepQ.initNetworks(network_structure)
    env = Monitor(env, directory=outdir, force=True, write_upon_reset=True)
else:
    # Load weights, monitor info and parameter info.
    # TODO: add try/except for this else branch
    with open(params_json) as outfile:
        d = json.load(outfile)
        epochs = d.get('epochs')
        steps = d.get('steps')
        updateTargetNetwork = d.get('updateTargetNetwork')
        explorationRate = d.get('explorationRate')
        minibatch_size = d.get('minibatch_size')
        learnStart = d.get('learnStart')
        learningRate = d.get('learningRate')
        discountFactor = d.get('discountFactor')
        memorySize = d.get('memorySize')
        network_inputs = d.get('network_inputs')
def test_no_monitor_reset_unless_done():
    def assert_reset_raises(env):
        errored = False
        try:
            env.reset()
        except error.Error:
            errored = True
        assert errored, "Env allowed a reset when it shouldn't have"

    with helpers.tempdir() as temp:
        # Make sure we can reset as we please without monitor
        env = gym.make('CartPole-v0')
        env.reset()
        env.step(env.action_space.sample())
        env.step(env.action_space.sample())
        env.reset()

        # can reset once as soon as we start
        env = Monitor(env, temp, video_callable=False)
        env.reset()

        # can reset multiple times in a row
        env.reset()
        env.reset()

        env.step(env.action_space.sample())
        env.step(env.action_space.sample())
        assert_reset_raises(env)

        # should allow resets after the episode is done
        d = False
        while not d:
            _, _, d, _ = env.step(env.action_space.sample())

        env.reset()
        env.reset()

        env.step(env.action_space.sample())
        assert_reset_raises(env)

        env.close()
class EnvWrapper(object):
    """
    Small wrapper for gym atari environments.
    Responsible for preprocessing screens and holding on to a screen buffer
    of size buffer_size from which environment state is constructed.
    """
    def __init__(self, gym_env, buffer_size, video_dir=None):
        self.env = gym_env
        if video_dir is not None:
            self.env = Monitor(env=self.env, directory=video_dir, resume=True)
        self.buffer_size = buffer_size
        self.num_actions = gym_env.action_space.n

        # TBD: Workaround for pong and breakout actions
        # Agent available actions, such as LEFT, RIGHT, NOOP, etc...
        self.gym_actions = range(gym_env.action_space.n)

        # Screen buffer of size buffer_size to be able to build
        # state arrays of size [1, buffer_size, 84, 84]
        self.state_buffer = deque()

    def start_state(self):
        """
        Resets the atari game, clears the state buffer.
        """
        # Clear the state buffer
        self.state_buffer = deque()

        x_t = self.env.reset()
        x_t = self.get_preprocessed_frame(x_t)
        s_t = np.stack([x_t for i in range(self.buffer_size)], axis=0)

        for i in range(self.buffer_size - 1):
            self.state_buffer.append(x_t)
        return s_t

    def get_preprocessed_frame(self, observation):
        """
        0) Atari frames: 210 x 160
        1) Get image grayscale
        2) Rescale image 110 x 84
        3) Crop center 84 x 84 (you can crop top/bottom according to the game)
        """
        return resize(rgb2gray(observation), (110, 84), mode='constant')[13:110 - 13, :]

    def step(self, action_index):
        """
        Executes an action in the gym environment.
        Builds current state (concatenation of buffer_size-1 previous
        frames and current one). Pops oldest frame, adds current frame to
        the state buffer. Returns current state.
        """
        # x_t1, r_t, terminal, info = self.env.step(self.gym_actions[action_index])
        x_t1, r_t, terminal, info = self.env.step(action_index)
        x_t1 = self.get_preprocessed_frame(x_t1)

        previous_frames = np.array(self.state_buffer)
        s_t1 = np.empty((self.buffer_size, 84, 84))
        s_t1[:self.buffer_size - 1, :] = previous_frames
        s_t1[self.buffer_size - 1] = x_t1

        # Pop the oldest frame, add the current frame to the queue
        self.state_buffer.popleft()
        self.state_buffer.append(x_t1)

        return s_t1, r_t, terminal, info
""" example.py """ import numpy as np import gym from gym.wrappers import Monitor import roboschool # make the ant environment env = gym.make("RoboschoolAnt-v1") # make a monitor to record the video monitor = Monitor(env, "randomAnt", force=True) monitor.reset() # run one episode of the random agent on Ant done = False while not done: _, _, done, _ = monitor.step(env.action_space.sample())
        sync_tensorboard=True,
        config=vars(args),
        name=experiment_name,
        monitor_gym=True,
        save_code=True)
writer = SummaryWriter(f"/tmp/{experiment_name}")

# TRY NOT TO MODIFY: seeding
device = torch.device('cuda' if torch.cuda.is_available() and args.cuda else 'cpu')
env = gym.make(args.gym_id)
env = wrap_atari(env)
env = gym.wrappers.RecordEpisodeStatistics(env)  # records episode reward in `info['episode']['r']`
if args.capture_video:
    env = Monitor(env, f'videos/{experiment_name}')
env = wrap_deepmind(
    env,
    clip_rewards=True,
    frame_stack=True,
    scale=False,
)
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.backends.cudnn.deterministic = args.torch_deterministic
env.seed(args.seed)
env.action_space.seed(args.seed)
env.observation_space.seed(args.seed)
# respect the default timelimit
assert isinstance(env.action_space,
for i in range(len_trajectory):
    p_guide = pyro.param("p_guide_{}".format(i), torch.ones(4) / 4, constraint=constraints.simplex)
    action = pyro.sample("action_{}".format(i), dist.Categorical(p_guide))
    prob_1 = convert_to_prob(transition(state_1, action), state_1)
    state_1 = unhash_state(pyro.sample("state_{}".format(i), dist.Categorical(torch.tensor(prob_1))))

# Build an environment
# Create and record episode - remove Monitor statement if recording not desired
env = Monitor(gym.make('one-random-evader-v0'),
              './tmp/pursuit_evasion_infer_pursuer_vs_random_evader',
              force=True)

# Reset state
state_gym = env.reset()
current_state = state

while (current_state != final_state):
    print("############################")
    print("Inferring new set of actions")
    print("############################")
    print()
    pyro.clear_param_store()
    svi = pyro.infer.SVI(model=agent_model,
                         guide=agent_guide,
                         optim=pyro.optim.SGD({
def wrap_env(env):
    env = Monitor(env, "/content/video", force=True)
    return env
        config=vars(args),
        name=experiment_name,
        monitor_gym=True,
        save_code=True)
writer = SummaryWriter(f"/tmp/{experiment_name}")

# TRY NOT TO MODIFY: seeding
device = torch.device('cuda' if torch.cuda.is_available() and args.cuda else 'cpu')
env = gym.make(args.gym_id)
env = wrap_atari(env)
env = gym.wrappers.RecordEpisodeStatistics(env)  # records episode reward in `info['episode']['r']`
if args.capture_video:
    env = QValueVisualizationWrapper(env)
    env = Monitor(env, f'videos/{experiment_name}')
env = wrap_deepmind(
    env,
    clip_rewards=True,
    frame_stack=True,
    scale=False,
)
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.backends.cudnn.deterministic = args.torch_deterministic
env.seed(args.seed)
env.action_space.seed(args.seed)
env.observation_space.seed(args.seed)
# respect the default timelimit
assert isinstance(env.action_space,
def main():
    global RENDER_DELAY

    parser = argparse.ArgumentParser(
        description=('Train policy on OpenAI Gym environment '
                     'using pepg, ses, openes, ga, cma'))
    parser.add_argument('gamename', type=str, help='robo_pendulum, robo_ant, robo_humanoid, etc.')
    parser.add_argument('-f', '--filename', type=str, help='json filename', default='none')
    parser.add_argument('-e', '--eval_steps', type=int, default=100,
                        help='evaluate this number of step if final_mode')
    parser.add_argument('-s', '--seed_start', type=int, default=0, help='initial seed')
    parser.add_argument('-w', '--single_weight', type=float, default=-100, help='single weight parameter')
    parser.add_argument('--stdev', type=float, default=2.0, help='standard deviation for weights')
    parser.add_argument('--sweep', type=int, default=-1,
                        help='sweep a set of weights from -2.0 to 2.0 sweep times.')
    parser.add_argument('--lo', type=float, default=-2.0, help='slow side of sweep.')
    parser.add_argument('--hi', type=float, default=2.0, help='high side of sweep.')

    args = parser.parse_args()

    assert len(sys.argv) > 1, 'python model.py gamename path_to_mode.json'

    gamename = args.gamename
    use_model = False
    game = config.games[gamename]
    filename = args.filename
    if filename != "none":
        use_model = True
        print("filename", filename)

    the_seed = args.seed_start

    model = make_model(game)
    print('model size', model.param_count)

    eval_steps = args.eval_steps
    single_weight = args.single_weight
    weight_stdev = args.stdev
    num_sweep = args.sweep
    sweep_lo = args.lo
    sweep_hi = args.hi

    model.make_env(render_mode=render_mode)

    if use_model:
        model.load_model(filename)
    else:
        if single_weight > -100:
            params = model.get_single_model_params(weight=single_weight - game.weight_bias)  # REMEMBER TO UNBIAS
            print("single weight value set to", single_weight)
        else:
            params = model.get_uniform_random_model_params(stdev=weight_stdev) - game.weight_bias
        model.set_model_params(params)

    if final_mode:
        if num_sweep > 1:
            the_weights = np.arange(sweep_lo, sweep_hi + (sweep_hi - sweep_lo) / num_sweep,
                                    (sweep_hi - sweep_lo) / num_sweep)
            for i in range(len(the_weights)):
                the_weight = the_weights[i]
                params = model.get_single_model_params(weight=the_weight - game.weight_bias)  # REMEMBER TO UNBIAS
                model.set_model_params(params)
                rewards = []
                for i in range(eval_steps):
                    reward, steps_taken = simulate(model,
                                                   train_mode=False,
                                                   render_mode=False,
                                                   num_episode=1,
                                                   seed=the_seed + i)
                    rewards.append(reward[0])
                print("weight", the_weight,
                      "average_reward", np.mean(rewards),
                      "standard_deviation", np.std(rewards))
        else:
            rewards = []
            for i in range(eval_steps):
                '''
                random uniform params
                params = model.get_uniform_random_model_params(stdev=weight_stdev)-game.weight_bias
                model.set_model_params(params)
                '''
                reward, steps_taken = simulate(model,
                                               train_mode=False,
                                               render_mode=False,
                                               num_episode=1,
                                               seed=the_seed + i)
                print(i, reward)
                rewards.append(reward[0])
            print("seed", the_seed,
                  "average_reward", np.mean(rewards),
                  "standard_deviation", np.std(rewards))
    else:
        if record_video:
            model.env = Monitor(model.env,
                                directory='/tmp/' + gamename,
                                video_callable=lambda episode_id: True,
                                write_upon_reset=True,
                                force=True)
        for i in range(1):
            reward, steps_taken = simulate(model,
                                           train_mode=False,
                                           render_mode=render_mode,
                                           num_episode=1,
                                           seed=the_seed + i)
            print("terminal reward", reward, "average steps taken", np.mean(steps_taken) + 1)
def wrap_gym_env(env):
    from gym.wrappers import Monitor
    env = Monitor(env, './video', force=True)
    return env
def monitor_start(self, instance_id, directory, force, resume):
    env = self._lookup_env(instance_id)
    self.envs[instance_id] = Monitor(env, directory, None, force, resume)
"wb")) # load model for testing dqn.load_weights( '/home/am/Desktop/set_tests/final/duel_dqn_%d_%s_weights.h5f' % (scale, ENV_NAME)) # setting up monitoring tools to record the testing episodes from gym import monitoring from gym.wrappers import Monitor def episode5(episode_id): if episode_id < 1: return True else: return False #rec = StatsRecorder(env,"sarsa_1") #rec.capture_frame() temp = '/home/am/Desktop/set_tests/final/duel_dqn_%d_%s' % (scale, ENV_NAME) env = Monitor(env, temp, force=True, video_callable=episode5) # testing dqn.test(env, nb_episodes=5, visualize=False, nb_max_episode_steps=2000) env.close() results = monitoring.load_results(temp)
def create_env(name_env, wrapped):
    env = gym.make(name_env)
    if wrapped:
        env = Monitor(env, './video', force=True)
    return env
    states_v = torch.tensor(states)
    actions_v = torch.tensor(actions)
    rewards_v = torch.tensor(rewards)
    last_states_v = torch.tensor(last_states)
    last_state_q_v = net(last_states_v)
    best_last_q_v = torch.max(last_state_q_v, dim=1)[0]
    best_last_q_v[done_masks] = 0.0
    return states_v, actions_v, best_last_q_v * gamma + rewards_v


if __name__ == "__main__":
    device = torch.device("cpu")
    env = make_env(DEFAULT_ENV_NAME)
    env = Monitor(env, directory="mon", force=True)

    net = DQN(env.observation_space.shape[0], HIDDEN_SIZE, env.action_space.n).to(device)
    tgt_net = ptan.agent.TargetNet(net)

    writer = SummaryWriter(comment="-" + DEFAULT_ENV_NAME)
    print(net)

    selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=EPSILON_START)
    agent = ptan.agent.DQNAgent(net, selector)

    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=GAMMA)
    buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=REPLAY_SIZE)
    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

    total_rewards = []
    best_m_reward = None
def deep_q_learning(sess,
                    env,
                    q_estimator,
                    target_estimator,
                    state_processor,
                    num_episodes,
                    experiment_dir,
                    replay_memory_size=500000,
                    replay_memory_init_size=50000,
                    update_target_estimator_every=10000,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=500000,
                    batch_size=32,
                    record_video_every=50):
    """
    Q-Learning algorithm for off-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        sess: Tensorflow Session object
        env: OpenAI environment
        q_estimator: Estimator object used for the q values
        target_estimator: Estimator object used for the targets
        state_processor: A StateProcessor object
        num_episodes: Number of episodes to run for
        experiment_dir: Directory to save Tensorflow summaries in
        replay_memory_size: Size of the replay memory
        replay_memory_init_size: Number of random experiences to sample when initializing
            the replay memory.
        update_target_estimator_every: Copy parameters from the Q estimator to the
            target estimator every N steps
        discount_factor: Gamma discount factor
        epsilon_start: Chance to sample a random action when taking an action.
            Epsilon is decayed over time and this is the start value
        epsilon_end: The final minimum value of epsilon after decaying is done
        epsilon_decay_steps: Number of steps to decay epsilon over
        batch_size: Size of batches to sample from the replay memory
        record_video_every: Record a video every N episodes

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """
    Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])

    # The replay memory
    replay_memory = []

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    # Create directories for checkpoints and summaries
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(experiment_dir, "monitor")

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)

    saver = tf.train.Saver()
    # Load a previous checkpoint if we find one
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint {}...\n".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)

    # Get the current time step
    total_t = sess.run(tf.contrib.framework.get_global_step())

    # The epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    # The policy we're following
    policy = make_epsilon_greedy_policy(q_estimator, len(VALID_ACTIONS))

    # Populate the replay memory with initial experience
    print("Populating replay memory...")

    ############################################################
    # YOUR CODE 1: Populate replay memory!
    # Hints: use function "populate_replay_buffer"
    # about 1 line of code
    replay_memory = populate_replay_buffer(sess, env, state_processor, replay_memory_init_size,
                                           VALID_ACTIONS, Transition, policy)

    # Record videos
    env = Monitor(env,
                  directory=monitor_path,
                  resume=True,
                  video_callable=lambda count: count % record_video_every == 0)

    for i_episode in range(num_episodes):

        # Save the current checkpoint
        saver.save(tf.get_default_session(), checkpoint_path)

        # Reset the environment
        state = env.reset()
        state = state_process(sess, state_processor, state)
        loss = None

        # One step in the environment
        for t in itertools.count():

            # Epsilon for this time step
            epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]

            # Add epsilon to Tensorboard
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=epsilon, tag="epsilon")
            q_estimator.summary_writer.add_summary(episode_summary, total_t)

            ###########################################################
            # YOUR CODE 2: Target network update
            # Hints: use function "copy_model_parameters"
            if total_t % update_target_estimator_every == 0:
                copy_model_parameters(sess, q_estimator, target_estimator)

            # Print out which step we're on, useful for debugging.
            print("\rStep {} ({}) @ Episode {}/{}, loss: {} Memory Len {} ".format(
                t, total_t, i_episode + 1, num_episodes, loss, len(replay_memory)), end="")
            sys.stdout.flush()

            ##############################################
            # YOUR CODE 3: Take a step in the environment
            # Hints 1: be careful to use function 'state_process' to deal with the RGB state
            # Hints 2: you can see function "populate_replay_buffer()" for detail about how to TAKE A STEP
            # about 2 or 3 lines of code
            action = np.random.choice(len(VALID_ACTIONS), p=policy(sess, state, epsilon))
            next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
            next_state = state_processor.process(sess, next_state)
            next_state = np.append(state[:, :, 1:], np.expand_dims(next_state, 2), axis=2)

            # If our replay memory is full, pop the first element
            if len(replay_memory) == replay_memory_size:
                replay_memory.pop(0)

            #############################
            # YOUR CODE 4: Save transition to replay memory
            # Hints: you can see function 'populate_replay_buffer' for detail
            # about 1 or 2 lines of code
            replay_memory.append(Transition(state, action, reward, next_state, done))

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            #########################################################
            # YOUR CODE 5: Sample a minibatch from the replay memory
            # Hints: can use function "random.sample(replay_memory, batch_size)" to get minibatch
            # about 1-2 lines of code
            minibatch = np.array(random.sample(replay_memory, batch_size))
            state_batch, action_batch, reward_batch, next_state_batch, done_batch = map(np.array, zip(*minibatch))

            ###########################################################
            # YOUR CODE 6: use minibatch sample to calculate q values and targets
            # Hints 1: use function 'q_estimator.predict' to get q values
            # Hints 2: use function 'target_estimator.predict' to get targets values
            #          remember targets = reward + gamma * max q(s, a')
            # about 2 lines of code
            q_next = q_estimator.predict(sess, next_state_batch)
            a_max = np.argmax(q_next, axis=1)
            q_target = target_estimator.predict(sess, next_state_batch)
            done_batch = np.invert(done_batch).astype('float32')
            targets = reward_batch + done_batch * discount_factor * q_target[np.arange(batch_size), a_max]
            # print(done_batch, targets, q_target[np.arange(batch_size), a_max])

            ################################################
            # YOUR CODE 7: Perform gradient descent update
            # Hints: use function 'q_estimator.update'
            # about 1 line of code
            loss = q_estimator.update(sess, state_batch, np.array(action_batch), targets)

            if done:
                break

            state = next_state
            total_t += 1

        # Add summaries to tensorboard
        episode_summary = tf.Summary()
        episode_summary.value.add(simple_value=stats.episode_rewards[i_episode],
                                  node_name="episode_reward",
                                  tag="episode_reward")
        episode_summary.value.add(simple_value=stats.episode_lengths[i_episode],
                                  node_name="episode_length",
                                  tag="episode_length")
        q_estimator.summary_writer.add_summary(episode_summary, total_t)
        q_estimator.summary_writer.flush()

        yield total_t, plotting.EpisodeStats(
            episode_lengths=stats.episode_lengths[:i_episode + 1],
            episode_rewards=stats.episode_rewards[:i_episode + 1])

    env.close()
    return stats
def main(args):
    global RENDER_DELAY

    env_name = args.env_name
    filename = args.filename
    the_seed = args.seed
    final_mode = args.final_mode
    generate_data_mode = args.generate_data_mode
    render_mode = args.render_mode
    record_video = args.record_video
    max_length = args.max_length

    if env_name.startswith("bullet"):
        RENDER_DELAY = True

    use_model = False

    model = make_model()
    print('model size', model.param_count)

    model.make_env(env_name, render_mode=render_mode)

    if len(filename) > 0:
        model.load_model(filename)
    else:
        params = model.get_random_model_params(stdev=0.1)
        model.set_model_params(params)

    if final_mode:
        total_reward = 0.0
        np.random.seed(the_seed)
        model.env.seed(the_seed)

        for i in range(100):
            reward, steps_taken = simulate(model,
                                           train_mode=False,
                                           render_mode=False,
                                           num_episode=1,
                                           max_len=max_length,
                                           generate_data_mode=False)
            total_reward += reward[0]
            print("episode", i, "reward =", reward[0])
        print("seed", the_seed, "average_reward", total_reward / 100)
    else:
        if record_video:
            model.env = Monitor(model.env,
                                directory='./videos',
                                video_callable=lambda episode_id: True,
                                write_upon_reset=True,
                                force=True)
        while (5):
            reward, steps_taken = simulate(model,
                                           train_mode=False,
                                           render_mode=render_mode,
                                           num_episode=1,
                                           max_len=max_length,
                                           generate_data_mode=generate_data_mode)
            print("terminal reward", reward, "average steps taken", np.mean(steps_taken) + 1)
def batch_evaluate(agent,
                   env_name,
                   seed,
                   episodes,
                   return_obss_actions=False,
                   pixel=False,
                   monitor_gym=False,
                   pairs_dict=None,
                   model_path=None):
    num_envs = min(256, episodes)

    envs = []
    for i in range(num_envs):
        if '_c' in env_name:
            env = gym.make(env_name, pairs_dict=pairs_dict, test_mode=True)
        else:
            env = gym.make(env_name)
        if pixel:
            env = RGBImgPartialObsWrapper(env)
        if monitor_gym:
            demo_path = os.path.join(model_path, 'gym_demos')
            if not i % 64:
                env = Monitor(env,
                              demo_path,
                              video_callable=lambda episode_id: episode_id == 1,
                              force=True)
            else:
                env = Monitor(env, demo_path, video_callable=False, force=True)
        envs.append(env)

    env = ManyEnvs(envs)

    logs = {
        "num_frames_per_episode": [],
        "return_per_episode": [],
        "observations_per_episode": [],
        "actions_per_episode": [],
        "seed_per_episode": [],
        "seen_missions": [env.mission for env in envs]
    }

    for i in range((episodes + num_envs - 1) // num_envs):
        seeds = range(seed + i * num_envs, seed + (i + 1) * num_envs)
        env.seed(seeds)

        many_obs = env.reset()

        cur_num_frames = 0
        num_frames = np.zeros((num_envs,), dtype='int64')
        returns = np.zeros((num_envs,))
        already_done = np.zeros((num_envs,), dtype='bool')

        if return_obss_actions:
            obss = [[] for _ in range(num_envs)]
            actions = [[] for _ in range(num_envs)]

        while (num_frames == 0).any():
            action = agent.act_batch(many_obs)['action']
            if return_obss_actions:
                for i in range(num_envs):
                    if not already_done[i]:
                        obss[i].append(many_obs[i])
                        actions[i].append(action[i].item())
            many_obs, reward, done, _ = env.step(action)
            agent.analyze_feedback(reward, done)
            done = np.array(done)
            just_done = done & (~already_done)
            returns += reward * just_done
            cur_num_frames += 1
            num_frames[just_done] = cur_num_frames
            already_done[done] = True

        logs["num_frames_per_episode"].extend(list(num_frames))
        logs["return_per_episode"].extend(list(returns))
        logs["seed_per_episode"].extend(list(seeds))
        if return_obss_actions:
            logs["observations_per_episode"].extend(obss)
            logs["actions_per_episode"].extend(actions)

    return logs
def deep_q_learning(sess,
                    env,
                    q_estimator,
                    target_estimator,
                    state_processor,
                    num_episodes,
                    experiment_dir,
                    replay_memory_size=500000,
                    replay_memory_init_size=50000,
                    update_target_estimator_every=10000,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=500000,
                    batch_size=32,
                    record_video_every=50):
    """
    Q-Learning algorithm for off-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        sess: Tensorflow Session object
        env: OpenAI environment
        q_estimator: Estimator object used for the q values
        target_estimator: Estimator object used for the targets
        state_processor: A StateProcessor object
        num_episodes: Number of episodes to run for
        experiment_dir: Directory to save Tensorflow summaries in
        replay_memory_size: Size of the replay memory
        replay_memory_init_size: Number of random experiences to sample when initializing
            the replay memory.
        update_target_estimator_every: Copy parameters from the Q estimator to the
            target estimator every N steps
        discount_factor: Gamma discount factor
        epsilon_start: Chance to sample a random action when taking an action.
            Epsilon is decayed over time and this is the start value
        epsilon_end: The final minimum value of epsilon after decaying is done
        epsilon_decay_steps: Number of steps to decay epsilon over
        batch_size: Size of batches to sample from the replay memory
        record_video_every: Record a video every N episodes

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """
    Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])

    # The replay memory
    replay_memory = []

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # Create directories for checkpoints and summaries
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(experiment_dir, "monitor")

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)

    saver = tf.train.Saver()
    # Load a previous checkpoint if we find one
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint {}...\n".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)

    total_t = sess.run(tf.contrib.framework.get_global_step())

    # The epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    # The policy we're following
    policy = make_epsilon_greedy_policy(q_estimator, len(VALID_ACTIONS))

    # Populate the replay memory with initial experience
    print("Populating replay memory...")
    state = env.reset()
    state = state_processor.process(sess, state)
    state = np.stack([state] * 4, axis=2)
    for i in range(replay_memory_init_size):
        action_probs = policy(sess, state, epsilons[min(total_t, epsilon_decay_steps - 1)])
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
        next_state = state_processor.process(sess, next_state)
        next_state = np.append(state[:, :, 1:], np.expand_dims(next_state, 2), axis=2)
        replay_memory.append(Transition(state, action, reward, next_state, done))
        if done:
            state = env.reset()
            state = state_processor.process(sess, state)
            state = np.stack([state] * 4, axis=2)
        else:
            state = next_state

    # Record videos
    # Use the gym env Monitor wrapper
    env = Monitor(env,
                  directory=monitor_path,
                  resume=True,
                  video_callable=lambda count: count % record_video_every == 0)

    for i_episode in range(num_episodes):

        # Save the current checkpoint
        saver.save(tf.get_default_session(), checkpoint_path)

        # Reset the environment
        state = env.reset()
        state = state_processor.process(sess, state)
        state = np.stack([state] * 4, axis=2)
        loss = None

        # One step in the environment
        for t in itertools.count():

            # Epsilon for this time step
            epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]

            # Add epsilon to Tensorboard
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=epsilon, tag="epsilon")
            q_estimator.summary_writer.add_summary(episode_summary, total_t)

            # Maybe update the target estimator
            if total_t % update_target_estimator_every == 0:
                copy_model_parameters(sess, q_estimator, target_estimator)
                print("\nCopied model parameters to target network.")

            # Print out which step we're on, useful for debugging.
            print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(
                t, total_t, i_episode + 1, num_episodes, loss), end="")
            sys.stdout.flush()

            # Take a step
            action_probs = policy(sess, state, epsilon)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
            next_state = state_processor.process(sess, next_state)
            next_state = np.append(state[:, :, 1:], np.expand_dims(next_state, 2), axis=2)

            # If our replay memory is full, pop the first element
            if len(replay_memory) == replay_memory_size:
                replay_memory.pop(0)

            # Save transition to replay memory
            replay_memory.append(Transition(state, action, reward, next_state, done))

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # Sample a minibatch from the replay memory
            samples = random.sample(replay_memory, batch_size)
            states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(np.array, zip(*samples))

            # Calculate q values and targets (Double DQN)
            q_values_next = q_estimator.predict(sess, next_states_batch)
            best_actions = np.argmax(q_values_next, axis=1)
            q_values_next_target = target_estimator.predict(sess, next_states_batch)
            targets_batch = reward_batch + np.invert(done_batch).astype(np.float32) * \
                discount_factor * q_values_next_target[np.arange(batch_size), best_actions]

            # Perform gradient descent update
            states_batch = np.array(states_batch)
            loss = q_estimator.update(sess, states_batch, action_batch, targets_batch)

            if done:
                break

            state = next_state
            total_t += 1

        # Add summaries to tensorboard
        episode_summary = tf.Summary()
        episode_summary.value.add(simple_value=stats.episode_rewards[i_episode],
                                  node_name="episode_reward",
                                  tag="episode_reward")
        episode_summary.value.add(simple_value=stats.episode_lengths[i_episode],
                                  node_name="episode_length",
                                  tag="episode_length")
        q_estimator.summary_writer.add_summary(episode_summary, total_t)
        q_estimator.summary_writer.flush()

        yield total_t, plotting.EpisodeStats(
            episode_lengths=stats.episode_lengths[:i_episode + 1],
            episode_rewards=stats.episode_rewards[:i_episode + 1])

    env.close()
    return stats
        die = done
        life = info['ale.lives']

        # If our replay memory is full, pop the first element
        replay_memory.append(Transition(state, action, reward, next_state, die))
        if done:
            state = env.reset()
            state = pre_proc(state)
            state = np.stack([state] * 4, axis=2)
            life = 0
        else:
            state = next_state
    print('Initialize replay buffer: done!')

    # Record videos
    env = Monitor(env,
                  directory=monitor_path,
                  resume=True,
                  video_callable=lambda count: count % record_video_every == 0)

    total_t = 0

    for i_episode in range(num_episodes):
        loss = None
        state = env.reset()
        state = pre_proc(state)
        state = np.stack([state] * 4, axis=2)
        life = 0

        # One step in the environment
        for t in itertools.count():
            # Choose random action if not yet start learning
            epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]
            action = select_epilson_greedy_action(q_estimator, state, epsilon)
            next_state, reward, done, info = env.step(action)
def make_training_env(cube_goal_pose, goal_difficulty, action_space, frameskip=1,
                      sim=False, visualization=False, reward_fn=None,
                      termination_fn=None, initializer=None, episode_length=120000,
                      residual=False, rank=0, monitor=False):
    is_level_4 = goal_difficulty == 4
    reward_fn = get_reward_fn(reward_fn)
    initializer = get_initializer(initializer)
    termination_fn = get_termination_fn(termination_fn)

    if action_space not in ['torque', 'position', 'torque_and_position', 'position_and_torque']:
        raise ValueError(f"Unknown action space: {action_space}.")
    if action_space == 'torque':
        action_type = ActionType.TORQUE
    elif action_space in ['torque_and_position', 'position_and_torque']:
        action_type = ActionType.TORQUE_AND_POSITION
    else:
        action_type = ActionType.POSITION

    env = RealRobotCubeEnv(cube_goal_pose,
                           goal_difficulty,
                           action_type=action_type,
                           frameskip=frameskip,
                           sim=sim,
                           visualization=visualization,
                           reward_fn=reward_fn,
                           termination_fn=termination_fn,
                           initializer=initializer,
                           episode_length=episode_length)
    env.seed(seed=rank)
    env.action_space.seed(seed=rank)
    if visualization:
        env = PyBulletClearGUIWrapper(env)
    if monitor:
        from gym.wrappers import Monitor
        from code.const import TMP_VIDEO_DIR
        env = Monitor(RenderWrapper(env),
                      TMP_VIDEO_DIR,
                      video_callable=lambda episode_id: True,
                      mode='evaluation')
    if residual:
        if action_space == 'torque':
            # env = JointConfInitializationWrapper(env, heuristic=grasp)
            env = ResidualLearningFCWrapper(env,
                                            apply_torques=is_level_4,
                                            is_level_4=is_level_4)
        elif action_space == 'torque_and_position':
            env = ResidualLearningMotionPlanningFCWrapper(
                env,
                apply_torques=is_level_4,
                action_repeat=2,
                align_goal_ori=is_level_4,
                use_rrt=is_level_4,
                init_cube_manip='flip_and_grasp' if is_level_4 else 'grasp',
                evaluation=False)
        else:
            raise ValueError(f"Can't do residual learning with {action_space}")
    env = FlatObservationWrapper(env)
    return env
                break
            strategy.update_step()
        scores_window.append(score)
        mean_score_window = np.mean(scores_window)
        scores.append(mean_score_window)
        episodes.set_description('Average Score = {:.2f}\tExploration rate = {:.2f}'.format(
            mean_score_window, strategy.get_exploration_rate(False)))
except:
    pass

print('\nSaving the model checkpoint as {}'.format(path))
saveModel(model, path)
scores = np.asarray(scores)
np.save('scores.npy', scores)
plt.plot(scores)
plt.show()
'''
env = Monitor(env, './video', video_callable=lambda episode_id: True, force=True)
state = env.reset()
while True:
    action = agent.act(state)
    next_state, reward, done, _ = env.step(action)
    agent.step(state, action, reward, next_state, done)
    state = next_state
    if done:
        break
env.close()
state = torch.cat(tuple([state] * 4), dim=1)


def get_action():
    sample = random.random()
    epsilon = 0.05
    if sample > epsilon:
        with torch.no_grad():
            return (policy_net(state.to(device)).max(1)[1].data[0]).to(torch.device("cpu"))
    else:
        return random.randrange(4)


env = Monitor(env, directory=monitor_path, video_callable=lambda count: count % 50 == 0, resume=True)

for i in [6]:
    print("Loading Checkpoint from dqn{}.model".format(i))
    checkpoint = torch.load("dqn{}.model".format(i))
    episode = checkpoint['episode']
    policy_net.load_state_dict(checkpoint['state_dict'])

    for i_episode in range(200):
        state = env.reset()
        state = process(state)
        state = torch.cat(tuple([state] * 4), dim=1)
        episode_reward = 0
        for t in count():
            action = get_action()
            next_state, reward, done, _ = env.step(action)
        action_state_value[[indices], [actions]] = next_action_state_value
        # self.model.fit(states, action_state_value, epochs=1, verbose=0)


####################################################################################################
# Run
File_Epsilon = open(str(FILE_EPSILON), 'a+')
File_Rewards = open(str(FILE_REWARDS), 'a+')

env = gym.make('LunarLander-v2')
if RECORD == True:
    env = Monitor(env=env, directory=PATH_VIDEO, force=True)
env.seed(0)

action_space = env.action_space.n
state_space = env.observation_space.shape[0]

agent = Agent(action_space, state_space)
if path.exists(PATH_WEIGHTS):
    agent.model.load_weights(PATH_WEIGHTS)

rewards = []
for episode in range(EPISODES):
    state = env.reset()
    state = np.reshape(state, (1, state_space))
    score = 0
num_steps = 0
state = env.reset()
state = process(state)
state = torch.cat(tuple([state] * 4), dim=1)


def get_action():
    sample = random.random()
    epsilon = 0.05
    if sample > epsilon:
        with torch.no_grad():
            return (policy_net(state.to(device)).max(1)[1].data[0]).to(torch.device("cpu"))
    else:
        return random.randrange(4)


env = Monitor(env, directory=monitor_path, video_callable=lambda count: count % 50 == 0, resume=True)

for i in [6]:
    print("Loading Checkpoint from dqn{}.model".format(i))
    checkpoint = torch.load("dqn{}.model".format(i))
    episode = checkpoint['episode']
    policy_net.load_state_dict(checkpoint['state_dict'])

    for i_episode in range(200):
        state = env.reset()
        state = process(state)
        state = torch.cat(tuple([state] * 4), dim=1)
        episode_reward = 0
        for t in count():
            action = get_action()
            next_state, reward, done, _ = env.step(action)
            num_steps += 1
def wrap_env(self, env: gym.Env):
    global logger
    logger = logging.getLogger(__name__)
    args = p.parse_args()
    if args.artificial_timelimit:
        logger.info('Wrapping with Timelimit')
        env = TimeLimit(env, max_episode_steps=args.artificial_timelimit)
    if not args.no_monitor:
        env = Monitor(
            env,
            osp.join(self.manager.logdir, 'openai_monitor'),
            video_callable=lambda ep_id: capped_quadratic_video_schedule(ep_id, args.monitor_video_freq),
            force=True,
            mode='evaluation' if args.eval_mode else 'training')
    if '-ramNoFrameskip-v4' in self.manager.env_id:
        # for playing atari from ram
        logger.info('Atari RAM env detected')
        logger.info('Wrapping with Fire Reset')
        env = FireResetEnv(env)
        if args.atari_episodic_life:
            logger.info('Wrapping with EpisodicLife')
            env = EpisodicLifeEnv(env)
        logger.info('Wrapping with NoopReset')
        env = NoopResetEnv(env, noop_max=args.atari_noop_max)
        logger.info('Wrapping with Frameskip')
        env = FrameSkipWrapper(env, skip=args.atari_frameskip)
        if args.framestack > 1:
            logger.info('Wrapping with Framestack')
            env = LinearFrameStackWrapper(env, k=args.framestack)
        if args.atari_clip_rewards:
            logger.info('Wrapping with ClipRewards')
            env = ClipRewardEnv(env)
        self.frameskip = args.atari_frameskip
        self.framestack = args.framestack
    # Some Image obs environment
    elif isinstance(env.observation_space, gym.spaces.Box) and len(env.observation_space.shape) >= 2:
        if 'NoFrameskip-v4' in self.manager.env_id:
            logger.info('Atari env detected')
            logger.info('Wrapping with Fire Reset')
            env = FireResetEnv(env)
            logger.info('Wrapping with AtariPreprocessing')
            env = AtariPreprocessing(env,
                                     noop_max=args.atari_noop_max,
                                     frame_skip=args.atari_frameskip,
                                     terminal_on_life_loss=args.atari_episodic_life)
            logger.info('Wrapping with Framestack')
            env = FrameStack(env, args.atari_framestack)
            if args.atari_clip_rewards:
                logger.info('Wrapping with ClipRewards')
                env = ClipRewardEnv(env)
            self.frameskip = args.atari_frameskip
            self.framestack = args.atari_framestack
        else:
            logger.info('Some image based env detected')
            if args.frameskip > 1:
                logger.info('Wrapping with Frameskip')
                env = FrameSkipWrapper(env, skip=args.frameskip)
            if args.framestack > 1:
                logger.info('Wrapping with Framestack')
                env = FrameStack(env, args.framestack)
            self.frameskip = args.frameskip
            self.framestack = args.framestack
    else:
        if args.frameskip > 1:
            logger.info('Wrapping with Frameskip')
            env = FrameSkipWrapper(env, skip=args.frameskip)
        if args.framestack > 1:
            logger.info('Wrapping with Framestack')
            env = LinearFrameStackWrapper(env, k=args.framestack)
        self.frameskip = args.frameskip
        self.framestack = args.framestack
    return env
def main():
    startTime = time.time()
    env = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
    results_file = open("Results12.csv", 'a')
    agent = DDPG(env, results_file)
    env = Monitor(env, directory='experiments/' + ENV_NAME, force=True)

    results_file.write("Episodes Spent Training; " + str(TEST) + " Episode Eval Avg \n")
    for episode in range(EPISODES):
        state = env.reset()
        if (episode % 20 == 0):
            print("episode:", episode)
        # Train
        for step in range(env.spec.timestep_limit):
            action = agent.noise_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        # Testing:
        if (episode + 1) % 100 == 0 and episode > 100:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                for j in range(env.spec.timestep_limit):
                    env.render()
                    action = agent.action(state)  # direct action for test
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print('episode: ', episode, 'Evaluation Average Reward:', ave_reward)
            results_file.write(str(episode) + "; " + str(ave_reward) + "\n")

    results_file.write("Time Training (" + str(EPISODES) + "episodes);" + str(time.time() - startTime) + "\n")
    results_file.write("Evaluation Episode; Reward \n")
    for episode in range(100):
        total_reward = 0
        env.reset()
        state = env.env.env.set_test(episode)
        for j in range(env.spec.timestep_limit):
            action = agent.action(state)  # direct action for test
            state, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                break
        results_file.write(str(episode) + "; " + str(total_reward) + "\n")
    results_file.write("endExperiment\n\n")
    results_file.close()
    name=experiment_name,
    monitor_gym=True,
    save_code=True,
)
writer = SummaryWriter(f"/tmp/{experiment_name}")

# TRY NOT TO MODIFY: seeding
device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
env = gym.make(args.gym_id)
env = wrap_atari(env)
env = gym.wrappers.RecordEpisodeStatistics(env)  # records episode reward in `info['episode']['r']`
if args.capture_video:
    env = QValueVisualizationWrapper(env)
    env = Monitor(env, f"videos/{experiment_name}")
env = wrap_deepmind(
    env,
    clip_rewards=True,
    frame_stack=True,
    scale=False,
)
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.backends.cudnn.deterministic = args.torch_deterministic
env.seed(args.seed)
env.action_space.seed(args.seed)
env.observation_space.seed(args.seed)
# respect the default timelimit
assert isinstance(env.action_space,
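The RecordEpisodeStatistics wrapper used above stores the episodic return in the info dict once an episode ends. A minimal sketch of reading it back, with 'CartPole-v1' as an assumed stand-in environment id:

# Read the episodic return that gym.wrappers.RecordEpisodeStatistics adds to `info`.
import gym

env = gym.wrappers.RecordEpisodeStatistics(gym.make("CartPole-v1"))
obs = env.reset()
done = False
while not done:
    obs, reward, done, info = env.step(env.action_space.sample())
# At episode end the wrapper adds return, length and time under info['episode'].
print("episodic return:", info["episode"]["r"])
env.close()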
def wrap_env(env):
    env = Monitor(env, './video', force=True)
    return env
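A typical use of wrap_env is to record a single rollout with random actions; 'CartPole-v0' is only an assumed example id, and the resulting .mp4 lands in ./video:

env = wrap_env(gym.make("CartPole-v0"))
observation = env.reset()
done = False
while not done:
    observation, reward, done, info = env.step(env.action_space.sample())
env.close()  # closing the Monitor finalizes the video file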
class PolicyMonitor(object):
    """
    Helps evaluate a policy by running an episode in an environment,
    saving a video, and plotting summaries to Tensorboard.

    Args:
        env: environment to run in
        policy_net: A policy estimator
        summary_writer: a tf.train.SummaryWriter used to write Tensorboard summaries
    """
    def __init__(self, env, policy_net, summary_writer, saver=None):

        self.video_dir = os.path.join(summary_writer.get_logdir(), "../videos")
        self.video_dir = os.path.abspath(self.video_dir)

        self.env = Monitor(env, directory=self.video_dir, video_callable=lambda x: True, resume=True)
        self.global_policy_net = policy_net
        self.summary_writer = summary_writer
        self.saver = saver
        self.sp = StateProcessor()

        self.checkpoint_path = os.path.abspath(os.path.join(summary_writer.get_logdir(), "../checkpoints/model"))

        try:
            os.makedirs(self.video_dir)
        except FileExistsError:
            pass

        # Local policy net
        with tf.variable_scope("policy_eval"):
            self.policy_net = PolicyEstimator(policy_net.num_outputs)

        # Op to copy params from global policy/value net parameters
        self.copy_params_op = make_copy_params_op(
            tf.contrib.slim.get_variables(scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES),
            tf.contrib.slim.get_variables(scope="policy_eval", collection=tf.GraphKeys.TRAINABLE_VARIABLES))

    def _policy_net_predict(self, state, sess):
        feed_dict = {self.policy_net.states: [state]}
        preds = sess.run(self.policy_net.predictions, feed_dict)
        return preds["probs"][0]

    def eval_once(self, sess):
        with sess.as_default(), sess.graph.as_default():
            # Copy params to local model
            global_step, _ = sess.run([tf.contrib.framework.get_global_step(), self.copy_params_op])

            # Run an episode
            done = False
            state = atari_helpers.atari_make_initial_state(self.sp.process(self.env.reset()))
            total_reward = 0.0
            episode_length = 0
            while not done:
                action_probs = self._policy_net_predict(state, sess)
                action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
                next_state, reward, done, _ = self.env.step(action)
                next_state = atari_helpers.atari_make_next_state(state, self.sp.process(next_state))
                total_reward += reward
                episode_length += 1
                state = next_state

            # Add summaries
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=total_reward, tag="eval/total_reward")
            episode_summary.value.add(simple_value=episode_length, tag="eval/episode_length")
            self.summary_writer.add_summary(episode_summary, global_step)
            self.summary_writer.flush()

            if self.saver is not None:
                self.saver.save(sess, self.checkpoint_path)

            tf.logging.info("Eval results at step {}: total_reward {}, episode_length {}".format(global_step, total_reward, episode_length))

            return total_reward, episode_length

    def continuous_eval(self, eval_every, sess, coord):
        """
        Continuously evaluates the policy every [eval_every] seconds.
        """
        try:
            while not coord.should_stop():
                self.eval_once(sess)
                # Sleep until next evaluation cycle
                time.sleep(eval_every)
        except tf.errors.CancelledError:
            return
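PolicyMonitor.continuous_eval is meant to run alongside the training workers. A hedged sketch of starting it on its own thread, where policy_net, summary_writer, saver, sess, and the environment id are assumed from the surrounding training script:

# Run the evaluation loop on a separate thread; names other than PolicyMonitor
# itself are assumed to be defined by the surrounding training script.
import threading

coord = tf.train.Coordinator()
pe = PolicyMonitor(env=gym.make("Breakout-v0"),
                   policy_net=policy_net,
                   summary_writer=summary_writer,
                   saver=saver)
monitor_thread = threading.Thread(
    target=lambda: pe.continuous_eval(eval_every=300, sess=sess, coord=coord))
monitor_thread.start()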
def convertir_env(self, env):
    self.env = Monitor(env, './video', force=True)
    return self.env
class Environment(object):

    def __init__(self, game, record=False, width=84, height=84, seed=0):
        self.game = gym.make(game)
        self.game.seed(seed)

        if record:
            self.game = Monitor(self.game, './video', force=True)

        self.width = width
        self.height = height
        self._toTensor = T.Compose([T.ToPILImage(), T.ToTensor()])
        gym_ple  # reference the gym_ple import; importing it registers the PLE environments with gym

    def play_sample(self, mode: str = 'human'):
        observation = self.game.reset()

        while True:
            screen = self.game.render(mode=mode)
            if mode == 'rgb_array':
                screen = self.preprocess(screen)
            action = self.game.action_space.sample()
            observation, reward, done, info = self.game.step(action)
            if done:
                break
        self.game.close()

    def preprocess(self, screen):
        preprocessed: np.array = cv2.resize(screen, (self.height, self.width))  # resize to 84 * 84
        preprocessed = np.dot(preprocessed[..., :3], [0.299, 0.587, 0.114])  # convert to grayscale
        # preprocessed: np.array = preprocessed.transpose((2, 0, 1))  # change to (C, W, H)
        preprocessed: np.array = preprocessed.astype('float32') / 255.
        return preprocessed

    def init(self):
        """
        @return observation
        """
        return self.game.reset()

    def get_screen(self):
        screen = self.game.render('rgb_array')
        screen = self.preprocess(screen)
        return screen

    def step(self, action: int):
        observation, reward, done, info = self.game.step(action)
        return observation, reward, done, info

    def reset(self):
        """
        :return: observation array
        """
        observation = self.game.reset()
        observation = self.preprocess(observation)
        return observation

    @property
    def action_space(self):
        return self.game.action_space.n
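A small usage sketch for the Environment wrapper above; 'FlappyBird-v0' (registered by gym_ple) is only an assumed example id, and np is the numpy import already used by the class:

env = Environment('FlappyBird-v0', record=True, seed=123)
observation = env.reset()                            # preprocessed 84x84 grayscale frame
for _ in range(100):
    action = np.random.randint(env.action_space)     # action_space is an int here
    observation, reward, done, info = env.step(action)
    if done:
        observation = env.reset()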
def set_video_dir(self, video_dir):
    self._env = Monitor(
        env=self._env,
        directory=video_dir,
        video_callable=lambda x: True
    )
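Recording every episode via video_callable=lambda x: True can get expensive; when only periodic videos are needed, the same hook can key off the episode id that Monitor passes to the callable. A hypothetical variant of the method above:

def set_video_dir_every(self, video_dir, every=10):
    # record only every `every`-th episode; Monitor calls video_callable
    # with the episode id
    self._env = Monitor(
        env=self._env,
        directory=video_dir,
        video_callable=lambda episode_id: episode_id % every == 0
    )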