Example #1
def test_video_callable_records_videos():
    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')
        env = Monitor(env, temp)
        env.reset()
        env.close()
        results = monitoring.load_results(temp)
        assert len(results['videos']) == 1, "Videos: {}".format(results['videos'])
Example #2
def test_monitor_filename():
    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')
        env = Monitor(env, directory=temp)
        env.close()

        manifests = glob.glob(os.path.join(temp, '*.manifest.*'))
        assert len(manifests) == 1
Example #3
def test_video_callable_false_does_not_record():
    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')
        env = Monitor(env, temp, video_callable=False)
        env.reset()
        env.close()
        results = monitoring.load_results(temp)
        assert len(results['videos']) == 0
Example #4
class OpenAIGym(Environment):

    def __init__(self, gym_id, monitor=None, monitor_safe=False, monitor_video=0):
        """
        Initialize OpenAI Gym.

        Args:
            gym_id: OpenAI Gym environment ID. See https://gym.openai.com/envs
            monitor: Output directory. Setting this to None disables monitoring.
            monitor_safe: Setting this to True prevents existing log files from being overwritten. Default False.
            monitor_video: Save a video every monitor_video episodes. Setting this to 0 disables recording of videos.
        """

        self.gym_id = gym_id
        self.gym = gym.make(gym_id)  # Might raise gym.error.UnregisteredEnv or gym.error.DeprecatedEnv

        if monitor:
            if monitor_video == 0:
                video_callable = False
            else:
                video_callable = (lambda x: x % monitor_video == 0)
            self.gym = Monitor(self.gym, monitor, force=not monitor_safe, video_callable=video_callable)

    def __str__(self):
        return 'OpenAIGym({})'.format(self.gym_id)

    def close(self):
        self.gym = None

    def reset(self):
        return self.gym.reset()

    def execute(self, action):
        if isinstance(self.gym.action_space, gym.spaces.Box):
            action = [action]  # some gym environments expect a list (f.i. Pendulum-v0)
        state, reward, terminal, _ = self.gym.step(action)
        return state, reward, terminal

    @property
    def states(self):
        if isinstance(self.gym.observation_space, Discrete):
            return dict(shape=(), type='float')
        else:
            return dict(shape=tuple(self.gym.observation_space.shape), type='float')

    @property
    def actions(self):
        if isinstance(self.gym.action_space, Discrete):
            return dict(continuous=False, num_actions=self.gym.action_space.n)
        elif len(self.gym.action_space.shape) == 1:
            return dict(continuous=True)
        elif len(self.gym.action_space.shape) > 1:
            return {'action' + str(n): dict(continuous=True) for n in range(len(self.gym.action_space.shape))}
        else:
            raise TensorForceError()

    def monitor(self, path):
        self.gym = Monitor(self.gym, path)
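A brief usage sketch for the wrapper above (not part of the original source): the environment id, output directory, and action value are illustrative assumptions, and monitor_video=10 records every 10th episode via the video_callable built in __init__.

env = OpenAIGym('CartPole-v0', monitor='/tmp/cartpole-monitor', monitor_video=10)  # illustrative id and path
state = env.reset()
state, reward, terminal = env.execute(action=0)  # CartPole is Discrete, so the action is passed through as-is
env.close()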
Example #5
class GymEnvironment(VideoCapableEnvironment):
    """
    Wraps an Open AI Gym environment
    """

    def __init__(self, env_name, state_builder=ALEStateBuilder(), repeat_action=4, no_op=30, monitoring_path=None):
        assert isinstance(state_builder, StateBuilder), 'state_builder should inherit from StateBuilder'
        assert isinstance(repeat_action, (int, tuple)), 'repeat_action should be int or tuple'
        if isinstance(repeat_action, int):
            assert repeat_action >= 1, "repeat_action should be >= 1"
        elif isinstance(repeat_action, tuple):
            assert len(repeat_action) == 2, 'repeat_action should be a length-2 tuple: (min frameskip, max frameskip)'
            assert repeat_action[0] < repeat_action[1], 'repeat_action[0] should be < repeat_action[1]'

        super(GymEnvironment, self).__init__()

        self._state_builder = state_builder
        self._env = gym.make(env_name)
        self._env.env.frameskip = repeat_action
        self._no_op = max(0, no_op)
        self._done = True

        if monitoring_path is not None:
            self._env = Monitor(self._env, monitoring_path, video_callable=need_record)

    @property
    def available_actions(self):
        return self._env.action_space.n

    @property
    def state(self):
        return None if self._state is None else self._state_builder(self._state)

    @property
    def lives(self):
        return self._env.env.ale.lives()

    @property
    def frame(self):
        return Image.fromarray(self._state)

    def do(self, action):
        self._state, self._reward, self._done, _ = self._env.step(action)
        self._score += self._reward
        return self.state, self._reward, self._done

    def reset(self):
        super(GymEnvironment, self).reset()

        self._state = self._env.reset()

        # Random number of initial no-op to introduce stochasticity
        if self._no_op > 0:
            for _ in six.moves.range(np.random.randint(1, self._no_op)):
                self._state, _, _, _ = self._env.step(0)

        return self.state
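A hedged usage sketch for the GymEnvironment wrapper above; the Atari environment id and the NOOP action index are assumptions, and monitoring is left disabled so the need_record callable (defined elsewhere in the original module) is not required.

env = GymEnvironment('BreakoutDeterministic-v4', repeat_action=4, no_op=30, monitoring_path=None)  # assumed Atari id
state = env.reset()              # also performs a random number of initial NOOPs
state, reward, done = env.do(0)  # action 0 is NOOP in ALE
print(env.lives, env.available_actions)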
Example #6
class GymEnvironment(Environment):
    def __init__(self, env_id, directory=None, force=True, monitor_video=0):
        super(GymEnvironment, self).__init__(env_id=env_id)
        self._env = gym.make(env_id)

        if directory:
            if monitor_video == 0:
                video_callable = False
            else:
                video_callable = (lambda x: x % monitor_video == 0)
            self._env = Monitor(self._env, directory, video_callable=video_callable, force=force)

    def __str__(self):
        return 'OpenAIGym({})'.format(self._env_id)

    def close(self):
        if not self._closed:
            self._env.close()
            self._closed = True

    def reset(self, return_spec=True):
        self._reset()
        state = self._env.reset()
        if return_spec:
            return EnvSpec(action=None, state=None, reward=0, done=False, next_state=state)
        return state

    def step(self, action, state, return_spec=True):
        self._step()
        if isinstance(action, (list, np.ndarray)):
            if isinstance(self._env.action_space, Discrete) or isinstance(action, (list, np.ndarray)):
                action = action[0]
        if isinstance(self._env.action_space, Box) and not isinstance(action, (list, np.ndarray)):
            action = list(action)
        next_state, reward, done, _ = self._env.step(action)
        if return_spec:
            return EnvSpec(
                action=action, state=state, reward=reward, done=done, next_state=next_state)
        return next_state, reward, done

    @property
    def num_states(self):
        return self._env.observation_space.shape[0]

    @property
    def num_actions(self):
        if isinstance(self._env.action_space, Box):
            return self._env.action_space.shape[0]
        else:
            return self._env.action_space.n

    @property
    def is_continuous(self):
        return not isinstance(self._env.action_space, Discrete)
Example #7
def test_write_upon_reset_false():
    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')
        env = Monitor(env, directory=temp, video_callable=False, write_upon_reset=False)
        env.reset()

        files = glob.glob(os.path.join(temp, '*'))
        assert not files, "Files: {}".format(files)

        env.close()
        files = glob.glob(os.path.join(temp, '*'))
        assert len(files) > 0
Example #8
  def __init__(self, env, policy_net, summary_writer, saver=None):

    self.video_dir = os.path.join(summary_writer.get_logdir(), "../videos")
    self.video_dir = os.path.abspath(self.video_dir)

    self.env = Monitor(env, directory=self.video_dir, video_callable=lambda x: True, resume=True)
    self.global_policy_net = policy_net
    self.summary_writer = summary_writer
    self.saver = saver
    self.sp = StateProcessor()

    self.checkpoint_path = os.path.abspath(os.path.join(summary_writer.get_logdir(), "../checkpoints/model"))

    try:
      os.makedirs(self.video_dir)
    except FileExistsError:
      pass

    # Local policy net
    with tf.variable_scope("policy_eval"):
      self.policy_net = PolicyEstimator(policy_net.num_outputs)

    # Op to copy params from global policy/value net parameters
    self.copy_params_op = make_copy_params_op(
      tf.contrib.slim.get_variables(scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES),
      tf.contrib.slim.get_variables(scope="policy_eval", collection=tf.GraphKeys.TRAINABLE_VARIABLES))
Example #9
def test_only_complete_episodes_written():
    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')
        env = Monitor(env, temp, video_callable=False)
        env.reset()
        d = False
        while not d:
            _, _, d, _ = env.step(env.action_space.sample())

        env.reset()
        env.step(env.action_space.sample())

        env.close()

        # Only 1 episode should be written
        results = monitoring.load_results(temp)
        assert len(results['episode_lengths']) == 1, "Found {} episodes written; expecting 1".format(len(results['episode_lengths']))
Example #10
def get_new_env(env_name, cmdl):
    """Configure the training environment and return an instance."""
    import logging
    import gym
    import gym_fast_envs  # noqa
    from gym.wrappers import Monitor

    # Undo the default logger and configure a new one.
    gym.undo_logger_setup()
    logger = logging.getLogger()
    logger.setLevel(logging.WARNING)

    # Configure environment
    outdir = '/tmp/nec/%s-results' % cmdl.label
    env = gym.make(env_name)
    env = Monitor(env, directory=outdir, force=True, video_callable=False)
    env.seed(cmdl.seed)
    return env
Example #11
def test_semisuper_succeeds():
    """Regression test. Ensure that this can write"""
    with helpers.tempdir() as temp:
        env = gym.make('SemisuperPendulumDecay-v0')
        env = Monitor(env, temp)
        env.reset()
        env.step(env.action_space.sample())
        env.close()
Example #12
    def __init__(self, env_id, directory=None, force=True, monitor_video=0):
        super(GymEnvironment, self).__init__(env_id=env_id)
        self._env = gym.make(env_id)

        if directory:
            if monitor_video == 0:
                video_callable = False
            else:
                video_callable = (lambda x: x % monitor_video == 0)
            self._env = Monitor(self._env, directory, video_callable=video_callable, force=force)
Example #13
def test_env_reuse():
    with helpers.tempdir() as temp:
        env = gym.make('Autoreset-v0')
        env = Monitor(env, temp)

        env.reset()

        _, _, done, _ = env.step(None)
        assert not done
        _, _, done, _ = env.step(None)
        assert done

        _, _, done, _ = env.step(None)
        assert not done
        _, _, done, _ = env.step(None)
        assert done

        env.close()
Example #14
def cart_pole_with_qlearning():
    from gym.wrappers import Monitor
    env = gym.make('CartPole-v0')
    experiment_filename = './cartpole-experiment-1'
    env = Monitor(env, experiment_filename, force=True)
    observation = env.reset()

    goal_average_steps = 195
    max_number_of_steps = 200
    number_of_iterations_to_average = 100

    number_of_features = env.observation_space.shape[0]
    last_time_steps = np.ndarray(0)

    cart_position_bins = pd.cut([-2.4, 2.4], bins=10, retbins=True)[1][1:-1]
    pole_angle_bins = pd.cut([-2, 2], bins=10, retbins=True)[1][1:-1]
    cart_velocity_bins = pd.cut([-1, 1], bins=10, retbins=True)[1][1:-1]
    angle_rate_bins = pd.cut([-3.5, 3.5], bins=10, retbins=True)[1][1:-1]

    learner = QLearner(state_discretization=Binning([[-2.4, 2.4], [-2, 2], [-1., 1], [-3.5, 3.5]], [10] * 4),
                       discrete_actions=[i for i in range(env.action_space.n)],
                       alpha=0.2,
                       gamma=1,
                       random_action_rate=0.5,
                       random_action_decay_rate=0.99)

    for episode in range(50000):
        action = learner.set_initial_state(observation)

        for step in range(max_number_of_steps - 1):
            observation, reward, done, info = env.step(action)

            if done:
                reward = -200
                observation = env.reset()

            action = learner.move(observation, reward)

            if done:
                last_time_steps = np.append(last_time_steps, [int(step + 1)])
                if len(last_time_steps) > number_of_iterations_to_average:
                    last_time_steps = np.delete(last_time_steps, 0)
                break

        if last_time_steps.mean() > goal_average_steps:
            print "Goal reached!"
            print "Episodes before solve: ", episode + 1
            print u"Best 100-episode performance {} {} {}".format(last_time_steps.max(),
                                                                  unichr(177),  # plus minus sign
                                                                  last_time_steps.std())
            break

    env.close()
Example #15
def test_write_upon_reset_true():
    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')

        # TODO: Fix Cartpole to not configure itself automatically
        # assert not env._configured
        env = Monitor(env, directory=temp, video_callable=False, write_upon_reset=True)
        env.configure()
        env.reset()

        files = glob.glob(os.path.join(temp, '*'))
        assert len(files) > 0, "Files: {}".format(files)

        env.close()
        files = glob.glob(os.path.join(temp, '*'))
        assert len(files) > 0
Example #16
def test_steps_limit_restart():
    with helpers.tempdir() as temp:
        env = gym.make('test.StepsLimitCartpole-v0')
        env = Monitor(env, temp, video_callable=False)
        env.reset()

        # Episode has started
        _, _, done, info = env.step(env.action_space.sample())
        assert done == False

        # Limit reached, now we get a done signal and the env resets itself
        _, _, done, info = env.step(env.action_space.sample())
        assert done == True
        assert env.episode_id == 1

        env.close()
Example #17
    def __init__(self, env_name, state_builder=ALEStateBuilder(), repeat_action=4, no_op=30, monitoring_path=None):
        assert isinstance(state_builder, StateBuilder), 'state_builder should inherit from StateBuilder'
        assert isinstance(repeat_action, (int, tuple)), 'repeat_action should be int or tuple'
        if isinstance(repeat_action, int):
            assert repeat_action >= 1, "repeat_action should be >= 1"
        elif isinstance(repeat_action, tuple):
            assert len(repeat_action) == 2, 'repeat_action should be a length-2 tuple: (min frameskip, max frameskip)'
            assert repeat_action[0] < repeat_action[1], 'repeat_action[0] should be < repeat_action[1]'

        super(GymEnvironment, self).__init__()

        self._state_builder = state_builder
        self._env = gym.make(env_name)
        self._env.env.frameskip = repeat_action
        self._no_op = max(0, no_op)
        self._done = True

        if monitoring_path is not None:
            self._env = Monitor(self._env, monitoring_path, video_callable=need_record)
Example #18
    def evaluate(self, n_games=1, save_path="./records", use_monitor=True, record_video=True, verbose=True,
                 t_max=100000):
        """Plays an entire game start to end, records the logs(and possibly mp4 video), returns reward.

        :param save_path: where to save the report
        :param record_video: if True, records mp4 video
        :return: total reward (scalar)
        """
        env = self.make_env()

        if not use_monitor and record_video:
            raise ValueError("Cannot record video without the gym monitor. If you still want video, set use_monitor to True")

        if record_video:
            env = Monitor(env, save_path, force=True)
        elif use_monitor:
            env = Monitor(env, save_path, video_callable=lambda i: False, force=True)

        game_rewards = []
        for _ in range(n_games):
            # initial observation
            observation = env.reset()
            # initial memory
            prev_memories = [np.zeros((1,) + tuple(mem.output_shape[1:]),
                                      dtype=get_layer_dtype(mem))
                             for mem in self.agent.agent_states]

            t = 0
            total_reward = 0
            while True:

                res = self.agent_step(self.preprocess_observation(observation)[None, ...], *prev_memories)
                action, new_memories = res[0], res[1:]

                observation, reward, done, info = env.step(action[0])

                total_reward += reward
                prev_memories = new_memories

                if done or t >= t_max:
                    if verbose:
                        print("Episode finished after {} timesteps with reward={}".format(t + 1, total_reward))
                    break
                t += 1
            game_rewards.append(total_reward)

        env.close()
        del env
        return game_rewards
Example #19
    def __init__(self, gym_id, monitor=None, monitor_safe=False, monitor_video=0):
        """
        Initialize OpenAI Gym.

        Args:
            gym_id: OpenAI Gym environment ID. See https://gym.openai.com/envs
            monitor: Output directory. Setting this to None disables monitoring.
            monitor_safe: Setting this to True prevents existing log files from being overwritten. Default False.
            monitor_video: Save a video every monitor_video episodes. Setting this to 0 disables recording of videos.
        """

        self.gym_id = gym_id
        self.gym = gym.make(gym_id)  # Might raise gym.error.UnregisteredEnv or gym.error.DeprecatedEnv

        if monitor:
            if monitor_video == 0:
                video_callable = False
            else:
                video_callable = (lambda x: x % monitor_video == 0)
            self.gym = Monitor(self.gym, monitor, force=not monitor_safe, video_callable=video_callable)
Example #20
        updateTargetNetwork = 10000
        explorationRate = 1
        minibatch_size = 64
        learnStart = 64
        learningRate = 0.00025
        discountFactor = 0.99
        memorySize = 1000000
        network_inputs = 100
        network_outputs = 21
        network_structure = [300, 300]
        current_epoch = 0

        deepQ = DeepQ(network_inputs, network_outputs, memorySize,
                      discountFactor, learningRate, learnStart)
        deepQ.initNetworks(network_structure)
        env = Monitor(env, directory=outdir, force=True, write_upon_reset=True)
    else:
        # Load weights, monitor info and parameter info.
        # ADD TRY/CATCH for this else branch
        with open(params_json) as outfile:
            d = json.load(outfile)
            epochs = d.get('epochs')
            steps = d.get('steps')
            updateTargetNetwork = d.get('updateTargetNetwork')
            explorationRate = d.get('explorationRate')
            minibatch_size = d.get('minibatch_size')
            learnStart = d.get('learnStart')
            learningRate = d.get('learningRate')
            discountFactor = d.get('discountFactor')
            memorySize = d.get('memorySize')
            network_inputs = d.get('network_inputs')
Example #21
def test_no_monitor_reset_unless_done():
    def assert_reset_raises(env):
        errored = False
        try:
            env.reset()
        except error.Error:
            errored = True
        assert errored, "Env allowed a reset when it shouldn't have"

    with helpers.tempdir() as temp:
        # Make sure we can reset as we please without monitor
        env = gym.make('CartPole-v0')
        env.reset()
        env.step(env.action_space.sample())
        env.step(env.action_space.sample())
        env.reset()

        # can reset once as soon as we start
        env = Monitor(env, temp, video_callable=False)
        env.reset()

        # can reset multiple times in a row
        env.reset()
        env.reset()

        env.step(env.action_space.sample())
        env.step(env.action_space.sample())
        assert_reset_raises(env)

        # should allow resets after the episode is done
        d = False
        while not d:
            _, _, d, _ = env.step(env.action_space.sample())

        env.reset()
        env.reset()

        env.step(env.action_space.sample())
        assert_reset_raises(env)

        env.close()
Example #22
class EnvWrapper(object):
    """
    Small wrapper for gym atari environments.
    Responsible for preprocessing screens and holding on to a screen buffer
    of size buffer_size from which environment state is constructed.
    """
    def __init__(self, gym_env, buffer_size, video_dir=None):
        self.env = gym_env
        if video_dir is not None:
            self.env = Monitor(env=self.env, directory=video_dir, resume=True)
        self.buffer_size = buffer_size
        self.num_actions = gym_env.action_space.n
        # TBD: Workaround for pong and breakout actions
        # Agent available actions, such as LEFT, RIGHT, NOOP, etc...
        self.gym_actions = range(gym_env.action_space.n)
        # Screen buffer of size buffer_size to be able to build
        # state arrays of size [1, buffer_size, 84, 84]
        self.state_buffer = deque()

    def start_state(self):
        """
        Resets the atari game, clears the state buffer.
        """
        # Clear the state buffer
        self.state_buffer = deque()

        x_t = self.env.reset()
        x_t = self.get_preprocessed_frame(x_t)
        s_t = np.stack([x_t for i in range(self.buffer_size)], axis=0)

        for i in range(self.buffer_size - 1):
            self.state_buffer.append(x_t)
        return s_t

    def get_preprocessed_frame(self, observation):
        """
        0) Atari frames: 210 x 160
        1) Get image grayscale
        2) Rescale image 110 x 84
        3) Crop center 84 x 84 (you can crop top/bottom according to the game)
        """
        return resize(rgb2gray(observation), (110, 84),
                      mode='constant')[13:110 - 13, :]

    def step(self, action_index):
        """
        Executes an action in the gym environment.
        Builds current state (concatenation of buffer_size-1 previous
        frames and current one). Pops oldest frame, adds current frame to
        the state buffer. Returns current state.
        """

        #x_t1, r_t, terminal, info = self.env.step(self.gym_actions[action_index])
        x_t1, r_t, terminal, info = self.env.step(action_index)
        x_t1 = self.get_preprocessed_frame(x_t1)

        previous_frames = np.array(self.state_buffer)
        s_t1 = np.empty((self.buffer_size, 84, 84))
        s_t1[:self.buffer_size - 1, :] = previous_frames
        s_t1[self.buffer_size - 1] = x_t1

        # Pop the oldest frame, add the current frame to the queue
        self.state_buffer.popleft()
        self.state_buffer.append(x_t1)

        return s_t1, r_t, terminal, info
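A short usage sketch for EnvWrapper above; the environment id and buffer size are illustrative, and video recording is left off so no Monitor directory is needed.

wrapper = EnvWrapper(gym.make('BreakoutDeterministic-v4'), buffer_size=4)  # assumed Atari id
s0 = wrapper.start_state()               # stacked state of shape (4, 84, 84)
s1, r, terminal, info = wrapper.step(0)  # index into gym_actions; 0 is NOOP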
Example #23
File: example.py Project: cvik1/swat_e90
"""
example.py
"""

import numpy as np
import gym
from gym.wrappers import Monitor
import roboschool

# make the ant environment
env = gym.make("RoboschoolAnt-v1")
# make a monitor to record the video
monitor = Monitor(env, "randomAnt", force=True)
monitor.reset()

# run one episode of the random agent on Ant

done = False
while not done:
    _, _, done, _ = monitor.step(env.action_space.sample())
Example #24
 def monitor(self, path):
     self.gym = Monitor(self.gym, path)
Example #25
               sync_tensorboard=True,
               config=vars(args),
               name=experiment_name,
               monitor_gym=True,
               save_code=True)
    writer = SummaryWriter(f"/tmp/{experiment_name}")

# TRY NOT TO MODIFY: seeding
device = torch.device(
    'cuda' if torch.cuda.is_available() and args.cuda else 'cpu')
env = gym.make(args.gym_id)
env = wrap_atari(env)
env = gym.wrappers.RecordEpisodeStatistics(
    env)  # records episode reward in `info['episode']['r']`
if args.capture_video:
    env = Monitor(env, f'videos/{experiment_name}')
env = wrap_deepmind(
    env,
    clip_rewards=True,
    frame_stack=True,
    scale=False,
)
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.backends.cudnn.deterministic = args.torch_deterministic
env.seed(args.seed)
env.action_space.seed(args.seed)
env.observation_space.seed(args.seed)
# respect the default timelimit
assert isinstance(env.action_space,
Example #26
    for i in range(len_trajectory):
        p_guide = pyro.param("p_guide_{}".format(i),
                             torch.ones(4) / 4,
                             constraint=constraints.simplex)
        action = pyro.sample("action_{}".format(i), dist.Categorical(p_guide))
        prob_1 = convert_to_prob(transition(state_1, action), state_1)
        state_1 = unhash_state(
            pyro.sample("state_{}".format(i),
                        dist.Categorical(torch.tensor(prob_1))))


# Build an environment

# Create and record episode - remove Monitor statement if recording not desired
env = Monitor(gym.make('one-random-evader-v0'),
              './tmp/pursuit_evasion_infer_pursuer_vs_random_evader',
              force=True)

# Reset state
state_gym = env.reset()

current_state = state
while (current_state != final_state):
    print("############################")
    print("Inferring new set of actions")
    print("############################")
    print()
    pyro.clear_param_store()
    svi = pyro.infer.SVI(model=agent_model,
                         guide=agent_guide,
                         optim=pyro.optim.SGD({
Example #27
def wrap_env(env):
    env = Monitor(env, "/content/video", force=True)
    return env
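A minimal usage sketch for wrap_env above (the /content/video path suggests a Colab notebook); the environment id and the random-action loop are illustrative.

env = wrap_env(gym.make('CartPole-v0'))
obs = env.reset()
done = False
while not done:
    obs, reward, done, info = env.step(env.action_space.sample())
env.close()  # finalizes the recorded video and stats under /content/video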
Example #28
               config=vars(args),
               name=experiment_name,
               monitor_gym=True,
               save_code=True)
    writer = SummaryWriter(f"/tmp/{experiment_name}")

# TRY NOT TO MODIFY: seeding
device = torch.device(
    'cuda' if torch.cuda.is_available() and args.cuda else 'cpu')
env = gym.make(args.gym_id)
env = wrap_atari(env)
env = gym.wrappers.RecordEpisodeStatistics(
    env)  # records episode reward in `info['episode']['r']`
if args.capture_video:
    env = QValueVisualizationWrapper(env)
    env = Monitor(env, f'videos/{experiment_name}')
env = wrap_deepmind(
    env,
    clip_rewards=True,
    frame_stack=True,
    scale=False,
)
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.backends.cudnn.deterministic = args.torch_deterministic
env.seed(args.seed)
env.action_space.seed(args.seed)
env.observation_space.seed(args.seed)
# respect the default timelimit
assert isinstance(env.action_space,
Example #29
def main():

    global RENDER_DELAY

    parser = argparse.ArgumentParser(
        description=('Train policy on OpenAI Gym environment '
                     'using pepg, ses, openes, ga, cma'))
    parser.add_argument('gamename',
                        type=str,
                        help='robo_pendulum, robo_ant, robo_humanoid, etc.')
    parser.add_argument('-f',
                        '--filename',
                        type=str,
                        help='json filename',
                        default='none')
    parser.add_argument('-e',
                        '--eval_steps',
                        type=int,
                        default=100,
                        help='evaluate this number of step if final_mode')
    parser.add_argument('-s',
                        '--seed_start',
                        type=int,
                        default=0,
                        help='initial seed')
    parser.add_argument('-w',
                        '--single_weight',
                        type=float,
                        default=-100,
                        help='single weight parameter')
    parser.add_argument('--stdev',
                        type=float,
                        default=2.0,
                        help='standard deviation for weights')
    parser.add_argument(
        '--sweep',
        type=int,
        default=-1,
        help='sweep a set of weights from -2.0 to 2.0 sweep times.')
    parser.add_argument('--lo',
                        type=float,
                        default=-2.0,
                        help='low side of sweep.')
    parser.add_argument('--hi',
                        type=float,
                        default=2.0,
                        help='high side of sweep.')

    args = parser.parse_args()

    assert len(sys.argv) > 1, 'python model.py gamename path_to_model.json'

    gamename = args.gamename

    use_model = False

    game = config.games[gamename]

    filename = args.filename
    if filename != "none":
        use_model = True
        print("filename", filename)

    the_seed = args.seed_start

    model = make_model(game)
    print('model size', model.param_count)

    eval_steps = args.eval_steps
    single_weight = args.single_weight
    weight_stdev = args.stdev
    num_sweep = args.sweep
    sweep_lo = args.lo
    sweep_hi = args.hi

    model.make_env(render_mode=render_mode)

    if use_model:
        model.load_model(filename)
    else:
        if single_weight > -100:
            params = model.get_single_model_params(
                weight=single_weight - game.weight_bias)  # REMEMBER TO UNBIAS
            print("single weight value set to", single_weight)
        else:
            params = model.get_uniform_random_model_params(
                stdev=weight_stdev) - game.weight_bias
        model.set_model_params(params)

    if final_mode:
        if num_sweep > 1:
            the_weights = np.arange(
                sweep_lo, sweep_hi + (sweep_hi - sweep_lo) / num_sweep,
                (sweep_hi - sweep_lo) / num_sweep)
            for i in range(len(the_weights)):
                the_weight = the_weights[i]
                params = model.get_single_model_params(
                    weight=the_weight - game.weight_bias)  # REMEMBER TO UNBIAS
                model.set_model_params(params)
                rewards = []
                for i in range(eval_steps):
                    reward, steps_taken = simulate(model,
                                                   train_mode=False,
                                                   render_mode=False,
                                                   num_episode=1,
                                                   seed=the_seed + i)
                    rewards.append(reward[0])
                print("weight", the_weight, "average_reward", np.mean(rewards),
                      "standard_deviation", np.std(rewards))
        else:
            rewards = []
            for i in range(eval_steps):
                ''' random uniform params
        params = model.get_uniform_random_model_params(stdev=weight_stdev)-game.weight_bias
        model.set_model_params(params)
        '''
                reward, steps_taken = simulate(model,
                                               train_mode=False,
                                               render_mode=False,
                                               num_episode=1,
                                               seed=the_seed + i)
                print(i, reward)
                rewards.append(reward[0])
            print("seed", the_seed, "average_reward", np.mean(rewards),
                  "standard_deviation", np.std(rewards))
    else:
        if record_video:
            model.env = Monitor(model.env,
                                directory='/tmp/' + gamename,
                                video_callable=lambda episode_id: True,
                                write_upon_reset=True,
                                force=True)
        for i in range(1):
            reward, steps_taken = simulate(model,
                                           train_mode=False,
                                           render_mode=render_mode,
                                           num_episode=1,
                                           seed=the_seed + i)
            print("terminal reward", reward, "average steps taken",
                  np.mean(steps_taken) + 1)
Example #30
def wrap_gym_env(env):
    from gym.wrappers import Monitor
    env = Monitor(env, './video', force=True)
    return env
Example #31
 def monitor_start(self, instance_id, directory, force, resume):
     env = self._lookup_env(instance_id)
     self.envs[instance_id] = Monitor(env, directory, None, force, resume)
Example #32
                          "wb"))

# load model for testing
dqn.load_weights(
    '/home/am/Desktop/set_tests/final/duel_dqn_%d_%s_weights.h5f' %
    (scale, ENV_NAME))

# setting up monitoring tools to record the testing episodes
from gym import monitoring
from gym.wrappers import Monitor


def episode5(episode_id):
    if episode_id < 1:
        return True
    else:
        return False


#rec = StatsRecorder(env,"sarsa_1")
#rec.capture_frame()

temp = '/home/am/Desktop/set_tests/final/duel_dqn_%d_%s' % (scale, ENV_NAME)
env = Monitor(env, temp, force=True, video_callable=episode5)

# testing
dqn.test(env, nb_episodes=5, visualize=False, nb_max_episode_steps=2000)

env.close()
results = monitoring.load_results(temp)
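As a quick follow-up (not in the original source), the loaded results can be inspected; the keys used here are the same ones the test examples earlier in this listing rely on.

print("recorded videos:", results['videos'])
print("episodes written:", len(results['episode_lengths']))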
Example #33
def create_env(name_env, wrapped):
    env = gym.make(name_env)
    if wrapped:
        env = Monitor(env, './video', force=True)
    return env
Example #34
    states_v = torch.tensor(states)
    actions_v = torch.tensor(actions)
    rewards_v = torch.tensor(rewards)
    last_states_v = torch.tensor(last_states)
    last_state_q_v = net(last_states_v)
    best_last_q_v = torch.max(last_state_q_v, dim=1)[0]
    best_last_q_v[done_masks] = 0.0
    return states_v, actions_v, best_last_q_v * gamma + rewards_v


if __name__ == "__main__":

    device = torch.device("cpu")

    env = make_env(DEFAULT_ENV_NAME)
    env = Monitor(env, directory="mon", force=True)

    net = DQN(env.observation_space.shape[0], HIDDEN_SIZE, env.action_space.n).to(device)
    tgt_net = ptan.agent.TargetNet(net)

    writer = SummaryWriter(comment="-" + DEFAULT_ENV_NAME)
    print(net)

    selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=EPSILON_START)
    agent = ptan.agent.DQNAgent(net, selector)
    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=GAMMA)
    buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=REPLAY_SIZE)
    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

    total_rewards = []
    best_m_reward = None
Example #35
def deep_q_learning(sess,
	                env,
	                q_estimator,
	                target_estimator,
	                state_processor,
	                num_episodes,
	                experiment_dir,
	                replay_memory_size=500000,
	                replay_memory_init_size=50000,
	                update_target_estimator_every=10000,
	                discount_factor=0.99,
	                epsilon_start=1.0,
	                epsilon_end=0.1,
	                epsilon_decay_steps=500000,
	                batch_size=32,
	                record_video_every=50):
	"""
	Q-Learning algorithm for off-policy TD control using Function Approximation.
	Finds the optimal greedy policy while following an epsilon-greedy policy.

	Args:
	    sess: Tensorflow Session object
	    env: OpenAI environment
	    q_estimator: Estimator object used for the q values
	    target_estimator: Estimator object used for the targets
	    state_processor: A StateProcessor object
	    num_episodes: Number of episodes to run for
	    experiment_dir: Directory to save Tensorflow summaries in
	    replay_memory_size: Size of the replay memory
	    replay_memory_init_size: Number of random experiences to sample when initializing
	      the replay memory.
	    update_target_estimator_every: Copy parameters from the Q estimator to the 
	      target estimator every N steps
	    discount_factor: Gamma discount factor
	    epsilon_start: Chance to sample a random action when taking an action.
	      Epsilon is decayed over time and this is the start value
	    epsilon_end: The final minimum value of epsilon after decaying is done
	    epsilon_decay_steps: Number of steps to decay epsilon over
	    batch_size: Size of batches to sample from the replay memory
	    record_video_every: Record a video every N episodes

	Returns:
	    An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
	"""

	Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])

	# The replay memory
	replay_memory = []

	# Keeps track of useful statistics
	stats = plotting.EpisodeStats(
	    episode_lengths=np.zeros(num_episodes),
	    episode_rewards=np.zeros(num_episodes))

	# Create directories for checkpoints and summaries
	checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
	checkpoint_path = os.path.join(checkpoint_dir, "model")
	monitor_path = os.path.join(experiment_dir, "monitor")

	if not os.path.exists(checkpoint_dir):
	    os.makedirs(checkpoint_dir)
	if not os.path.exists(monitor_path):
	    os.makedirs(monitor_path)

	saver = tf.train.Saver()
	# Load a previous checkpoint if we find one
	latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
	if latest_checkpoint:
	    print("Loading model checkpoint {}...\n".format(latest_checkpoint))
	    saver.restore(sess, latest_checkpoint)

	# Get the current time step
	total_t = sess.run(tf.contrib.framework.get_global_step())

	# The epsilon decay schedule
	epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

	# The policy we're following
	policy = make_epsilon_greedy_policy(
	    q_estimator,
	    len(VALID_ACTIONS))

	# Populate the replay memory with initial experience
	print("Populating replay memory...")
	############################################################
	# YOUR CODE 1 : Populate replay memory!
	# Hints : use function "populate_replay_buffer"
	# about 1 line code
	replay_memory = populate_replay_buffer( sess, env, state_processor, replay_memory_init_size, VALID_ACTIONS, Transition, policy )
	
	

	# Record videos
	env= Monitor(env,
	             directory=monitor_path,
	             resume=True,
	             video_callable=lambda count: count % record_video_every == 0)

	for i_episode in range(num_episodes):
		# Save the current checkpoint
		saver.save(tf.get_default_session(), checkpoint_path)

		# Reset the environment
		state = env.reset()
		state = state_process(sess, state_processor, state)
		loss = None

		# One step in the environment
		for t in itertools.count():
			
			# Epsilon for this time step
			epsilon = epsilons[min(total_t, epsilon_decay_steps-1)]

			# Add epsilon to Tensorboard
			episode_summary = tf.Summary()
			episode_summary.value.add(simple_value=epsilon, tag="epsilon")
			q_estimator.summary_writer.add_summary(episode_summary, total_t)

			###########################################################
			# YOUR CODE 2: Target network update
			# Hints : use function  "copy_model_parameters"
			if total_t % update_target_estimator_every == 0:
				copy_model_parameters(sess, q_estimator, target_estimator)

			# Print out which step we're on, useful for debugging.
			print("\rStep {} ({}) @ Episode {}/{}, loss: {}  Memory Len {} ".format(
					t, total_t, i_episode + 1, num_episodes, loss, len(replay_memory)), end="")
			sys.stdout.flush()

			##############################################
			# YOUR CODE 3: Take a step in the environment
			# Hints 1 :  be careful to use function 'state_process' to deal the RPG state
			# Hints 2 :  you can see function "populate_replay_buffer()" 
			#				for detail about how to TAKE A STEP 
			# about 2 or 3 line codes
			action = np.random.choice(len(VALID_ACTIONS), p=policy(sess, state, epsilon))
			next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
			next_state = state_processor.process(sess, next_state)
			next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2)

			# If our replay memory is full, pop the first element
			if len(replay_memory) == replay_memory_size:
				replay_memory.pop(0)

			#############################
			# YOUR CODE 4: Save transition to replay memory
			#  Hints : you can see function 'populate_replay_buffer' for detail
			# about 1 or 2 line codes
			replay_memory.append( Transition( state, action, reward, next_state, done ) )
			

			# Update statistics
			stats.episode_rewards[i_episode] += reward
			stats.episode_lengths[i_episode] = t

			#########################################################
			# YOUR CODE 5: Sample a minibatch from the replay memory, 
			# hints: can use function "random.sample( replay_memory, batch_size )" to get minibatch
			# about 1-2 lines codes
			minibatch = np.array(random.sample(replay_memory, batch_size))
			state_batch, action_batch, reward_batch, next_state_batch, done_batch = map(np.array, zip(*minibatch))


			###########################################################
			# YOUR CODE 6: use minibatch sample to calculate q values and targets
			# Hints 1 : use function 'q_estimator.predict' to get q values
			# Hints 2 : use function 'target_estimator.predict' to get targets values
			#				remember 'targets = reward + gamma * max q( s, a' )'
			# about 2 line codes
			
			q_next = q_estimator.predict(sess,next_state_batch)
			a_max = np.argmax(q_next,axis=1)
			q_target = target_estimator.predict(sess,next_state_batch)
			
			done_batch = np.invert(done_batch).astype('float32')
			targets = reward_batch + done_batch * discount_factor * q_target[np.arange(batch_size),a_max]

			#print(done_batch,targets,q_target[np.arange(batch_size),a_max])

			################################################
			# YOUR CODE 7: Perform gradient descent update
			# hints : use function 'q_estimator.update'
			# about 1 line code
			loss = q_estimator.update(sess,state_batch, np.array(action_batch), targets)
			if done:
				break
			state = next_state
			total_t += 1

		# Add summaries to tensorboard
		episode_summary = tf.Summary()
		episode_summary.value.add(simple_value=stats.episode_rewards[i_episode], node_name="episode_reward", tag="episode_reward")
		episode_summary.value.add(simple_value=stats.episode_lengths[i_episode], node_name="episode_length", tag="episode_length")
		q_estimator.summary_writer.add_summary(episode_summary, total_t)
		q_estimator.summary_writer.flush()

		yield total_t, plotting.EpisodeStats(
			episode_lengths=stats.episode_lengths[:i_episode+1],
			episode_rewards=stats.episode_rewards[:i_episode+1])

	env.close()
	return stats
Example #36
def main(args):

    global RENDER_DELAY

    env_name = args.env_name
    filename = args.filename
    the_seed = args.seed
    final_mode = args.final_mode
    generate_data_mode = args.generate_data_mode
    render_mode = args.render_mode
    record_video = args.record_video
    max_length = args.max_length

    if env_name.startswith("bullet"):
        RENDER_DELAY = True

    use_model = False

    model = make_model()
    print('model size', model.param_count)

    model.make_env(env_name, render_mode=render_mode)

    if len(filename) > 0:
        model.load_model(filename)
    else:
        params = model.get_random_model_params(stdev=0.1)
        model.set_model_params(params)

    if final_mode:
        total_reward = 0.0
        np.random.seed(the_seed)
        model.env.seed(the_seed)

        for i in range(100):
            reward, steps_taken = simulate(model,
                                           train_mode=False,
                                           render_mode=False,
                                           num_episode=1,
                                           max_len=max_length,
                                           generate_data_mode=False)
            total_reward += reward[0]
            print("episode", i, "reward =", reward[0])
        print("seed", the_seed, "average_reward", total_reward / 100)
    else:
        if record_video:
            model.env = Monitor(model.env,
                                directory='./videos',
                                video_callable=lambda episode_id: True,
                                write_upon_reset=True,
                                force=True)
        while True:
            reward, steps_taken = simulate(
                model,
                train_mode=False,
                render_mode=render_mode,
                num_episode=1,
                max_len=max_length,
                generate_data_mode=generate_data_mode)
            print("terminal reward", reward, "average steps taken",
                  np.mean(steps_taken) + 1)
Example #37
def batch_evaluate(agent,
                   env_name,
                   seed,
                   episodes,
                   return_obss_actions=False,
                   pixel=False,
                   monitor_gym=False,
                   pairs_dict=None,
                   model_path=None):

    num_envs = min(256, episodes)

    envs = []
    for i in range(num_envs):
        if '_c' in env_name:
            env = gym.make(env_name, pairs_dict=pairs_dict, test_mode=True)
        else:
            env = gym.make(env_name)
        if pixel:
            env = RGBImgPartialObsWrapper(env)

        if monitor_gym:
            demo_path = os.path.join(model_path, 'gym_demos')

            if not i % 64:
                env = Monitor(
                    env,
                    demo_path,
                    video_callable=lambda episode_id: episode_id == 1,
                    force=True)
            else:
                env = Monitor(env, demo_path, video_callable=False, force=True)

        envs.append(env)
    env = ManyEnvs(envs)

    logs = {
        "num_frames_per_episode": [],
        "return_per_episode": [],
        "observations_per_episode": [],
        "actions_per_episode": [],
        "seed_per_episode": [],
        "seen_missions": [env.mission for env in envs]
    }

    for i in range((episodes + num_envs - 1) // num_envs):
        seeds = range(seed + i * num_envs, seed + (i + 1) * num_envs)
        env.seed(seeds)

        many_obs = env.reset()

        cur_num_frames = 0
        num_frames = np.zeros((num_envs, ), dtype='int64')
        returns = np.zeros((num_envs, ))
        already_done = np.zeros((num_envs, ), dtype='bool')
        if return_obss_actions:
            obss = [[] for _ in range(num_envs)]
            actions = [[] for _ in range(num_envs)]
        while (num_frames == 0).any():
            action = agent.act_batch(many_obs)['action']
            if return_obss_actions:
                for i in range(num_envs):
                    if not already_done[i]:
                        obss[i].append(many_obs[i])
                        actions[i].append(action[i].item())
            many_obs, reward, done, _ = env.step(action)
            agent.analyze_feedback(reward, done)
            done = np.array(done)
            just_done = done & (~already_done)
            returns += reward * just_done
            cur_num_frames += 1
            num_frames[just_done] = cur_num_frames
            already_done[done] = True

        logs["num_frames_per_episode"].extend(list(num_frames))
        logs["return_per_episode"].extend(list(returns))
        logs["seed_per_episode"].extend(list(seeds))
        if return_obss_actions:
            logs["observations_per_episode"].extend(obss)
            logs["actions_per_episode"].extend(actions)

    return logs
Example #38
def deep_q_learning(sess,
                    env,
                    q_estimator,
                    target_estimator,
                    state_processor,
                    num_episodes,
                    experiment_dir,
                    replay_memory_size=500000,
                    replay_memory_init_size=50000,
                    update_target_estimator_every=10000,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=500000,
                    batch_size=32,
                    record_video_every=50):
    """
    Q-Learning algorithm for off-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        sess: Tensorflow Session object
        env: OpenAI environment
        q_estimator: Estimator object used for the q values
        target_estimator: Estimator object used for the targets
        state_processor: A StateProcessor object
        num_episodes: Number of episodes to run for
        experiment_dir: Directory to save Tensorflow summaries in
        replay_memory_size: Size of the replay memory
        replay_memory_init_size: Number of random experiences to sample when initializing
          the replay memory.
        update_target_estimator_every: Copy parameters from the Q estimator to the 
          target estimator every N steps
        discount_factor: Gamma discount factor
        epsilon_start: Chance to sample a random action when taking an action.
          Epsilon is decayed over time and this is the start value
        epsilon_end: The final minimum value of epsilon after decaying is done
        epsilon_decay_steps: Number of steps to decay epsilon over
        batch_size: Size of batches to sample from the replay memory
        record_video_every: Record a video every N episodes

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    Transition = namedtuple(
        "Transition", ["state", "action", "reward", "next_state", "done"])

    # The replay memory
    replay_memory = []

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # Create directories for checkpoints and summaries
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(experiment_dir, "monitor")

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)

    saver = tf.train.Saver()
    # Load a previous checkpoint if we find one
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint {}...\n".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)

    total_t = sess.run(tf.contrib.framework.get_global_step())

    # The epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    # The policy we're following
    policy = make_epsilon_greedy_policy(q_estimator, len(VALID_ACTIONS))

    # Populate the replay memory with initial experience
    print("Populating replay memory...")
    state = env.reset()
    state = state_processor.process(sess, state)
    state = np.stack([state] * 4, axis=2)
    for i in range(replay_memory_init_size):
        action_probs = policy(sess, state,
                              epsilons[min(total_t, epsilon_decay_steps - 1)])
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
        next_state = state_processor.process(sess, next_state)
        next_state = np.append(state[:, :, 1:],
                               np.expand_dims(next_state, 2),
                               axis=2)
        replay_memory.append(
            Transition(state, action, reward, next_state, done))
        if done:
            state = env.reset()
            state = state_processor.process(sess, state)
            state = np.stack([state] * 4, axis=2)
        else:
            state = next_state

    # Record videos
    # Use the gym env Monitor wrapper
    env = Monitor(env,
                  directory=monitor_path,
                  resume=True,
                  video_callable=lambda count: count % record_video_every == 0)

    for i_episode in range(num_episodes):

        # Save the current checkpoint
        saver.save(tf.get_default_session(), checkpoint_path)

        # Reset the environment
        state = env.reset()
        state = state_processor.process(sess, state)
        state = np.stack([state] * 4, axis=2)
        loss = None

        # One step in the environment
        for t in itertools.count():

            # Epsilon for this time step
            epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]

            # Add epsilon to Tensorboard
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=epsilon, tag="epsilon")
            q_estimator.summary_writer.add_summary(episode_summary, total_t)

            # Maybe update the target estimator
            if total_t % update_target_estimator_every == 0:
                copy_model_parameters(sess, q_estimator, target_estimator)
                print("\nCopied model parameters to target network.")

            # Print out which step we're on, useful for debugging.
            print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(
                t, total_t, i_episode + 1, num_episodes, loss),
                  end="")
            sys.stdout.flush()

            # Take a step
            action_probs = policy(sess, state, epsilon)
            action = np.random.choice(np.arange(len(action_probs)),
                                      p=action_probs)
            next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
            next_state = state_processor.process(sess, next_state)
            next_state = np.append(state[:, :, 1:],
                                   np.expand_dims(next_state, 2),
                                   axis=2)

            # If our replay memory is full, pop the first element
            if len(replay_memory) == replay_memory_size:
                replay_memory.pop(0)

            # Save transition to replay memory
            replay_memory.append(
                Transition(state, action, reward, next_state, done))

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # Sample a minibatch from the replay memory
            samples = random.sample(replay_memory, batch_size)
            states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(
                np.array, zip(*samples))

            # Calculate q values and targets (Double DQN)
            q_values_next = q_estimator.predict(sess, next_states_batch)
            best_actions = np.argmax(q_values_next, axis=1)
            q_values_next_target = target_estimator.predict(
                sess, next_states_batch)
            targets_batch = reward_batch + np.invert(done_batch).astype(np.float32) * \
                discount_factor * q_values_next_target[np.arange(batch_size), best_actions]

            # Perform gradient descent update
            states_batch = np.array(states_batch)
            loss = q_estimator.update(sess, states_batch, action_batch,
                                      targets_batch)

            if done:
                break

            state = next_state
            total_t += 1

        # Add summaries to tensorboard
        episode_summary = tf.Summary()
        episode_summary.value.add(
            simple_value=stats.episode_rewards[i_episode],
            node_name="episode_reward",
            tag="episode_reward")
        episode_summary.value.add(
            simple_value=stats.episode_lengths[i_episode],
            node_name="episode_length",
            tag="episode_length")
        q_estimator.summary_writer.add_summary(episode_summary, total_t)
        q_estimator.summary_writer.flush()

        yield total_t, plotting.EpisodeStats(
            episode_lengths=stats.episode_lengths[:i_episode + 1],
            episode_rewards=stats.episode_rewards[:i_episode + 1])

    env.close()
    return stats
Example #39
        die = done
    life = info['ale.lives']
    # If our replay memory is full, pop the first element
    replay_memory.append(Transition(state, action, reward, next_state, die))
    if done:
        state = env.reset()
        state = pre_proc(state)
        state = np.stack([state] * 4, axis=2)
        life = 0
    else:
        state = next_state
print('Initialize replay buffer: done!')

# Record videos
env = Monitor(env,
              directory=monitor_path,
              resume=True,
              video_callable=lambda count: count % record_video_every == 0)
total_t = 0
for i_episode in range(num_episodes):
    loss = None
    state = env.reset()
    state = pre_proc(state)
    state = np.stack([state] * 4, axis=2)
    life = 0
    # One step in the environment
    for t in itertools.count():
        # Choose random action if not yet start learning
        epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]
        action = select_epilson_greedy_action(q_estimator, state, epsilon)

        next_state, reward, done, info = env.step(action)
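
The die/done bookkeeping at the start of this example is the usual Atari life-loss trick: a lost life is stored as terminal in the replay memory so bootstrapping stops there, while the emulator is only reset once the real episode ends. A minimal sketch of that decision, assuming the same info dict returned by env.step (the helper name is made up):

def life_loss_terminal(done, info, prev_lives):
    # Mark the stored transition as terminal when a life was lost,
    # without resetting the environment until `done` is really True.
    lives = info.get('ale.lives', prev_lives)
    die = True if lives < prev_lives else done
    return die, lives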
示例#40
0
def make_training_env(cube_goal_pose,
                      goal_difficulty,
                      action_space,
                      frameskip=1,
                      sim=False,
                      visualization=False,
                      reward_fn=None,
                      termination_fn=None,
                      initializer=None,
                      episode_length=120000,
                      residual=False,
                      rank=0,
                      monitor=False):
    is_level_4 = goal_difficulty == 4
    reward_fn = get_reward_fn(reward_fn)
    initializer = get_initializer(initializer)
    termination_fn = get_termination_fn(termination_fn)
    if action_space not in [
            'torque', 'position', 'torque_and_position', 'position_and_torque'
    ]:
        raise ValueError(f"Unknown action space: {action_space}.")
    if action_space == 'torque':
        action_type = ActionType.TORQUE
    elif action_space in ['torque_and_position', 'position_and_torque']:
        action_type = ActionType.TORQUE_AND_POSITION
    else:
        action_type = ActionType.POSITION
    env = RealRobotCubeEnv(cube_goal_pose,
                           goal_difficulty,
                           action_type=action_type,
                           frameskip=frameskip,
                           sim=sim,
                           visualization=visualization,
                           reward_fn=reward_fn,
                           termination_fn=termination_fn,
                           initializer=initializer,
                           episode_length=episode_length)
    env.seed(seed=rank)
    env.action_space.seed(seed=rank)
    if visualization:
        env = PyBulletClearGUIWrapper(env)
    if monitor:
        from gym.wrappers import Monitor
        from code.const import TMP_VIDEO_DIR
        env = Monitor(RenderWrapper(env),
                      TMP_VIDEO_DIR,
                      video_callable=lambda episode_id: True,
                      mode='evaluation')
    if residual:
        if action_space == 'torque':
            # env = JointConfInitializationWrapper(env, heuristic=grasp)
            env = ResidualLearningFCWrapper(env,
                                            apply_torques=is_level_4,
                                            is_level_4=is_level_4)
        elif action_space == 'torque_and_position':
            env = ResidualLearningMotionPlanningFCWrapper(
                env,
                apply_torques=is_level_4,
                action_repeat=2,
                align_goal_ori=is_level_4,
                use_rrt=is_level_4,
                init_cube_manip='flip_and_grasp' if is_level_4 else 'grasp',
                evaluation=False)
        else:
            raise ValueError(f"Can't do residual learning with {action_space}")
    env = FlatObservationWrapper(env)
    return env
示例#41
0
                break
        strategy.update_step()


        scores_window.append(score)
        mean_score_window = np.mean(scores_window)
        scores.append(mean_score_window)
        episodes.set_description('Average Score = {:.2f}\tExploration rate = {:.2f}'.format(mean_score_window, strategy.get_exploration_rate(False)))
except:
    pass

print('\nSaving the model checkpoint as {}'.format(path))
saveModel(model, path)
scores = np.asarray(scores)
np.save('scores.npy', scores)
plt.plot(scores)
plt.show()
'''
env = Monitor(env,
              './video',
              video_callable=lambda episode_id: True,
              force=True)
state = env.reset()
while True:
    action = agent.act(state)
    next_state, reward, done, _ = env.step(action)
    agent.step(state, action, reward, next_state, done)
    state = next_state
    if done: break

env.close()
示例#42
0
state = torch.cat(tuple([state] * 4), dim=1)


def get_action():
    sample = random.random()
    epsilon = 0.05
    if sample > epsilon:
        with torch.no_grad():
            return (policy_net(state.to(device)).max(1)[1].data[0]).to(
                torch.device("cpu"))
    else:
        return random.randrange(4)


env = Monitor(env,
              directory=monitor_path,
              video_callable=lambda count: count % 50 == 0,
              resume=True)
for i in [6]:
    print("Loading Checkpoint from dqn{}.model".format(i))
    checkpoint = torch.load("dqn{}.model".format(i))
    episode = checkpoint['episode']
    policy_net.load_state_dict(checkpoint['state_dict'])
    for i_episode in range(200):
        state = env.reset()
        state = process(state)
        state = torch.cat(tuple([state] * 4), dim=1)
        episode_reward = 0

        for t in count():
            action = get_action()
            next_state, reward, done, _ = env.step(action)
示例#43
0
            action_state_value[[indices], [actions]] = next_action_state_value

            #

            self.model.fit(states, action_state_value, epochs=1, verbose=0)


####################################################################################################
# Run

File_Epsilon = open(str(FILE_EPSILON), 'a+')
File_Rewards = open(str(FILE_REWARDS), 'a+')

env = gym.make('LunarLander-v2')
if RECORD:
    env = Monitor(env=env, directory=PATH_VIDEO, force=True)
env.seed(0)

action_space = env.action_space.n
state_space = env.observation_space.shape[0]
agent = Agent(action_space, state_space)
if path.exists(PATH_WEIGHTS):
    agent.model.load_weights(PATH_WEIGHTS)

rewards = []

for episode in range(EPISODES):
    state = env.reset()
    state = np.reshape(state, (1, state_space))

    score = 0
示例#44
0
num_steps = 0
state = env.reset()
state = process(state)
state = torch.cat(tuple([state] * 4), dim=1)

def get_action():
    sample = random.random()
    epsilon = 0.05
    if sample > epsilon:
        with torch.no_grad():
            return (policy_net(state.to(device)).max(1)[1].data[0]).to(torch.device("cpu"))
    else:
        return random.randrange(4)
    

env = Monitor(env, directory=monitor_path, video_callable=lambda count: count % 50 == 0, resume=True)
for i in [6]:
    print("Loading Checkpoint from dqn{}.model".format(i))
    checkpoint = torch.load("dqn{}.model".format(i))
    episode = checkpoint['episode']
    policy_net.load_state_dict(checkpoint['state_dict'])
    for i_episode in range(200):
        state = env.reset()
        state = process(state)
        state = torch.cat(tuple([state] * 4), dim=1)
        episode_reward = 0
           
        for t in count():         
            action = get_action()
            next_state, reward, done, _ = env.step(action)
            num_steps+=1
示例#45
0
 def wrap_env(self, env: gym.Env):
     global logger
     logger = logging.getLogger(__name__)
     args = p.parse_args()
     if args.artificial_timelimit:
         logger.info('Wrapping with Timelimit')
         env = TimeLimit(env, max_episode_steps=args.artificial_timelimit)
     if not args.no_monitor:
         env = Monitor(
             env,
             osp.join(self.manager.logdir, 'openai_monitor'),
             video_callable=lambda ep_id: capped_quadratic_video_schedule(
                 ep_id, args.monitor_video_freq),
             force=True,
             mode='evaluation' if args.eval_mode else 'training')
     if '-ramNoFrameskip-v4' in self.manager.env_id:  # for playing atari from ram
         logger.info('Atari RAM env detected')
         logger.info('Wrapping with Fire Reset')
         env = FireResetEnv(env)
         if args.atari_episodic_life:
             logger.info('Wrapping with EpisodicLife')
             env = EpisodicLifeEnv(env)
         logger.info('Wrapping with NoopReset')
         env = NoopResetEnv(env, noop_max=args.atari_noop_max)
         logger.info('Wrapping with Frameskip')
         env = FrameSkipWrapper(env, skip=args.atari_frameskip)
         if args.framestack > 1:
             logger.info('Wrapping with Framestack')
             env = LinearFrameStackWrapper(env, k=args.framestack)
         if args.atari_clip_rewards:
             logger.info('Wrapping with ClipRewards')
             env = ClipRewardEnv(env)
         self.frameskip = args.atari_frameskip
         self.framestack = args.framestack
     # Some Image obs environment
     elif isinstance(
             env.observation_space,
             gym.spaces.Box) and len(env.observation_space.shape) >= 2:
         if 'NoFrameskip-v4' in self.manager.env_id:
             logger.info('Atari env detected')
             logger.info('Wrapping with Fire Reset')
             env = FireResetEnv(env)
             logger.info('Wrapping with AtariPreprocessing')
             env = AtariPreprocessing(
                 env,
                 noop_max=args.atari_noop_max,
                 frame_skip=args.atari_frameskip,
                 terminal_on_life_loss=args.atari_episodic_life)
             logger.info('Wrapping with Framestack')
             env = FrameStack(env, args.atari_framestack)
             if args.atari_clip_rewards:
                 logger.info('Wrapping with ClipRewards')
                 env = ClipRewardEnv(env)
             self.frameskip = args.atari_frameskip
             self.framestack = args.atari_framestack
         else:
             logger.info('Some image based env detected')
             if args.frameskip > 1:
                 logger.info('Wrapping with Frameskip')
                 env = FrameSkipWrapper(env, skip=args.frameskip)
             if args.framestack > 1:
                 logger.info('Wrapping with Framestack')
                 env = FrameStack(env, args.framestack)
             self.frameskip = args.frameskip
             self.framestack = args.framestack
     else:
         if args.frameskip > 1:
             logger.info('Wrapping with Frameskip')
             env = FrameSkipWrapper(env, skip=args.frameskip)
         if args.framestack > 1:
             logger.info('Wrapping with Framestack')
             env = LinearFrameStackWrapper(env, k=args.framestack)
         self.frameskip = args.frameskip
         self.framestack = args.framestack
     return env
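
capped_quadratic_video_schedule above is a project-specific helper; gym ships a similar capped_cubic_video_schedule (record early episodes whose id is a perfect cube, then every 1000th). A hedged sketch of what the quadratic variant used here could look like, purely for illustration:

def capped_quadratic_video_schedule(episode_id, freq=1000):
    # Record perfect-square episode ids while they are still frequent,
    # then fall back to recording one episode every `freq` episodes.
    if episode_id < freq:
        return int(round(episode_id ** 0.5)) ** 2 == episode_id
    return episode_id % freq == 0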
示例#46
0
def deep_q_learning(sess,
                    env,
                    q_estimator,
                    target_estimator,
                    state_processor,
                    num_episodes,
                    experiment_dir,
                    replay_memory_size=500000,
                    replay_memory_init_size=50000,
                    update_target_estimator_every=10000,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=500000,
                    batch_size=32,
                    record_video_every=50):
    """
    Q-Learning algorithm for off-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        sess: Tensorflow Session object
        env: OpenAI environment
        q_estimator: Estimator object used for the q values
        target_estimator: Estimator object used for the targets
        state_processor: A StateProcessor object
        num_episodes: Number of episodes to run for
        experiment_dir: Directory to save Tensorflow summaries in
        replay_memory_size: Size of the replay memory
        replay_memory_init_size: Number of random experiences to sample when initializing
          the replay memory.
        update_target_estimator_every: Copy parameters from the Q estimator to the 
          target estimator every N steps
        discount_factor: Gamma discount factor
        epsilon_start: Chance to sample a random action when taking an action.
          Epsilon is decayed over time and this is the start value
        epsilon_end: The final minimum value of epsilon after decaying is done
        epsilon_decay_steps: Number of steps to decay epsilon over
        batch_size: Size of batches to sample from the replay memory
        record_video_every: Record a video every N episodes

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])

    # The replay memory
    replay_memory = []

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    # Create directories for checkpoints and summaries
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(experiment_dir, "monitor")

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)

    saver = tf.train.Saver()
    # Load a previous checkpoint if we find one
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint {}...\n".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)

    total_t = sess.run(tf.contrib.framework.get_global_step())

    # The epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    # The policy we're following
    policy = make_epsilon_greedy_policy(
        q_estimator,
        len(VALID_ACTIONS))

    # Populate the replay memory with initial experience
    print("Populating replay memory...")
    state = env.reset()
    state = state_processor.process(sess, state)
    state = np.stack([state] * 4, axis=2)
    for i in range(replay_memory_init_size):
        action_probs = policy(sess, state, epsilons[min(total_t, epsilon_decay_steps-1)])
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
        next_state = state_processor.process(sess, next_state)
        next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2)
        replay_memory.append(Transition(state, action, reward, next_state, done))
        if done:
            state = env.reset()
            state = state_processor.process(sess, state)
            state = np.stack([state] * 4, axis=2)
        else:
            state = next_state

    # Record videos
    # Use the gym env Monitor wrapper
    env = Monitor(env,
                  directory=monitor_path,
                  resume=True,
                  video_callable=lambda count: count % record_video_every == 0)

    for i_episode in range(num_episodes):

        # Save the current checkpoint
        saver.save(tf.get_default_session(), checkpoint_path)

        # Reset the environment
        state = env.reset()
        state = state_processor.process(sess, state)
        state = np.stack([state] * 4, axis=2)
        loss = None

        # One step in the environment
        for t in itertools.count():

            # Epsilon for this time step
            epsilon = epsilons[min(total_t, epsilon_decay_steps-1)]

            # Add epsilon to Tensorboard
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=epsilon, tag="epsilon")
            q_estimator.summary_writer.add_summary(episode_summary, total_t)

            # Maybe update the target estimator
            if total_t % update_target_estimator_every == 0:
                copy_model_parameters(sess, q_estimator, target_estimator)
                print("\nCopied model parameters to target network.")

            # Print out which step we're on, useful for debugging.
            print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(
                    t, total_t, i_episode + 1, num_episodes, loss), end="")
            sys.stdout.flush()

            # Take a step
            action_probs = policy(sess, state, epsilon)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
            next_state = state_processor.process(sess, next_state)
            next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2)

            # If our replay memory is full, pop the first element
            if len(replay_memory) == replay_memory_size:
                replay_memory.pop(0)

            # Save transition to replay memory
            replay_memory.append(Transition(state, action, reward, next_state, done))   

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # Sample a minibatch from the replay memory
            samples = random.sample(replay_memory, batch_size)
            states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(np.array, zip(*samples))

            # Calculate q values and targets (Double DQN)
            q_values_next = q_estimator.predict(sess, next_states_batch)
            best_actions = np.argmax(q_values_next, axis=1)
            q_values_next_target = target_estimator.predict(sess, next_states_batch)
            targets_batch = reward_batch + np.invert(done_batch).astype(np.float32) * \
                discount_factor * q_values_next_target[np.arange(batch_size), best_actions]

            # Perform gradient descent update
            states_batch = np.array(states_batch)
            loss = q_estimator.update(sess, states_batch, action_batch, targets_batch)

            if done:
                break

            state = next_state
            total_t += 1

        # Add summaries to tensorboard
        episode_summary = tf.Summary()
        episode_summary.value.add(simple_value=stats.episode_rewards[i_episode], node_name="episode_reward", tag="episode_reward")
        episode_summary.value.add(simple_value=stats.episode_lengths[i_episode], node_name="episode_length", tag="episode_length")
        q_estimator.summary_writer.add_summary(episode_summary, total_t)
        q_estimator.summary_writer.flush()

        yield total_t, plotting.EpisodeStats(
            episode_lengths=stats.episode_lengths[:i_episode+1],
            episode_rewards=stats.episode_rewards[:i_episode+1])

    env.close()
    return stats
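
Since deep_q_learning is a generator, a caller drives training by iterating over it. A minimal invocation sketch, assuming the Estimator and StateProcessor classes (and VALID_ACTIONS) defined elsewhere in this project; their constructor arguments, the environment id and the hyperparameters here are only examples:

tf.reset_default_graph()

env = gym.make("Breakout-v0")
experiment_dir = os.path.abspath("./experiments/{}".format(env.spec.id))

# deep_q_learning reads the global step, so create it before the session.
global_step = tf.Variable(0, name="global_step", trainable=False)
q_estimator = Estimator(scope="q", summaries_dir=experiment_dir)
target_estimator = Estimator(scope="target_q")
state_processor = StateProcessor()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for t, stats in deep_q_learning(sess, env, q_estimator, target_estimator,
                                    state_processor, num_episodes=10000,
                                    experiment_dir=experiment_dir,
                                    replay_memory_init_size=5000):
        print("\nEpisode reward: {}".format(stats.episode_rewards[-1]))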
示例#47
0
def main():
    startTime = time.time()
    env = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
    results_file = open("Results12.csv", 'a')
    agent = DDPG(env, results_file)
    env = Monitor(env, directory='experiments/' + ENV_NAME, force=True)
    results_file.write("Episodes Spent Training; " + str(TEST) +
                       " Episode Eval Avg \n")
    for episode in range(EPISODES):
        state = env.reset()
        if (episode % 20 == 0):
            print("episode:", episode)
        # Train
        for step in range(env.spec.timestep_limit):
            action = agent.noise_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        # Testing:
        if (episode + 1) % 100 == 0 and episode > 100:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                for j in range(env.spec.timestep_limit):
                    env.render()
                    action = agent.action(state)  # direct action for test
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print('episode: ', episode, 'Evaluation Average Reward:',
                  ave_reward)
            results_file.write(str(episode) + "; " + str(ave_reward) + "\n")

    results_file.write("Time Training (" + str(EPISODES) + "episodes);" +
                       str(time.time() - startTime) + "\n")
    results_file.write("Evaluation Episode; Reward \n")
    for episode in range(100):
        total_reward = 0
        env.reset()
        state = env.env.env.set_test(episode)
        for j in range(env.spec.timestep_limit):
            action = agent.action(state)  # direct action for test
            state, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                break
        results_file.write(str(episode) + "; " + str(total_reward) + "\n")
    results_file.write("endExperiment\n\n")
    results_file.close()
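
env.spec.timestep_limit, used for the loop bounds above, was removed in later gym releases in favour of spec.max_episode_steps. A small compatibility helper one might drop into this script (the name and default are assumptions):

def episode_step_limit(env, default=1000):
    # Prefer the newer max_episode_steps, fall back to the old timestep_limit,
    # and finally to a fixed default if the spec carries neither.
    spec = getattr(env, 'spec', None)
    limit = getattr(spec, 'max_episode_steps', None) or \
        getattr(spec, 'timestep_limit', None)
    return limit if limit else default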
示例#48
0
        name=experiment_name,
        monitor_gym=True,
        save_code=True,
    )
    writer = SummaryWriter(f"/tmp/{experiment_name}")

# TRY NOT TO MODIFY: seeding
device = torch.device(
    "cuda" if torch.cuda.is_available() and args.cuda else "cpu")
env = gym.make(args.gym_id)
env = wrap_atari(env)
env = gym.wrappers.RecordEpisodeStatistics(
    env)  # records episode reward in `info['episode']['r']`
if args.capture_video:
    env = QValueVisualizationWrapper(env)
    env = Monitor(env, f"videos/{experiment_name}")
env = wrap_deepmind(
    env,
    clip_rewards=True,
    frame_stack=True,
    scale=False,
)
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.backends.cudnn.deterministic = args.torch_deterministic
env.seed(args.seed)
env.action_space.seed(args.seed)
env.observation_space.seed(args.seed)
# respect the default timelimit
assert isinstance(env.action_space,
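
As the comment above notes, RecordEpisodeStatistics reports the episodic return through the info dict. A short sketch of reading it back during a random rollout with the old gym step API used in this snippet:

obs = env.reset()
for _ in range(10000):
    obs, reward, done, info = env.step(env.action_space.sample())
    if "episode" in info:
        # 'r' is the episodic return, 'l' the episode length in steps.
        print("return:", info["episode"]["r"], "length:", info["episode"]["l"])
    if done:
        obs = env.reset()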
示例#49
0
def wrap_env(env):
    env = Monitor(env, './video', force=True)
    return env
示例#50
0
class PolicyMonitor(object):
  """
  Helps evaluating a policy by running an episode in an environment,
  saving a video, and plotting summaries to Tensorboard.

  Args:
    env: environment to run in
    policy_net: A policy estimator
    summary_writer: a tf.train.SummaryWriter used to write Tensorboard summaries
  """
  def __init__(self, env, policy_net, summary_writer, saver=None):

    self.video_dir = os.path.join(summary_writer.get_logdir(), "../videos")
    self.video_dir = os.path.abspath(self.video_dir)

    self.env = Monitor(env, directory=self.video_dir, video_callable=lambda x: True, resume=True)
    self.global_policy_net = policy_net
    self.summary_writer = summary_writer
    self.saver = saver
    self.sp = StateProcessor()

    self.checkpoint_path = os.path.abspath(os.path.join(summary_writer.get_logdir(), "../checkpoints/model"))

    try:
      os.makedirs(self.video_dir)
    except FileExistsError:
      pass

    # Local policy net
    with tf.variable_scope("policy_eval"):
      self.policy_net = PolicyEstimator(policy_net.num_outputs)

    # Op to copy params from global policy/value net parameters
    self.copy_params_op = make_copy_params_op(
      tf.contrib.slim.get_variables(scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES),
      tf.contrib.slim.get_variables(scope="policy_eval", collection=tf.GraphKeys.TRAINABLE_VARIABLES))

  def _policy_net_predict(self, state, sess):
    feed_dict = { self.policy_net.states: [state] }
    preds = sess.run(self.policy_net.predictions, feed_dict)
    return preds["probs"][0]

  def eval_once(self, sess):
    with sess.as_default(), sess.graph.as_default():
      # Copy params to local model
      global_step, _ = sess.run([tf.contrib.framework.get_global_step(), self.copy_params_op])

      # Run an episode
      done = False
      state = atari_helpers.atari_make_initial_state(self.sp.process(self.env.reset()))
      total_reward = 0.0
      episode_length = 0
      while not done:
        action_probs = self._policy_net_predict(state, sess)
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        next_state, reward, done, _ = self.env.step(action)
        next_state = atari_helpers.atari_make_next_state(state, self.sp.process(next_state))
        total_reward += reward
        episode_length += 1
        state = next_state

      # Add summaries
      episode_summary = tf.Summary()
      episode_summary.value.add(simple_value=total_reward, tag="eval/total_reward")
      episode_summary.value.add(simple_value=episode_length, tag="eval/episode_length")
      self.summary_writer.add_summary(episode_summary, global_step)
      self.summary_writer.flush()

      if self.saver is not None:
        self.saver.save(sess, self.checkpoint_path)

      tf.logging.info("Eval results at step {}: total_reward {}, episode_length {}".format(global_step, total_reward, episode_length))

      return total_reward, episode_length

  def continuous_eval(self, eval_every, sess, coord):
    """
    Continuously evaluates the policy every [eval_every] seconds.
    """
    try:
      while not coord.should_stop():
        self.eval_once(sess)
        # Sleep until next evaluation cycle
        time.sleep(eval_every)
    except tf.errors.CancelledError:
      return
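
A hedged sketch of how a training script typically drives PolicyMonitor: build it once (after the global policy network and its variables exist), then run continuous_eval on its own thread alongside the workers. env, global_policy_net, summary_writer and saver are assumed to come from that surrounding script:

import threading

pe = PolicyMonitor(env=env, policy_net=global_policy_net,
                   summary_writer=summary_writer, saver=saver)
coord = tf.train.Coordinator()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    monitor_thread = threading.Thread(
        target=lambda: pe.continuous_eval(eval_every=300, sess=sess, coord=coord))
    monitor_thread.start()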
示例#51
0
 def convertir_env(self, env):
     self.env = Monitor(env, './video', force=True)
     return self.env
示例#52
0
class Environment(object):
    def __init__(self, game, record=False, width=84, height=84, seed=0):
        self.game = gym.make(game)
        self.game.seed(seed)

        if record:
            self.game = Monitor(self.game, './video', force=True)

        self.width = width
        self.height = height
        self._toTensor = T.Compose([T.ToPILImage(), T.ToTensor()])
        gym_ple  # reference the module so the gym_ple import (which registers PLE envs) is kept

    def play_sample(self, mode: str = 'human'):
        observation = self.game.reset()

        while True:
            screen = self.game.render(mode=mode)
            if mode == 'rgb_array':
                screen = self.preprocess(screen)
            action = self.game.action_space.sample()
            observation, reward, done, info = self.game.step(action)
            if done:
                break
        self.game.close()

    def preprocess(self, screen):
        preprocessed: np.array = cv2.resize(
            screen, (self.height, self.width))  # resize to 84 x 84
        preprocessed = np.dot(preprocessed[..., :3],
                              [0.299, 0.587, 0.114])  # convert to grayscale
        # preprocessed: np.array = preprocessed.transpose((2, 0, 1))  # reorder to (C, W, H)
        preprocessed: np.array = preprocessed.astype('float32') / 255.

        return preprocessed

    def init(self):
        """
        @return observation
        """
        return self.game.reset()

    def get_screen(self):
        screen = self.game.render('rgb_array')
        screen = self.preprocess(screen)
        return screen

    def step(self, action: int):
        observation, reward, done, info = self.game.step(action)
        return observation, reward, done, info

    def reset(self):
        """
        :return: observation array
        """
        observation = self.game.reset()
        observation = self.preprocess(observation)
        return observation

    @property
    def action_space(self):
        return self.game.action_space.n
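
A minimal usage sketch for the wrapper above; the environment id and seed are only examples, and record=True routes frames through the Monitor into ./video:

env = Environment('Breakout-v0', record=True, seed=42)
state = env.reset()                                # preprocessed 84x84 grayscale frame
done = False
while not done:
    action = np.random.randint(env.action_space)   # the action_space property is the action count
    _, reward, done, _ = env.step(action)          # step() returns the raw gym observation
env.game.close()                                   # close the underlying (monitored) gym env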
示例#53
0
 def set_video_dir(self, video_dir):
     self._env = Monitor(
         env=self._env,
         directory=video_dir,
         video_callable=lambda x: True
     )