Example #1
class Learner(object):
    def __init__(self, args):
        """ Construct a Learner from parsed arguments
        """
        self.total_timesteps = 0
        self.total_episodes = 0
        self._datetime = datetime.datetime.now()

        self._render = args.render
        self._learn_loops = args.loops
        self._learn_freq = args.erfreq
        self._offpolicy_noise = args.offpolicy_noise
        self._temp = float(args.temp)
        self._retro = args.retro
        self._filter_actions = args.filter_actions

        # Make environment
        if args.retro:
            import retro

            self._env = retro.make(game=args.env)
        else:
            self._env = gym.make(args.env)

        # Observations
        self._discrete_obs = isinstance(self._env.observation_space,
                                        gym.spaces.Discrete)

        if self._discrete_obs:
            self._state_vars = self._env.observation_space.n  # Prepare for one-hot encoding
        else:
            self._state_vars = int(
                np.prod(self._env.observation_space.shape))

        # Primitive actions
        aspace = self._env.action_space

        if isinstance(aspace, gym.spaces.Tuple):
            aspace = aspace.spaces
        else:
            aspace = [
                aspace
            ]  # Ensure that the action space is a list for all the environments

        if isinstance(aspace[0], gym.spaces.Discrete):
            # Discrete actions
            self._num_actions = int(np.prod([a.n for a in aspace]))
        elif isinstance(aspace[0], gym.spaces.MultiBinary):
            # Retro actions are binary vectors of pressed buttons. Quick HACK,
            # only press one button at a time
            self._num_actions = int(np.prod([a.n for a in aspace]))
        else:
            # Continuous actions
            raise NotImplementedError('Continuous actions are not supported')

        self._aspace = aspace

        # BDPI algorithm instance
        self._bdpi = BDPI(self._state_vars, self._num_actions, args, None)

        # Summary
        print('Number of primitive actions:', self._num_actions)
        print('Number of state variables:', self._state_vars)

    def loadstore(self, filename, load=True):
        """ Load or store weights from/to a file
        """
        self._bdpi.loadstore(filename, load)

    def encode_state(self, state):
        """ Encode a raw state from Gym to a Numpy vector
        """
        if self._discrete_obs:
            # One-hot encode discrete variables
            rs = np.zeros(shape=(self._state_vars, ), dtype=np.float32)
            rs[state] = 1.0
        elif isinstance(state, np.ndarray):
            rs = state.flatten().astype(np.float32)
        else:
            rs = np.array(state, dtype=np.float32)

        return rs

    def reset(self, last_reward):
        self._last_experience = None
        self._first_experience = None
        self._bdpi.reset(last_reward)

        self.total_episodes += 1

    def save_episode(self, name):
        states = []
        actions = []
        rewards = []
        entropies = []

        e = self._first_experience

        while e:
            states.append(e.state())
            actions.append(e.action)
            rewards.append(e.reward)
            entropies.append(e.entropy)

            e = e.next_experience

        s = pickle.dumps((states, actions, rewards, entropies))
        s = lzo.compress(s)
        f = open(name + '.episode', 'wb')
        f.write(s)
        f.close()

    def execute(self, env_state, f_probs, f_probs_n, f_actions, f_Q_values):
        """ Execute one episode in the environment.
        """

        done = False
        cumulative_reward = 0.0
        seen_reward = 0.0
        i = 0
        show_actions = random.random() < 0.05

        while (not done) and (i < 300):

            # Select an action based on the current state
            self.total_timesteps += 1

            old_env_state = env_state
            state = self.encode_state(env_state)
            if self._filter_actions:
                possible_actions = label_decoder_reverse(
                    self._env._get_not_empty_tracks(0))
            else:
                possible_actions = list(range(self._num_actions))
            action, experience = self._bdpi.select_action(
                state, env_state, possible_actions, f_probs, f_probs_n,
                f_Q_values)
            # Change the action if off-policy noise is to be used
            if self._offpolicy_noise and random.random() < self._temp:
                if self._filter_actions:
                    possible_actions = label_decoder_reverse(
                        self._env._get_not_empty_tracks(0))
                    if 52 in possible_actions:
                        # With probability 1/4 force the 'wait' action (52),
                        # otherwise pick a random valid action. Assign to
                        # `action` itself so the executed action and the
                        # stored experience stay consistent.
                        if random.randrange(4) == 0:
                            action = 52
                        else:
                            ind = random.randrange(len(possible_actions))
                            action = possible_actions[ind]
                else:
                    action = random.randrange(self._num_actions)
                experience.action = action

            # Manage the experience chain
            if self._first_experience is None:
                self._first_experience = experience
            if self._last_experience is not None:
                self._last_experience.next_experience = experience

            self._last_experience = experience

            # Execute the action

            if len(self._aspace) > 1:
                # Decode the composite action into one sub-action per factored sub-space
                actions = [0] * len(self._aspace)

                for j in range(len(actions)):
                    actions[j] = action % self._aspace[j].n
                    action //= self._aspace[j].n

                env_state, reward, done, __ = self._env.step(actions)

            else:
                # Simple scalar action
                if self._retro:
                    # Binary action
                    a = np.zeros((self._num_actions, ), dtype=np.int8)
                    a[action] = 1
                    action = a
                env_state, reward, done, __ = self._env.step(action)

                if show_actions:
                    if i == 0:
                        print('start', file=f_actions)

                    if action != 52:
                        print(label_decoder(action).start_track,
                              'to',
                              label_decoder(action).end_track,
                              file=f_actions)
                    else:
                        print('wait', file=f_actions)
                    if reward > 3:
                        print('solved', file=f_actions)
                    if done:
                        print('done', file=f_actions)

            i += 1
            public_reward = reward

            # Render the environment if needed
            if self._render > 0 and self.total_episodes >= self._render:
                self._env.render()

            # Add the reward of the action
            experience.reward = reward
            cumulative_reward += public_reward
            seen_reward += experience.reward

            # Learn from the experience buffer
            if self._learn_freq == 0:
                do_learn = done
            else:
                do_learn = (self.total_timesteps % self._learn_freq == 0)

            if do_learn:
                s = datetime.datetime.now()
                d = (s - self._datetime).total_seconds()
                #print('Start Learning, in-between is %.3f seconds...' % d)

                count = self._bdpi.train(f_Q_values)
                ns = datetime.datetime.now()
                d = (ns - s).total_seconds()
                ##                try:
                ##                    print('Learned %i steps in %.3f seconds, %.2f timesteps per second' % (count, d, count / d))
                ##                    print('S', count / d, file=sys.stderr)
                ##                except ZeroDivisionError:
                ##                    pass
                sys.stderr.flush()
                sys.stdout.flush()
                self._datetime = ns

        return (env_state, cumulative_reward, seen_reward, done, i)
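
The snippet above contains only the Learner class; the file's entry point is not shown. Below is a minimal driver sketch, under stated assumptions: the argparse-style namespace carries the fields read in __init__ (BDPI reads its own hyperparameters from the same namespace; they are omitted here), the four log handles are plain text files, the environment id is a hypothetical placeholder (this example targets a rail/track environment where action 52 means 'wait'), and the caller fetches states from learner._env.reset(), since execute() never resets the environment. Note that reset() must be called before the first execute(), because it is what initialises the experience chain.

import argparse

# Hypothetical driver loop (a sketch, not part of the original file).
args = argparse.Namespace(
    env='RailEnv-v0',   # hypothetical id
    retro=False, filter_actions=False, render=0,
    loops=1, erfreq=1, offpolicy_noise=False, temp='0.1',
)  # ...plus whatever hyperparameter fields BDPI itself reads from args

learner = Learner(args)

f_probs = open('probs.log', 'w')
f_probs_n = open('probs_n.log', 'w')
f_actions = open('actions.log', 'w')
f_Q_values = open('q_values.log', 'w')

last_reward = 0.0
for episode in range(100):
    learner.reset(last_reward)
    env_state = learner._env.reset()  # the class never resets its environment itself
    env_state, cumulative, seen, done, steps = learner.execute(
        env_state, f_probs, f_probs_n, f_actions, f_Q_values)
    last_reward = cumulative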
Example #2
class Learner(object):
    def __init__(self, args, task):
        """ Construct a Learner from parsed arguments
        """
        self.total_timesteps = 0
        self.total_episodes = 0
        self._datetime = datetime.datetime.now()

        self._async_actor = args.async_actor
        self._render = args.render
        self._learn_loops = args.loops
        self._learn_freq = args.erfreq
        self._atari = args.atari
        self._retro = args.retro
        self._offpolicy_noise = args.offpolicy_noise
        self._temp = float(args.temp.split('_')[0])
        self._task = task

        # Make environment

        self._env = gym.make('RPiLEDEnv-v0',
                             resizeCamImagePct=50,
                             ledHSVLower=np.array([0, 0, 252]),
                             ledHSVHigher=np.array([31, 9, 255]),
                             rPiIP='192.168.0.183',
                             rPiPort=50000,
                             episodeLength=100,
                             bullseye=10)

        #         callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-20, verbose=1)
        #
        #         eval_callback = EvalCallback(env, best_model_save_path='./logs/best',
        #                              log_path='./logs/', eval_freq=5000,
        #                              deterministic=True, render=False, callback_on_new_best=callback_on_best)
        #
        # # Added checkpoint because I lost model data after a crash when the webcam shutdown because the screen went to sleep :(
        #         checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./logs/',
        #                                          name_prefix='ppo1_model')
        #
        #         cb = CallbackList([checkpoint_callback, eval_callback])

        if isinstance(self._env.action_space, gym.spaces.Box):
            # Wrap continuous-action environments
            self._env = gym_envs.contwrapper.ContWrapper(self._env)

        # Observations
        ob = self._env.observation_space
        self._discrete_obs = isinstance(ob, gym.spaces.Discrete)

        if self._discrete_obs:
            self._state_shape = (ob.n, )  # Prepare for one-hot encoding
        else:
            self._state_shape = ob.shape

            if len(self._state_shape) > 1:
                # Image observations are HWC; PyTorch expects CHW
                s = self._state_shape

                self._state_shape = (s[2], s[0], s[1])

        # Primitive actions
        aspace = self._env.action_space

        if isinstance(aspace, gym.spaces.Tuple):
            aspace = aspace.spaces
        else:
            aspace = [
                aspace
            ]  # Ensure that the action space is a list for all the environments

        if isinstance(aspace[0], gym.spaces.Discrete):
            # Discrete actions
            self._num_actions = int(np.prod([a.n for a in aspace]))
        elif isinstance(aspace[0], gym.spaces.MultiBinary):
            # Retro actions are binary vectors of pressed buttons. Quick HACK,
            # only press one button at a time
            self._num_actions = int(np.prod([a.n for a in aspace]))

        self._aspace = aspace

        # BDPI algorithm instance
        self._bdpi = BDPI(self._state_shape, self._num_actions, args)

        # Summary
        print('Number of primitive actions:', self._num_actions)
        print('State shape:', self._state_shape)

    def loadstore(self, filename, load=True):
        """ Load or store weights from/to a file
        """
        self._bdpi.loadstore(filename, load)

    def encode_state(self, state):
        """ Encode a raw state from Gym to a Numpy vector
        """
        if self._discrete_obs:
            # One-hot encode discrete variables
            rs = np.zeros(shape=self._state_shape, dtype=np.float32)
            rs[state] = 1.0
            return rs
        elif len(state.shape) > 1:
            # Image observations are HWC; PyTorch expects CHW. transpose (and
            # not swapaxes(2, 0)) matches the (C, H, W) state shape declared
            # in __init__.
            return np.transpose(state, (2, 0, 1))
        else:
            return np.asarray(state, dtype=np.float32)

    def reset(self, last_reward):
        self._last_experience = None
        self._first_experience = None
        self._bdpi.reset(last_reward)

        self.total_episodes += 1

    def save_episode(self, name):
        states = []
        actions = []
        rewards = []
        entropies = []

        e = self._first_experience
        index = self._bdpi._experiences.index(e)

        for e in list(self._bdpi._experiences)[index:]:
            states.append(e.state())
            actions.append(e.action)
            rewards.append(e.reward)
            entropies.append(e.entropy)

        with open(name + '.episode', 'wb') as f:
            f.write(
                lzo.compress(
                    pickle.dumps((states, actions, rewards, entropies))))

        with open('/tmp/' + name + '-buffer.picklez', 'wb') as f:
            f.write(lzo.compress(pickle.dumps(list(self._bdpi._experiences))))

    def execute(self, env_state):
        """ Execute one episode in the environment.
        """

        done = False
        cumulative_reward = 0.0
        i = 0

        while (not done) and (i < 108000):
            # Select an action based on the current state
            self.total_timesteps += 1

            old_env_state = env_state
            state = self.encode_state(env_state)

            action, experience = self._bdpi.select_action(state)

            # Change the action if off-policy noise is to be used
            if self._offpolicy_noise and random.random() < self._temp:
                action = random.randrange(self._num_actions)
                experience.action = action

            # Manage the experience chain
            if self._first_experience is None:
                self._first_experience = experience
            if self._last_experience is not None:
                self._last_experience.set_next(experience)

            self._last_experience = experience

            # Execute the action
            if len(self._aspace) > 1:
                # Decode the composite action into one sub-action per factored sub-space
                actions = [0] * len(self._aspace)

                for j in range(len(actions)):
                    actions[j] = action % self._aspace[j].n
                    action //= self._aspace[j].n

                env_state, reward, done, _ = self._env.step(actions)
            else:
                # Simple scalar action
                if self._retro:
                    # Binary action
                    a = np.zeros((self._num_actions, ), dtype=np.int8)
                    a[action] = 1
                    action = a

                env_state, reward, done, _ = self._env.step(action)

            i += 1

            # Render the environment if needed
            if self._render > 0 and self.total_episodes >= self._render:
                self._env.render()

            # Use the taskfile to modify reward and done
            additional_reward, additional_done = self._task(
                old_env_state, action, env_state)

            reward += additional_reward

            if additional_done is not None:
                done = additional_done

            # Add the reward of the action
            experience.reward = reward
            cumulative_reward += reward

            # Learn from the experience buffer
            if self._learn_freq == 0:
                do_learn = done
            else:
                do_learn = (self.total_timesteps % self._learn_freq == 0)

            if do_learn and not self._async_actor:
                s = datetime.datetime.now()
                d = (s - self._datetime).total_seconds()
                print('Start Learning, in-between is %.3f seconds...' % d)

                count = self._bdpi.train()
                ns = datetime.datetime.now()
                d = (ns - s).total_seconds()
                print(
                    'Learned %i steps in %.3f seconds, %.2f timesteps per second'
                    % (count, d, count / d))
                sys.stderr.flush()
                sys.stdout.flush()
                self._datetime = ns

        return (env_state, cumulative_reward, done, i)
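
In this variant the constructor takes a second argument, task: a callable that execute() invokes as self._task(old_env_state, action, env_state) after every step, and that must return a pair (additional_reward, additional_done), where additional_done may be None to leave the environment's own done flag untouched. A hypothetical example of such a task function, assuming a goal_reached() predicate on states:

# Hypothetical task function matching the (old_state, action, new_state) ->
# (additional_reward, additional_done) interface expected by execute() above.
def reach_goal_task(old_state, action, new_state):
    if goal_reached(new_state):   # goal_reached() is a placeholder predicate
        return 10.0, True         # bonus reward, and force episode termination
    return -0.01, None            # small step penalty; keep the env's done flag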
Example #3
    def __init__(self, args, task):
        """ Construct a Learner from parsed arguments
        """
        self.total_timesteps = 0
        self.total_episodes = 0
        self._datetime = datetime.datetime.now()

        self._async_actor = args.async_actor
        self._render = args.render
        self._learn_loops = args.loops
        self._learn_freq = args.erfreq
        self._atari = args.atari
        self._retro = args.retro
        self._offpolicy_noise = args.offpolicy_noise
        self._temp = float(args.temp.split('_')[0])
        self._task = task

        # Make environment
        if args.retro:
            import retro

            self._env = retro.make(game=args.env)
        elif args.atari:
            self._env = make_atari(args.env)
            self._env = wrap_deepmind(self._env)
        else:
            self._env = gym.make(args.env)

        if isinstance(self._env.action_space, gym.spaces.Box):
            # Wrap continuous-action environments
            self._env = gym_envs.contwrapper.ContWrapper(self._env)

        # Observations
        ob = self._env.observation_space
        self._discrete_obs = isinstance(ob, gym.spaces.Discrete)

        if self._discrete_obs:
            self._state_shape = (ob.n, )  # Prepare for one-hot encoding
        else:
            self._state_shape = ob.shape

            if len(self._state_shape) > 1:
                # Image observations are HWC; PyTorch expects CHW
                s = self._state_shape

                self._state_shape = (s[2], s[0], s[1])

        # Primitive actions
        aspace = self._env.action_space

        if isinstance(aspace, gym.spaces.Tuple):
            aspace = aspace.spaces
        else:
            aspace = [
                aspace
            ]  # Ensure that the action space is a list for all the environments

        if isinstance(aspace[0], gym.spaces.Discrete):
            # Discrete actions
            self._num_actions = int(np.prod([a.n for a in aspace]))
        elif isinstance(aspace[0], gym.spaces.MultiBinary):
            # Retro actions are binary vectors of pressed buttons. Quick HACK,
            # only press one button at a time
            self._num_actions = int(np.prod([a.n for a in aspace]))

        self._aspace = aspace

        # BDPI algorithm instance
        self._bdpi = BDPI(self._state_shape, self._num_actions, args)

        # Summary
        print('Number of primitive actions:', self._num_actions)
        print('State shape:', self._state_shape)
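
Example #3 stops at the constructor, but the composite-action bookkeeping it sets up (self._num_actions as the product of the factored sub-space sizes) is undone at execution time by the mixed-radix loop in the execute() methods of Examples #1, #2 and #4. A standalone illustration of that decoding:

# Standalone illustration of the mixed-radix decoding used in execute():
# a composite action in [0, prod(sizes)) is split into one index per sub-space.
sizes = [3, 4, 2]   # e.g. three Discrete sub-spaces (hypothetical sizes)
action = 17         # composite action index

sub_actions = []
for n in sizes:
    sub_actions.append(action % n)
    action //= n

print(sub_actions)  # -> [2, 1, 1]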
Example #4
class Learner(object):
    def __init__(self, args):
        """ Construct a Learner from parsed arguments
        """
        self.total_timesteps = 0
        self.total_episodes = 0
        self._datetime = datetime.datetime.now()

        self._render = args.render
        self._learn_loops = args.loops
        self._learn_freq = args.erfreq
        self._offpolicy_noise = args.offpolicy_noise
        self._temp = float(args.temp)
        self._retro = args.retro

        # Make environment
        if args.retro:
            import retro

            self._env = retro.make(game=args.env)
        else:
            self._env = gym.make(args.env)

        # Wrap Atari with the DeepMind cheats
        if hasattr(self._env, 'unwrapped') and isinstance(
                self._env.unwrapped, gym.envs.atari.atari_env.AtariEnv):
            assert 'NoFrameskip' in self._env.spec.id

            self._env = atariwrap.NoopResetEnv(self._env, noop_max=30)
            self._env = atariwrap.MaxAndSkipEnv(self._env, skip=4)
            self._env = atariwrap.wrap_deepmind(self._env)

        # Observations
        self._discrete_obs = isinstance(self._env.observation_space,
                                        gym.spaces.Discrete)

        if self._discrete_obs:
            self._state_shape = (self._env.observation_space.n,
                                 )  # Prepare for one-hot encoding
        else:
            self._state_shape = self._env.observation_space.shape

            if len(self._state_shape) > 1:
                # Image observations are HWC; PyTorch expects CHW
                s = self._state_shape

                self._state_shape = (s[2], s[0], s[1])

        # Primitive actions
        aspace = self._env.action_space

        if isinstance(aspace, gym.spaces.Tuple):
            aspace = aspace.spaces
        else:
            aspace = [
                aspace
            ]  # Ensure that the action space is a list for all the environments

        if isinstance(aspace[0], gym.spaces.Discrete):
            # Discrete actions
            self._num_actions = int(np.prod([a.n for a in aspace]))
        elif isinstance(aspace[0], gym.spaces.MultiBinary):
            # Retro actions are binary vectors of pressed buttons. Quick HACK,
            # only press one button at a time
            self._num_actions = int(np.prod([a.n for a in aspace]))
        else:
            # Continuous actions
            print(aspace)
            raise NotImplementedError('Continuous actions are not supported')

        self._aspace = aspace

        # BDPI algorithm instance
        self._bdpi = BDPI(self._state_shape, self._num_actions, args, None)

        # Summary
        print('Number of primitive actions:', self._num_actions)
        print('State shape:', self._state_shape)

    def loadstore(self, filename, load=True):
        """ Load or store weights from/to a file
        """
        self._bdpi.loadstore(filename, load)

    def encode_state(self, state):
        """ Encode a raw state from Gym to a Numpy vector
        """
        if self._discrete_obs:
            # One-hot encode discrete variables
            rs = np.zeros(shape=self._state_shape, dtype=np.float32)
            rs[state] = 1.0
        elif len(self._state_shape) > 1:
            # Image observations are HWC; PyTorch expects CHW. transpose (and
            # not swapaxes(2, 0)) matches the (C, H, W) state shape declared
            # in __init__.
            rs = np.float32(np.transpose(state, (2, 0, 1)))
        else:
            rs = np.asarray(state, dtype=np.float32)

        return rs

    def reset(self, last_reward):
        self._last_experience = None
        self._first_experience = None
        self._bdpi.reset(last_reward)

        self.total_episodes += 1

    def save_episode(self, name):
        states = []
        actions = []
        rewards = []
        entropies = []

        e = self._first_experience

        while e:
            states.append(e.state())
            actions.append(e.action)
            rewards.append(e.reward)
            entropies.append(e.entropy)

            e = e.next_experience

        s = pickle.dumps((states, actions, rewards, entropies))
        s = lzo.compress(s)
        f = open(name + '.episode', 'wb')
        f.write(s)
        f.close()

    def execute(self, env_state):
        """ Execute one episode in the environment.
        """

        done = False
        cumulative_reward = 0.0
        seen_reward = 0.0
        i = 0

        while (not done) and (i < 108000):
            # Select an action based on the current state
            self.total_timesteps += 1

            old_env_state = env_state
            state = self.encode_state(env_state)

            action, experience = self._bdpi.select_action(state, env_state)

            # Change the action if off-policy noise is to be used
            if random.random() < self._offpolicy_noise:
                action = random.randrange(self._num_actions)
                experience.action = action

            # Manage the experience chain
            if self._first_experience is None:
                self._first_experience = experience
            if self._last_experience is not None:
                self._last_experience.next_experience = experience

            self._last_experience = experience

            # Execute the action
            if len(self._aspace) > 1:
                # Decode the composite action into one sub-action per factored sub-space
                actions = [0] * len(self._aspace)

                for j in range(len(actions)):
                    actions[j] = action % self._aspace[j].n
                    action //= self._aspace[j].n

                env_state, reward, done, __ = self._env.step(actions)
            else:
                # Simple scalar action
                if self._retro:
                    # Binary action
                    a = np.zeros((self._num_actions, ), dtype=np.int8)
                    a[action] = 1
                    action = a

                env_state, reward, done, __ = self._env.step(action)

            i += 1
            public_reward = reward

            # Render the environment if needed
            if self._render > 0 and self.total_episodes >= self._render:
                self._env.render()

            # Add the reward of the action
            experience.reward = reward
            cumulative_reward += public_reward
            seen_reward += experience.reward

            # Learn from the experience buffer
            if self._learn_freq == 0:
                do_learn = done
            else:
                do_learn = (self.total_timesteps % self._learn_freq == 0)

            if do_learn:
                s = datetime.datetime.now()
                d = (s - self._datetime).total_seconds()
                print('Start Learning, in-between is %.3f seconds...' % d)

                count = self._bdpi.train()
                ns = datetime.datetime.now()
                d = (ns - s).total_seconds()
                print(
                    'Learned %i steps in %.3f seconds, %.2f timesteps per second'
                    % (count, d, count / d))
                print('S', count / d, file=sys.stderr)
                sys.stderr.flush()
                sys.stdout.flush()
                self._datetime = ns

        return (env_state, cumulative_reward, seen_reward, done, i)
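
For reference, an episode file written by save_episode() above is an LZO-compressed pickle of the (states, actions, rewards, entropies) tuple, so it can be read back with the inverse of the same two calls. A minimal sketch, assuming the python-lzo package used by the code and a file written as save_episode('run0'):

# Sketch: read back a file written by save_episode('run0') above.
import pickle
import lzo

with open('run0.episode', 'rb') as f:
    states, actions, rewards, entropies = pickle.loads(lzo.decompress(f.read()))

print(len(states), 'transitions, return =', sum(rewards))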