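# NOTE: the imports below are a sketch of what this listing needs to run on its own.
# numpy, tensorflow, gym, ray, os and datetime follow directly from the code;
# the import paths of the package-internal helpers (RunnerBox, Agent,
# Replay_buffer, Smoothing_aggregator, all_subdirs_of) are assumptions about
# the surrounding package layout and are therefore only indicated here.
import os
from datetime import datetime

import gym
import numpy as np
import ray
import tensorflow as tf

# from .runner_box import RunnerBox, Agent                  # path assumed
# from .buffer import Replay_buffer                         # path assumed
# from .utils import Smoothing_aggregator, all_subdirs_of   # path assumed

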
class SampleManager:

    """
    @args:
        model: model object; a tf.keras.Model (or a model imitating a tf model) returning a dictionary of tensors with the possible keys 'q_values', 'policy', or 'mus' and 'sigmas' for continuous policies, and optionally 'value_estimate'
        environment: string specifying gym environment or object of custom gym-like (implementing the same methods) environment
        num_parallel: int, number of agents to run in parallel
        total_steps: int, how many steps to collect for the experience replay
        returns: list of strings specifying what is to be returned by the runner boxes
            supported are: 'value_estimate', 'log_prob', 'monte_carlo'
        action_sampling_type: string, type of action sampling; supported are 'epsilon_greedy', 'thompson', 'discrete_policy' or 'continuous_normal_diagonal'

    @kwargs:
        model_kwargs: dict, optional model initialization specifications
        weights: optional, weights which can be loaded into the agent for remote data collecting
        input_shape: shape or boolean (if shape not needed for the first call of the model), defaults to the shape of the environment's reset state

        env_kwargs: dict, optional configurations for environment creation if a custom environment is used

        num_episodes: specifies the total number of episodes to run on the environment for each runner, defaults to 1
        num_steps: specifies the total number of steps to run on the environment for each runner

        gamma: float, discount factor for monte carlo return, defaults to 0.99
        temperature: float, temperature for Thompson sampling, defaults to 1
        epsilon: epsilon for epsilon greedy sampling, defaults to 0.95

        remote_min_returns: int, minimum number of remote runner results to wait for, defaults to 10% of num_parallel
        remote_time_out: float, maximum amount of time (in seconds) to wait on the remote runner results, defaults to None

        is_tf: boolean, whether the model is a tensorflow model and needs initialization
    """

    def __init__(
        self, model, environment, num_parallel, total_steps, returns=None, **kwargs
    ):
        # avoid a shared mutable default argument
        returns = returns if returns is not None else []

        self.model = model
        self.environment = environment
        self.num_parallel = num_parallel
        self.total_steps = total_steps
        self.buffer = None

        # create gym / custom gym-like environment
        if isinstance(self.environment, str):
            self.env_instance = gym.make(self.environment)
        else:
            env_kwargs = kwargs.pop("env_kwargs", {})
            self.env_instance = self.env_creator(self.environment, **env_kwargs)

        # if no input shape is given, use the expanded reset state as a dummy input
        if "input_shape" not in kwargs:
            state = self.env_instance.reset()
            state = np.expand_dims(state, axis=0)
            kwargs["input_shape"] = state

        # if no model_kwargs given, set to empty dict
        if "model_kwargs" not in kwargs:
            kwargs["model_kwargs"] = {}
        # assume a tensorflow model that needs initialization unless specified otherwise
        if "is_tf" not in kwargs:
            kwargs["is_tf"] = True

        # initialize random weights if not given
        if "weights" not in kwargs:
            random_weights = self.initialize_weights(
                self.model,
                kwargs["input_shape"],
                kwargs["model_kwargs"],
                kwargs["is_tf"],
            )
            kwargs["weights"] = random_weights

        kwargs["test"] = False
        kwargs["discrete_env"] = True
        self.discrete_env = True
        self.kwargs = kwargs
        ## some checkups

        assert self.num_parallel > 0, "num_parallel hast to be greater than 0!"

        self.kwargs["discrete_env"] = True
        # check action sampling type
        if "action_sampling_type" in kwargs.keys():
            sampling_type = kwargs["action_sampling_type"]
            if sampling_type not in [
                "thompson",
                "epsilon_greedy",
                "discrete_policy",
                "continuous_normal_diagonal",
            ]:
                print(
                    f"unsupported sampling type: {sampling_type}. assuming sampling from a discrete policy instead."
                )
                self.kwargs["action_sampling_type"] = "discrete_policy"
            if sampling_type == "continuous_normal_diagonal":
                self.discrete_env = False
                self.kwargs["discrete_env"] = False

        if not ("temperature" in self.kwargs.keys()):
            self.kwargs["temperature"] = 1
        if not ("epsilon" in self.kwargs.keys()):
            self.kwargs["epsilon"] = 0.95
        # check return specifications
        supported_returns = ["log_prob", "monte_carlo", "value_estimate"]
        for r in returns:
            if r not in supported_returns:
                print(f"unsupported return key: {r}")
            if r == "value_estimate":
                self.kwargs["value_estimate"] = True
        # keep only the supported return keys
        self.returns = [r for r in returns if r in supported_returns]

        # check for runner sampling method:
        # error if both are specified
        self.run_episodes = True
        self.runner_steps = 1
        if "num_episodes" in kwargs.keys():
            self.runner_steps = kwargs["num_episodes"]
            if "num_steps" in kwargs.keys():
                print(
                    "Both episode mode and step mode for runner sampling are specified. Please only specify one."
                )
                raise ValueError
            self.kwargs.pop("num_episodes")
        elif "num_steps" in kwargs.keys():
            self.runner_steps = kwargs["num_steps"]
            self.run_episodes = False
            self.kwargs.pop("num_steps")

        # check for remote process specifications
        if "remote_min_returns" in kwargs.keys():
            self.remote_min_returns = kwargs["remote_min_returns"]
            self.kwargs.pop("remote_min_returns")
        else:
            # defaults to 10% of remote runners, but minimum 1
            self.remote_min_returns = max([int(0.1 * self.num_parallel), 1])

        if "remote_time_out" in kwargs.keys():
            self.remote_time_out = kwargs["remote_time_out"]
            self.kwargs.pop("remote_time_out")
        else:
            # defaults to None, i.e. wait for remote_min_returns to be returned irrespective of time
            self.remote_time_out = None

        self.reset_data()
        # # TODO: print info on setup values

    def reset_data(self):
        # initialize an empty data aggregator
        self.data = {}
        self.data["action"] = []
        self.data["state"] = []
        self.data["reward"] = []
        self.data["state_new"] = []
        self.data["not_done"] = []
        for r in self.returns:
            self.data[r] = []

    def initialize_weights(self, model, input_dummy, model_kwargs, is_tf):
        model_inst = model(**model_kwargs)
        if is_tf:
            if hasattr(model, "tensorflow"):
                assert (
                    input_dummy is not None
                ), 'You have a tensorflow model with no input shape specified for weight initialization. \n Specify input_shape in "model_kwargs" or specify as False if not needed'
            # build the model by calling it once on the dummy input
            model_inst(input_dummy)
        weights = model_inst.get_weights()

        return weights

    def get_data(self, do_print=False, total_steps=None):

        self.reset_data()
        if total_steps is not None:
            old_steps = self.total_steps
            self.total_steps = total_steps

        not_done = True
        # create list of runner boxes
        runner_boxes = [
            RunnerBox.remote(
                Agent,
                self.model,
                self.env_instance,
                runner_position=i,
                returns=self.returns,
                **self.kwargs,
            )
            for i in range(self.num_parallel)
        ]
        t = 0

        # initial processes
        if self.run_episodes:
            runner_processes = [
                b.run_n_episodes.remote(self.runner_steps) for b in runner_boxes
            ]
        else:
            runner_processes = [
                b.run_n_steps.remote(self.runner_steps) for b in runner_boxes
            ]

        # run as long as not yet reached number of total steps
        while not_done:

            ready, remaining = ray.wait(
                runner_processes,
                num_returns=self.remote_min_returns,
                timeout=self.remote_time_out,
            )
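            # ray.wait blocks until at least remote_min_returns runner boxes have
            # finished (or until remote_time_out expires); finished boxes are read
            # out below and immediately restarted so all workers stay busy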
            # each runner box returns a tuple (data_agg, index)
            returns = ray.get(ready)
            results = []
            indexes = []
            for r in returns:
                result, index = r
                results.append(result)
                indexes.append(index)

            # store data from dones
            if do_print:
                print(f"iteration: {t}, storing results of {len(results)} runners")
            not_done = self._store(results)
            # get boxes that are already done
            accessed_mapping = map(runner_boxes.__getitem__, indexes)
            done_runners = list(accessed_mapping)
            # create new processes
            if self.run_episodes:
                new_processes = [
                    b.run_n_episodes.remote(self.runner_steps) for b in done_runners
                ]

            else:
                new_processes = [
                    b.run_n_steps.remote(self.runner_steps) for b in done_runners
                ]

            # concatenate old and new processes
            runner_processes = remaining + new_processes
            t += 1

        if total_steps is not None:
            self.total_steps = old_steps

        return self.data

    # stores results and checks whether we are done
    def _store(self, results):
        not_done = True
        # results is a list of dictionaries
        assert (
            self.data.keys() == results[0].keys()
        ), "data keys and return keys do not match"

        for r in results:
            for k in self.data.keys():
                self.data[k].extend(r[k])

        # stop if enough data has been aggregated
        if len(self.data["state"]) > self.total_steps:
            not_done = False

        return not_done

    def sample(self, sample_size, from_buffer=True):
        # sample from buffer
        if from_buffer:
            data = self.buffer.sample(sample_size)
        else:
            data = self.get_data(total_steps=sample_size)

        return data

    def get_agent(self, test=False):

        if test:
            self.kwargs["test"] = True

        # get agent specifications from runner box
        runner_box = RunnerBox.remote(
            Agent,
            self.model,
            self.env_instance,
            runner_position=0,
            returns=self.returns,
            **self.kwargs,
        )
        agent_kwargs = ray.get(runner_box.get_agent_kwargs.remote())
        agent = Agent(self.model, **agent_kwargs)

        if test:
            self.kwargs["test"] = False

        return agent

    def set_agent(self, new_weights):
        self.kwargs["weights"] = new_weights

    def set_temperature(self, temperature):
        self.kwargs["temperature"] = temperature

    def set_epsilon(self, epsilon):
        self.kwargs["epsilon"] = epsilon

    def initialize_buffer(
        self, size, optim_keys=["state", "action", "reward", "state_new", "not_done"]
    ):
        self.buffer = Replay_buffer(size, optim_keys)

    def store_in_buffer(self, data_dict):
        self.buffer.put(data_dict)

    def test(
        self,
        max_steps,
        test_episodes=100,
        evaluation_measure="time",
        render=False,
        do_print=False,
    ):

        env = self.env_instance
        agent = self.get_agent(test=True)

        # get evaluation specs
        return_time = False
        return_reward = False

        if evaluation_measure == "time":
            return_time = True
            time_steps = []
        elif evaluation_measure == "reward":
            return_reward = True
            rewards = []
        elif evaluation_measure == "time_and_reward":
            return_time = True
            return_reward = True
            time_steps = []
            rewards = []
        else:
            raise ValueError(
                f"unrecognized evaluation measure: {evaluation_measure} \n Change to 'time', 'reward' or 'time_and_reward'."
            )

        for e in range(test_episodes):
            state_new = np.expand_dims(env.reset(), axis=0)
            if return_reward:
                reward_per_episode = []

            for t in range(max_steps):
                if render:
                    env.render()
                state = state_new
                action = agent.act(state)
                # check if action is tf
                if tf.is_tensor(action):
                    action = action.numpy()
                if self.kwargs["discrete_env"]:
                    action = int(action)
                state_new, reward, done, info = env.step(action)
                state_new = np.expand_dims(state_new, axis=0)
                if return_reward:
                    reward_per_episode.append(reward)
                if done:
                    if return_time:
                        time_steps.append(t)
                    if return_reward:
                        rewards.append(np.mean(reward_per_episode))
                    break
                if t == max_steps - 1:
                    if return_time:
                        time_steps.append(t)
                    if return_reward:
                        rewards.append(np.mean(reward_per_episode))
                    break

        env.close()

        if return_time & return_reward:
            if do_print:
                print(
                    f"Episodes finished after a mean of {np.mean(time_steps)} timesteps"
                )
                print(
                    f"Episodes finished after a mean of {np.mean(rewards)} accumulated reward"
                )
            return time_steps, rewards
        elif return_time:
            if do_print:
                print(
                    f"Episodes finished after a mean of {np.mean(time_steps)} timesteps"
                )
            return time_steps
        elif return_reward:
            if do_print:
                print(
                    f"Episodes finished after a mean of {np.mean(rewards)} accumulated reward"
                )
            return rewards

    def initialize_aggregator(
        self, path, saving_after=10, aggregator_keys=["loss"], max_size=5, init_epoch=0
    ):
        self.agg = Smoothing_aggregator(
            path, saving_after, aggregator_keys, max_size, init_epoch
        )

    def update_aggregator(self, **kwargs):
        self.agg.update(**kwargs)

    def env_creator(self, env_class, **kwargs):
        return env_class(**kwargs)

    def set_env(self, env_kwargs):
        self.env_instance = self.env_creator(self.environment, **env_kwargs)

    def save_model(self, path, epoch, model_name="model"):
        time_stamp = datetime.now().strftime("%d-%m-%Y_%I-%M-%S_%p")
        full_path = f"{path}/{model_name}_{epoch}_{time_stamp}"
        agent = self.get_agent()
        print("saving model...")
        agent.model.save(full_path)

    def load_model(self, path, model_name=None):
        if model_name is not None:
            # # TODO:
            print("specific model loading not yet implemented")
        # always loads the latest model
        subdirs = all_subdirs_of(path)
        latest_subdir = max(subdirs, key=os.path.getmtime)
        print("loading model...")
        model = tf.keras.models.load_model(latest_subdir)
        weights = model.get_weights()
        self.set_agent(weights)
        agent = self.get_agent()
        return agent
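

# Minimal usage sketch (not part of the original listing): wiring a SampleManager
# up for a discrete gym task. "CartPole-v0" and MyQNetwork are placeholders /
# assumptions; any tf.keras.Model subclass returning a dict with a 'q_values'
# key fits the interface described in the class docstring.
if __name__ == "__main__":
    ray.init()

    manager = SampleManager(
        model=MyQNetwork,  # hypothetical tf.keras.Model subclass
        environment="CartPole-v0",
        num_parallel=4,
        total_steps=1000,
        action_sampling_type="epsilon_greedy",
        epsilon=0.9,
    )

    # collect experience, push it into a replay buffer and draw a training batch
    manager.initialize_buffer(size=10000)
    data = manager.get_data(do_print=True)
    manager.store_in_buffer(data)
    batch = manager.sample(sample_size=64, from_buffer=True)

    # evaluate the current weights for a few episodes (returns steps per episode)
    steps_per_episode = manager.test(max_steps=200, test_episodes=10, do_print=True)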