예제 #1
0
파일: run_fqi.py 프로젝트: davide1096/ifqi
              state_dim=state_dim,
              action_dim=action_dim,
              discrete_actions=discrete_actions,
              gamma=config['fqi']['gamma'],
              horizon=config['fqi']['horizon'],
              verbose=config['fqi']['verbose'])
    fit_params = config['fit_params']

    if config['experiment_setting']['evaluation']['metric'] == 'n_episodes':
        for i in config['experiment_setting']['evaluation']['n_episodes']:
            episode_end_idxs = np.argwhere(dataset[:, -1] == 1).ravel()
            last_el = episode_end_idxs[i - 1]
            sast, r = split_data_for_fqi(dataset, state_dim, action_dim,
                                         reward_dim, last_el + 1)

            fqi.fit(sast, r, **fit_params)

            experiment_results.append(evaluate(mdp, fqi, mdp.initial_states, args))
        results.append(experiment_results)
    elif config['experiment_setting']['evaluation']['metric'] == 'fqi_iteration':
        sast, r = split_data_for_fqi(dataset, state_dim, action_dim, reward_dim)

        fqi.partial_fit(sast, r, **fit_params)

        for i in range(2, fqi.horizon + 1):
            fqi.partial_fit(None, None, **fit_params)

            if not i % config['experiment_setting']['evaluation']['n_steps_to_evaluate']:
                experiment_results.append(evaluate(mdp, fqi, mdp.initial_states, args))
        results.append(experiment_results)
    else:
예제 #2
0
class EpsilonFQI:
    def __init__(self, fqi, fe, epsilon=0.05):
        """
        Creates an epsilon-greedy policy from the given FQI policy object. 
        :param fqi: an FQI instance from the ifqi package
        :param fe: a feature extractor (method s_features(x) is expected)
        :param epsilon: exploration rate for the policy (0 <= epsilon <= 1)
        """
        self.epsilon = epsilon
        self.fqi = None
        self.actions = None
        self.fe = None

        self.load_fe(fe)

        if isinstance(fqi, dict):
            self.fqi = FQI(**fqi)
            self.actions = fqi['discrete_actions']
        else:
            self.load_fqi(fqi)

    def fit(self, sast, r, **kwargs):
        """
        Fits the policy on the given sast, r dataset, for the amounts of iterations
        defined when creating FQI.
        :param sast: a dataset of (state, action, state, terminal) transitions
        :param r: the rewards corresponding to the transitions in sast
        :param kwargs: additional arguments for the fit() function of FQI
        """
        self.fqi.fit(sast, r, **kwargs)

    def partial_fit(self, sast=None, r=None, **kwargs):
        """
        Fits the policy on the given sast, r dataset, for one iteration.
        :param sast: a dataset of (state, action, state, terminal) transitions
        :param r: the rewards corresponding to the transitions in sast
        :param kwargs: additional arguments for the partial_fit() function of FQI
        """
        self.fqi.partial_fit(sast, r, **kwargs)

    def draw_action(self,
                    state,
                    absorbing,
                    evaluation=False,
                    fully_deterministic=False):
        """
        Picks an action according to the epsilon-greedy choice
        :param state: a state
        :param absorbing: bool, whether the state is absorbing
        :param evaluation: bool, whether to use the epsilon defined for evaluation
        :param fully_deterministic: whether to use FQI, deterministically, to
        select the action
        :return: the selected action
        """
        if not fully_deterministic and random() <= self.epsilon:
            return choice(self.actions)
        else:
            preprocessed_state = self.fe.s_features(state)
            return self.fqi.draw_action(preprocessed_state,
                                        absorbing,
                                        evaluation=evaluation)

    def set_epsilon(self, epsilon):
        """
        :param epsilon: the exploration rate to use 
        """
        self.epsilon = epsilon

    def get_epsilon(self):
        """
        :return: the current exploration rate 
        """
        return self.epsilon

    def load_fqi(self, fqi):
        """
        Loads an FQI policy from file, sets the action space accordingly
        :param fqi: str or file-like object from which to load the policy
        """
        if isinstance(fqi, str):
            self.fqi = joblib.load(fqi)
        else:
            self.fqi = fqi
        # Set the correct action space
        self.actions = self.fqi._actions

    def save_fqi(self, filename):
        """
        Saves the FQI object to file
        :param filename: filename to which save the model
        """
        joblib.dump(self.fqi, filename)

    def load_fe(self, fe):
        """
        Loads the feature extractor from file
        :param fe: str or file-like object from which to load the model
        """
        if isinstance(fe, str):
            self.fe = joblib.load(fe)
            self.fe.load(fe)
        else:
            self.fe = fe

    def save_fe(self, filename):
        """
        Saves the feature extractor to file
        :param filename: filename to which save the model
        """
        if hasattr(self.fe, 'save'):
            self.fe.save(filename)
        else:
            joblib.dump(self.fe, filename)