Example #1
    def fit(self, dataset):
        if not self._quiet:
            tqdm.write('Iteration ' + str(self._iter))

        x, u, r, xn, absorbing, last = parse_dataset(dataset)
        x = x.astype(np.float32)
        u = u.astype(np.float32)
        r = r.astype(np.float32)
        xn = xn.astype(np.float32)

        # Move the parsed arrays onto torch tensors
        obs = to_float_tensor(x, self.policy.use_cuda)
        act = to_float_tensor(u, self.policy.use_cuda)

        # Generalized Advantage Estimation, followed by advantage normalization
        v_target, np_adv = compute_gae(self._V, x, xn, r, absorbing, last,
                                       self.mdp_info.gamma, self._lambda)
        np_adv = (np_adv - np.mean(np_adv)) / (np.std(np_adv) + 1e-8)
        adv = to_float_tensor(np_adv, self.policy.use_cuda)

        # Log-probabilities under the pre-update policy, detached so the
        # importance ratio treats them as constants
        old_pol_dist = self.policy.distribution_t(obs)
        old_log_p = old_pol_dist.log_prob(act)[:, None].detach()

        self._V.fit(x, v_target, **self._critic_fit_params)

        self._update_policy(obs, act, adv, old_log_p)

        # Print fit information
        self._print_fit_info(dataset, x, v_target, old_pol_dist)
        self._iter += 1
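All of these snippets feed transitions through MushroomRL's parse_dataset helper. Below is a minimal sketch of its contract, assuming the usual mushroom_rl.utils.dataset API, with a hand-built two-step dataset where each sample is a (state, action, reward, next_state, absorbing, last) tuple:

import numpy as np
from mushroom_rl.utils.dataset import parse_dataset

# Two toy transitions; the second one ends the episode.
dataset = [
    (np.array([0.0]), np.array([0]), 1.0, np.array([1.0]), False, False),
    (np.array([1.0]), np.array([1]), 0.0, np.array([2.0]), True, True),
]

s, a, r, s_next, absorbing, last = parse_dataset(dataset)
# Under the assumed API: s and a are (2, 1) arrays, while r, absorbing
# and last are length-2 arrays.
print(s.shape, a.shape, r.shape)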
Example #2
def experiment(algorithm_class, exp):
    np.random.seed()

    # MDP
    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialParameter(value=1,
                                   exp=.5,
                                   size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialParameter(value=1, exp=exp, size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = algorithm_class(mdp.info, pi, **algorithm_params)

    # Algorithm
    start = mdp.convert_to_int(mdp._start, mdp._width)
    collect_max_Q = CollectMaxQ(agent.Q, start)
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset, collect_max_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get()

    return reward, max_Qs
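For context on the two schedules above: a hedged sketch of the decay rule that ExponentialParameter is assumed to implement, value / n ** exp with n the visit count of the indexed state (exponential_decay below is a hypothetical stand-in, not the library class):

def exponential_decay(value, exp, n):
    # Assumed rule: the parameter shrinks as value / n ** exp.
    return value / n ** exp

print([exponential_decay(1.0, .5, n) for n in (1, 4, 100)])
# [1.0, 0.5, 0.1]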
Example #3
    def fit(self, dataset):
        phi_state, action, reward, phi_next_state, absorbing, _ = parse_dataset(
            dataset, self.phi)
        phi_state_action = get_action_features(phi_state, action,
                                               self.mdp_info.action_space.n)

        # Iterate the least-squares fixed point until the weights converge
        norm = np.inf
        while norm > self._epsilon:
            q = self.approximator.predict(phi_next_state)
            if np.any(absorbing):
                q *= 1 - absorbing.reshape(-1, 1)

            next_action = np.argmax(q, axis=1).reshape(-1, 1)
            phi_next_state_next_action = get_action_features(
                phi_next_state, next_action, self.mdp_info.action_space.n)

            tmp = phi_state_action - self.mdp_info.gamma *\
                phi_next_state_next_action
            self._A += phi_state_action.T.dot(tmp)
            self._b += (phi_state_action.T.dot(reward)).reshape(-1, 1)

            old_w = self.approximator.get_weights()
            if np.linalg.matrix_rank(self._A) == self._A.shape[1]:
                w = np.linalg.solve(self._A, self._b).ravel()
            else:
                w = np.linalg.pinv(self._A).dot(self._b).ravel()
            self.approximator.set_weights(w)

            norm = np.linalg.norm(w - old_w)
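The pivotal helper here is get_action_features. A sketch of the block one-hot layout it is assumed to produce, with phi(s) copied into the block of the taken action (action_features below is a hypothetical stand-in):

import numpy as np

def action_features(phi_state, action, n_actions):
    n, d = phi_state.shape
    out = np.zeros((n, d * n_actions))
    for i in range(n):
        a = int(action[i, 0])
        out[i, a * d:(a + 1) * d] = phi_state[i]  # fill the action's block
    return out

phi = np.array([[1.0, 2.0]])
print(action_features(phi, np.array([[1]]), 3))
# [[0. 0. 1. 2. 0. 0.]]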
Example #4
    def _fit_boosted(self, x):
        """
        Single fit iteration for boosted FQI.

        Args:
            x (list): the dataset.

        """
        state, action, reward, next_state, absorbing, _ = parse_dataset(x)
        if self._target is None:
            self._target = reward
        else:
            self._next_q += self.approximator.predict(next_state,
                                                      idx=self._idx - 1)
            if np.any(absorbing):
                self._next_q *= 1 - absorbing.reshape(-1, 1)

            max_q = np.max(self._next_q, axis=1)
            self._target = reward + self.mdp_info.gamma * max_q

        self._target -= self._prediction
        self._prediction += self._target

        self.approximator.fit(state,
                              action,
                              self._target,
                              idx=self._idx,
                              **self._fit_params)

        self._idx += 1
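The pair of lines self._target -= self._prediction and self._prediction += self._target is the boosting residual bookkeeping shared by Examples #4 and #6: each new model is fit on whatever the running ensemble still misses, and the running sum is then restored to the full target. A toy numeric sketch:

import numpy as np

prediction = np.array([0.8, 1.5])  # running sum of previously fitted models
target = np.array([1.0, 2.0])      # Bellman target at this iteration

target = target - prediction       # residual the next model must fit
prediction = prediction + target   # running sum equals the full target again

print(target, prediction)          # [0.2 0.5] [1. 2.]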
Example #5
    def _fit(self, x):
        state = list()
        action = list()
        reward = list()
        next_state = list()
        absorbing = list()

        half = len(x) // 2
        for i in range(2):
            s, a, r, ss, ab, _ = parse_dataset(x[i * half:(i + 1) * half])
            state.append(s)
            action.append(a)
            reward.append(r)
            next_state.append(ss)
            absorbing.append(ab)

        if self._target is None:
            self._target = reward
        else:
            for i in range(2):
                q_i = self.approximator.predict(next_state[i], idx=i)

                amax_q = np.expand_dims(np.argmax(q_i, axis=1), axis=1)
                max_q = self.approximator.predict(next_state[i], amax_q,
                                                  idx=1 - i)
                if np.any(absorbing[i]):
                    max_q *= 1 - absorbing[i]
                self._target[i] = reward[i] + self.mdp_info.gamma * max_q

        for i in range(2):
            self.approximator.fit(state[i], action[i], self._target[i], idx=i,
                                  **self._fit_params)
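Example #5 is the double-estimator trick: the greedy action is selected with one approximator and evaluated with the other, which curbs the maximization bias of a single max. A toy sketch with two hypothetical one-state Q-tables:

import numpy as np

q = [np.array([[1.0, 5.0]]), np.array([[2.0, 0.0]])]
for i in range(2):
    a_star = np.argmax(q[i], axis=1)                   # greedy under table i
    value = q[1 - i][np.arange(len(a_star)), a_star]   # scored by the other
    print(i, a_star, value)
# 0 [1] [0.]
# 1 [0] [1.]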
Example #6
    def fit(self, x):
        state, action, reward, next_state, absorbing, _ = parse_dataset(x)
        for _ in trange(self._n_iterations(),
                        dynamic_ncols=True,
                        disable=self._quiet,
                        leave=False):
            if self._target is None:
                self._target = reward
            else:
                self._next_q += self.approximator.predict(next_state,
                                                          idx=self._idx - 1)
                if np.any(absorbing):
                    self._next_q *= 1 - absorbing.reshape(-1, 1)

                max_q = np.max(self._next_q, axis=1)
                self._target = reward + self.mdp_info.gamma * max_q

            self._target -= self._prediction
            self._prediction += self._target

            self.approximator.fit(state,
                                  action,
                                  self._target,
                                  idx=self._idx,
                                  **self._fit_params)

            self._idx += 1
Example #7
def compute_metrics(core, eval_params, agent_builder, cmp_E):
    """
    Compute the evaluation metrics J, R, Q and, optionally, the policy
    entropy E.

    Args:
        core (Core): the core used to run the evaluation;
        eval_params (dict): parameters for running the evaluation;
        agent_builder (AgentBuilder): the agent builder;
        cmp_E (bool): whether the policy entropy should be computed.

    """
    dataset = core.evaluate(**eval_params)

    # Compute J
    J = np.mean(compute_J(dataset, core.mdp.info.gamma))

    # Compute R
    R = np.mean(compute_J(dataset))

    # Compute Q
    states = get_init_states(dataset)
    Q = agent_builder.compute_Q(agent=core.agent, states=states)

    # Compute Policy Entropy
    E = None
    if cmp_E:
        if agent_builder.compute_entropy_with_states:
            E = core.agent.policy.entropy(parse_dataset(dataset)[0])
        else:
            E = core.agent.policy.entropy()

    return J, R, Q, E
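compute_J with an explicit gamma yields the discounted return J; called without it (gamma defaulting to 1 in MushroomRL) it yields the undiscounted return R. A minimal sketch of the per-episode quantity:

import numpy as np

def discounted_return(rewards, gamma=1.0):
    # J = sum_t gamma^t * r_t over one episode
    rewards = np.asarray(rewards, dtype=float)
    return float(np.sum(rewards * gamma ** np.arange(len(rewards))))

print(discounted_return([1.0, 1.0, 1.0], 0.99))  # 2.9701
print(discounted_return([1.0, 1.0, 1.0]))        # 3.0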
Example #8
    def fit(self, dataset):
        state, action, reward, next_state, absorbing, _ = parse_dataset(dataset)

        v, adv = compute_advantage_montecarlo(self._V, state, next_state,
                                              reward, absorbing,
                                              self.mdp_info.gamma)
        self._V.fit(state, v, **self._critic_fit_params)

        loss = self._loss(state, action, adv)
        self._optimize_actor_parameters(loss)
Example #9
    def fit(self, dataset):
        if not self._quiet:
            tqdm.write('Iteration ' + str(self._iter))

        state, action, reward, next_state, absorbing, last = parse_dataset(
            dataset)
        x = state.astype(np.float32)
        u = action.astype(np.float32)
        r = reward.astype(np.float32)
        xn = next_state.astype(np.float32)

        obs = to_float_tensor(x, self.policy.use_cuda)
        act = to_float_tensor(u, self.policy.use_cuda)
        v_target, np_adv = compute_gae(self._V, x, xn, r, absorbing, last,
                                       self.mdp_info.gamma, self._lambda)
        np_adv = (np_adv - np.mean(np_adv)) / (np.std(np_adv) + 1e-8)
        adv = to_float_tensor(np_adv, self.policy.use_cuda)

        # Policy update
        self._old_policy = deepcopy(self.policy)
        old_pol_dist = self._old_policy.distribution_t(obs)
        old_log_prob = self._old_policy.log_prob_t(obs, act).detach()

        zero_grad(self.policy.parameters())
        loss = self._compute_loss(obs, act, adv, old_log_prob)

        prev_loss = loss.item()

        # Compute Gradient
        loss.backward()
        g = get_gradient(self.policy.parameters())

        # Compute direction through conjugate gradient
        stepdir = self._conjugate_gradient(g, obs, old_pol_dist)

        # Line search
        self._line_search(obs, act, adv, old_log_prob, old_pol_dist, prev_loss,
                          stepdir)

        # VF update
        self._V.fit(x, v_target, **self._critic_fit_params)

        # Print fit information
        self._print_fit_info(dataset, x, v_target, old_pol_dist)
        self._iter += 1
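Example #9 is TRPO: the natural-gradient direction comes from running conjugate gradient against the Fisher-vector product, and the step is then validated by a line search. A generic, self-contained CG sketch (not MushroomRL's internal _conjugate_gradient), given only a matrix-vector product:

import numpy as np

def conjugate_gradient(Av, b, n_iters=10, tol=1e-10):
    # Solves A x = b for symmetric positive-definite A.
    x = np.zeros_like(b)
    r = b.copy()          # residual b - A x (x starts at zero)
    p = r.copy()          # search direction
    rs = r @ r
    for _ in range(n_iters):
        Ap = Av(p)
        alpha = rs / (p @ Ap)
        x += alpha * p
        r -= alpha * Ap
        rs_new = r @ r
        if rs_new < tol:
            break
        p = r + (rs_new / rs) * p
        rs = rs_new
    return x

A = np.array([[4.0, 1.0], [1.0, 3.0]])
print(conjugate_gradient(lambda v: A @ v, np.array([1.0, 2.0])))
# ~[0.0909 0.6364]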
Example #10
    def _fit(self, x):
        """
        Single fit iteration.

        Args:
            x (list): the dataset.

        """
        state, action, reward, next_state, absorbing, _ = parse_dataset(x)
        if self._target is None:
            self._target = reward
        else:
            q = self.approximator.predict(next_state)
            if np.any(absorbing):
                q *= 1 - absorbing.reshape(-1, 1)

            max_q = np.max(q, axis=1)
            self._target = reward + self.mdp_info.gamma * max_q

        self.approximator.fit(state, action, self._target, **self._fit_params)
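Example #10 is plain FQI: bootstrapping is suppressed on absorbing transitions before the max is taken. A toy numeric sketch of that masking:

import numpy as np

q_next = np.array([[1.0, 2.0], [3.0, 0.5]])  # Q(s', a) for two samples
absorbing = np.array([0.0, 1.0])             # the second transition is terminal
r = np.array([0.5, 1.0])

q_next *= 1 - absorbing.reshape(-1, 1)       # zero the terminal bootstrap
target = r + 0.99 * q_next.max(axis=1)
print(target)                                # [2.48 1.  ]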
Example #11
def experiment(alg, n_epochs, n_steps, n_episodes_test):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    gamma = 0.99
    habitat_root_path = Habitat.root_path()
    config_file = os.path.join(
        habitat_root_path, 'habitat_baselines/config/rearrange/rl_pick.yaml')
    base_config_file = os.path.join(habitat_root_path,
                                    'configs/tasks/rearrange/pick.yaml')
    wrapper = 'HabitatRearrangeWrapper'
    mdp = Habitat(wrapper, config_file, base_config_file, gamma=gamma)

    # Settings
    initial_replay_size = 64
    max_replay_size = 50000
    batch_size = 64
    n_features = 64
    warmup_transitions = 100
    tau = 0.005
    lr_alpha = 3e-4

    use_cuda = torch.cuda.is_available()

    # Approximator
    actor_input_shape = mdp.info.observation_space.shape
    actor_mu_params = dict(network=ActorNetwork,
                           n_features=n_features,
                           input_shape=actor_input_shape,
                           output_shape=mdp.info.action_space.shape,
                           use_cuda=use_cuda)
    actor_sigma_params = dict(network=ActorNetwork,
                              n_features=n_features,
                              input_shape=actor_input_shape,
                              output_shape=mdp.info.action_space.shape,
                              use_cuda=use_cuda)

    actor_optimizer = {'class': optim.Adam, 'params': {'lr': 3e-4}}

    critic_input_shape = actor_input_shape + mdp.info.action_space.shape
    critic_params = dict(network=CriticNetwork,
                         optimizer={
                             'class': optim.Adam,
                             'params': {
                                 'lr': 3e-4
                             }
                         },
                         loss=F.mse_loss,
                         n_features=n_features,
                         input_shape=critic_input_shape,
                         output_shape=(1, ),
                         use_cuda=use_cuda)

    # Agent
    agent = alg(mdp.info,
                actor_mu_params,
                actor_sigma_params,
                actor_optimizer,
                critic_params,
                batch_size,
                initial_replay_size,
                max_replay_size,
                warmup_transitions,
                tau,
                lr_alpha,
                critic_fit_params=None)

    # Algorithm
    core = Core(agent, mdp)

    # RUN
    dataset = core.evaluate(n_episodes=n_episodes_test, render=False)
    s, *_ = parse_dataset(dataset)

    J = np.mean(compute_J(dataset, mdp.info.gamma))
    R = np.mean(compute_J(dataset))
    E = agent.policy.entropy(s)

    logger.epoch_info(0, J=J, R=R, entropy=E)

    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    for n in trange(n_epochs, leave=False):
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset = core.evaluate(n_episodes=n_episodes_test, render=False)
        s, *_ = parse_dataset(dataset)

        J = np.mean(compute_J(dataset, mdp.info.gamma))
        R = np.mean(compute_J(dataset))
        E = agent.policy.entropy(s)

        logger.epoch_info(n + 1, J=J, R=R, entropy=E)

    logger.info('Press a button to visualize the robot')
    input()
    core.evaluate(n_episodes=5, render=True)
Example #12
def experiment(alg, n_epochs, n_steps, n_steps_test):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    horizon = 200
    gamma = 0.99
    mdp = Gym('Pendulum-v1', horizon, gamma)

    # Settings
    initial_replay_size = 64
    max_replay_size = 50000
    batch_size = 64
    n_features = 64
    warmup_transitions = 100
    tau = 0.005
    lr_alpha = 3e-4

    use_cuda = torch.cuda.is_available()

    # Approximator
    actor_input_shape = mdp.info.observation_space.shape
    actor_mu_params = dict(network=ActorNetwork,
                           n_features=n_features,
                           input_shape=actor_input_shape,
                           output_shape=mdp.info.action_space.shape,
                           use_cuda=use_cuda)
    actor_sigma_params = dict(network=ActorNetwork,
                              n_features=n_features,
                              input_shape=actor_input_shape,
                              output_shape=mdp.info.action_space.shape,
                              use_cuda=use_cuda)

    actor_optimizer = {'class': optim.Adam,
                       'params': {'lr': 3e-4}}

    critic_input_shape = (actor_input_shape[0] + mdp.info.action_space.shape[0],)
    critic_params = dict(network=CriticNetwork,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': 3e-4}},
                         loss=F.mse_loss,
                         n_features=n_features,
                         input_shape=critic_input_shape,
                         output_shape=(1,),
                         use_cuda=use_cuda)

    # Agent
    agent = alg(mdp.info, actor_mu_params, actor_sigma_params,
                actor_optimizer, critic_params, batch_size, initial_replay_size,
                max_replay_size, warmup_transitions, tau, lr_alpha,
                critic_fit_params=None)

    # Algorithm
    core = Core(agent, mdp)

    # RUN
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    s, *_ = parse_dataset(dataset)

    J = np.mean(compute_J(dataset, mdp.info.gamma))
    R = np.mean(compute_J(dataset))
    E = agent.policy.entropy(s)

    logger.epoch_info(0, J=J, R=R, entropy=E)

    core.learn(n_steps=initial_replay_size, n_steps_per_fit=initial_replay_size)

    for n in trange(n_epochs, leave=False):
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        s, *_ = parse_dataset(dataset)

        J = np.mean(compute_J(dataset, mdp.info.gamma))
        R = np.mean(compute_J(dataset))
        E = agent.policy.entropy(s)

        logger.epoch_info(n+1, J=J, R=R, entropy=E)

    logger.info('Press a button to visualize the pendulum')
    input()
    core.evaluate(n_episodes=5, render=True)
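Examples #11 and #12 log the policy entropy E via agent.policy.entropy(s). Assuming a diagonal Gaussian actor (before any squashing), this is the differential entropy; a small sketch under that assumption:

import numpy as np

def gaussian_entropy(sigma):
    # H = k/2 * (1 + log(2*pi)) + sum_i log(sigma_i) for a diagonal Gaussian
    k = len(sigma)
    return 0.5 * k * (1.0 + np.log(2.0 * np.pi)) + float(np.sum(np.log(sigma)))

print(gaussian_entropy(np.array([1.0])))  # ~1.419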