Example #1
def _add_log_prob_and_value_to_episodes(
    episodes,
    model,
    phi,
    batch_states,
    obs_normalizer,
):

    dataset = list(itertools.chain.from_iterable(episodes))
    xp = model.xp

    # Compute v_pred and next_v_pred
    states = batch_states([b['state'] for b in dataset], xp, phi)
    next_states = batch_states([b['next_state'] for b in dataset], xp, phi)

    if obs_normalizer:
        states = obs_normalizer(states, update=False)
        next_states = obs_normalizer(next_states, update=False)

    with chainer.using_config('train', False), chainer.no_backprop_mode():
        distribs, vs_pred = model(states)
        _, next_vs_pred = model(next_states)

        actions = xp.array([b['action'] for b in dataset])
        log_probs = chainer.cuda.to_cpu(distribs.log_prob(actions).array)
        vs_pred = chainer.cuda.to_cpu(vs_pred.array.ravel())
        next_vs_pred = chainer.cuda.to_cpu(next_vs_pred.array.ravel())

    for transition, log_prob, v_pred, next_v_pred in zip(
            dataset, log_probs, vs_pred, next_vs_pred):
        transition['log_prob'] = log_prob
        transition['v_pred'] = v_pred
        transition['next_v_pred'] = next_v_pred
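Every example on this page passes observations through batch_states before feeding them to a model. For reference, here is a minimal sketch of what such a helper commonly looks like (assuming phi is a per-observation preprocessing function and xp is NumPy or CuPy); the definition used by a given project may differ:

import numpy as np


def batch_states(states, xp, phi):
    # Apply the preprocessing function to each observation and stack the
    # results into a single xp array of shape (batch_size, ...).
    return xp.asarray([phi(s) for s in states])


# Usage: two 3-dimensional observations batched with NumPy and a float32 cast as phi.
batch = batch_states([np.zeros(3), np.ones(3)], np, lambda x: x.astype(np.float32))
assert batch.shape == (2, 3)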
Example #2
    def _make_dataset(self):
        dataset = list(itertools.chain.from_iterable(self.memory))
        xp = self.vf.xp

        # Compute v_pred and next_v_pred
        states = batch_states([b['state'] for b in dataset], xp, self.phi)
        next_states = batch_states([b['next_state'] for b in dataset], xp,
                                   self.phi)
        if self.obs_normalizer:
            states = self.obs_normalizer(states, update=False)
            next_states = self.obs_normalizer(next_states, update=False)
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            vs_pred = chainer.cuda.to_cpu(self.vf(states).data.ravel())
            next_vs_pred = chainer.cuda.to_cpu(
                self.vf(next_states).data.ravel())
        for transition, v_pred, next_v_pred in zip(dataset, vs_pred,
                                                   next_vs_pred):
            transition['v_pred'] = v_pred
            transition['next_v_pred'] = next_v_pred

        # Update stats
        self.value_record.extend(vs_pred)

        # Compute adv and v_teacher
        for episode in self.memory:
            adv = 0.0
            for transition in reversed(episode):
                td_err = (transition['reward'] +
                          (self.gamma * transition['nonterminal'] *
                           transition['next_v_pred']) - transition['v_pred'])
                adv = td_err + self.gamma * self.lambd * adv
                transition['adv'] = adv
                transition['v_teacher'] = adv + transition['v_pred']

        return dataset
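The loop at the end of _make_dataset implements the GAE recursion adv_t = td_err_t + gamma * lambda * adv_{t+1}, where td_err_t is the one-step TD error. Below is a self-contained illustration with made-up numbers (gamma, lambd and the transition values are assumptions, not taken from the snippet):

gamma, lambd = 0.99, 0.95
episode = [
    {'reward': 1.0, 'v_pred': 0.5, 'next_v_pred': 0.6, 'nonterminal': 1.0},
    {'reward': 0.0, 'v_pred': 0.6, 'next_v_pred': 0.0, 'nonterminal': 0.0},
]
adv = 0.0
for transition in reversed(episode):
    # One-step TD error: r + gamma * V(s') - V(s), with V(s') dropped after
    # terminal transitions via the 'nonterminal' flag.
    td_err = (transition['reward']
              + gamma * transition['nonterminal'] * transition['next_v_pred']
              - transition['v_pred'])
    adv = td_err + gamma * lambd * adv
    transition['adv'] = adv
    # The regression target for the value function is adv + V(s).
    transition['v_teacher'] = adv + transition['v_pred']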
Example #3
def batch_recurrent_experiences(experiences,
                                model,
                                xp,
                                phi,
                                gamma,
                                batch_states=batch_states):
    """Batch experiences for recurrent model updates.

    Args:
        experiences: list of episodes. Each episode is a list
            containing between 1 and n dicts, each containing:
              - state (object): State
              - action (object): Action
              - reward (float): Reward
              - is_state_terminal (bool): True iff next state is terminal
              - next_state (object): Next state
        model (chainer.Link): Model that implements StatelessRecurrent.
        xp: NumPy-compatible array library, e.g. NumPy or CuPy.
        phi: Preprocessing function.
        gamma: Discount factor.
        batch_states: Function that converts a list of states to a batch.
    Returns:
        dict of batched transitions
    """
    flat_transitions = list(itertools.chain.from_iterable(experiences))
    batch_exp = {
        'state': [
            batch_states([transition['state'] for transition in ep], xp, phi)
            for ep in experiences
        ],
        'action':
        xp.array([transition['action'] for transition in flat_transitions]),
        'reward':
        xp.array([transition['reward'] for transition in flat_transitions],
                 dtype=np.float32),
        'next_state': [
            batch_states([transition['next_state']
                          for transition in ep], xp, phi) for ep in experiences
        ],
        'is_state_terminal':
        xp.array([
            transition['is_state_terminal'] for transition in flat_transitions
        ],
                 dtype=np.float32),
        'discount':
        xp.full(len(flat_transitions), gamma, dtype=np.float32),
        'recurrent_state':
        model.concatenate_recurrent_states(
            [ep[0]['recurrent_state'] for ep in experiences]),
        'next_recurrent_state':
        model.concatenate_recurrent_states(
            [ep[0]['next_recurrent_state'] for ep in experiences]),
    }
    # Batch next actions only when all the transitions have them
    if all(transition['next_action'] is not None
           for transition in flat_transitions):
        batch_exp['next_action'] = xp.asarray(
            [transition['next_action'] for transition in flat_transitions])
    return batch_exp
Example #4
def _add_log_prob_and_value_to_episodes_recurrent(
    episodes,
    model,
    phi,
    batch_states,
    obs_normalizer,
):
    xp = model.xp

    # Prepare data for a recurrent model
    seqs_states = []
    seqs_next_states = []
    for ep in episodes:
        states = batch_states([transition['state'] for transition in ep], xp,
                              phi)
        next_states = batch_states(
            [transition['next_state'] for transition in ep], xp, phi)
        if obs_normalizer:
            states = obs_normalizer(states, update=False)
            next_states = obs_normalizer(next_states, update=False)
        seqs_states.append(states)
        seqs_next_states.append(next_states)

    flat_transitions = list(itertools.chain.from_iterable(episodes))

    # Predict values using a recurrent model
    with chainer.using_config('train', False), chainer.no_backprop_mode():
        rs = model.concatenate_recurrent_states(
            [ep[0]['recurrent_state'] for ep in episodes])
        next_rs = model.concatenate_recurrent_states(
            [ep[0]['next_recurrent_state'] for ep in episodes])
        assert len(rs) == len(next_rs)

        (flat_distribs,
         flat_vs), _ = model.n_step_forward(seqs_states,
                                            recurrent_state=rs,
                                            output_mode='concat')
        (_, flat_next_vs), _ = model.n_step_forward(seqs_next_states,
                                                    recurrent_state=next_rs,
                                                    output_mode='concat')

        flat_actions = xp.array([b['action'] for b in flat_transitions])
        flat_log_probs = flat_distribs.log_prob(flat_actions)
        flat_log_probs = chainer.cuda.to_cpu(flat_log_probs.array)
        flat_vs = chainer.cuda.to_cpu(flat_vs.array)
        flat_next_vs = chainer.cuda.to_cpu(flat_next_vs.array)

    # Add predicted values to transitions
    for transition, log_prob, v, next_v in zip(flat_transitions,
                                               flat_log_probs, flat_vs,
                                               flat_next_vs):
        transition['log_prob'] = float(log_prob)
        transition['v_pred'] = float(v)
        transition['next_v_pred'] = float(next_v)
Example #5
def batch_experiences(experiences, xp, phi, gamma, batch_states=batch_states):
    """Takes a batch of k experiences each of which contains j

    consecutive transitions and vectorizes them, where j is between 1 and n.

    Args:
        experiences: list of experiences. Each experience is a list
            containing between 1 and n dicts containing
              - state (object): State
              - action (object): Action
              - reward (float): Reward
              - is_state_terminal (bool): True iff next state is terminal
              - next_state (object): Next state
        xp: NumPy-compatible array library, e.g. NumPy or CuPy.
        phi: Preprocessing function.
        gamma: Discount factor.
        batch_states: Function that converts a list of states to a batch.
    Returns:
        dict of batched transitions
    """

    batch_exp = {
        'state':
        batch_states([elem[0]['state'] for elem in experiences], xp, phi),
        'action':
        xp.asarray([elem[0]['action'] for elem in experiences]),
        'reward':
        xp.asarray([
            sum((gamma**i) * exp[i]['reward'] for i in range(len(exp)))
            for exp in experiences
        ],
                   dtype=np.float32),
        'next_state':
        batch_states([elem[-1]['next_state'] for elem in experiences], xp,
                     phi),
        'is_state_terminal':
        xp.asarray([
            any(transition['is_state_terminal'] for transition in exp)
            for exp in experiences
        ],
                   dtype=np.float32),
        'discount':
        xp.asarray([(gamma**len(elem)) for elem in experiences],
                   dtype=np.float32)
    }
    if all(elem[-1]['next_action'] is not None for elem in experiences):
        batch_exp['next_action'] = xp.asarray(
            [elem[-1]['next_action'] for elem in experiences])
    return batch_exp
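A usage sketch for the n-step batch_experiences above (hypothetical values; it assumes the function and a default batch_states helper are in scope, and uses NumPy as xp with an identity phi). With a single two-step experience, the batched reward is r0 + gamma * r1 and the batched discount is gamma ** 2:

import numpy as np

gamma = 0.9
experiences = [[
    {'state': np.zeros(3, dtype=np.float32), 'action': 0, 'reward': 1.0,
     'next_state': np.ones(3, dtype=np.float32), 'is_state_terminal': False,
     'next_action': None},
    {'state': np.ones(3, dtype=np.float32), 'action': 1, 'reward': 2.0,
     'next_state': np.zeros(3, dtype=np.float32), 'is_state_terminal': True,
     'next_action': None},
]]
batch = batch_experiences(experiences, xp=np, phi=lambda x: x, gamma=gamma)
# batch['state'] holds the first state of each experience and
# batch['next_state'] the last next_state; batch['reward'][0] is
# 1.0 + gamma * 2.0 and batch['discount'][0] is gamma ** 2. 'next_action'
# is omitted here because the experiences carry next_action=None.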
Example #6
    def update(self):
        xp = self.xp

        if self.standardize_advantages:
            all_advs = xp.array([b['adv'] for b in self.memory])
            mean_advs = xp.mean(all_advs)
            std_advs = xp.std(all_advs)

        target_model = copy.deepcopy(self.model)
        dataset_iter = chainer.iterators.SerialIterator(
            self.memory, self.minibatch_size)

        dataset_iter.reset()
        while dataset_iter.epoch < self.epochs:
            batch = dataset_iter.__next__()
            states = batch_states([b['state'] for b in batch], xp, self.phi)
            actions = xp.array([b['action'] for b in batch])
            distribs, vs_pred = self.model(states)
            with chainer.no_backprop_mode():
                target_distribs, _ = target_model(states)

            advs = xp.array([b['adv'] for b in batch], dtype=xp.float32)
            if self.standardize_advantages:
                advs = (advs - mean_advs) / std_advs

            self.optimizer.update(
                self._lossfun,
                distribs, vs_pred, distribs.log_prob(actions),
                vs_pred_old=xp.array(
                    [b['v_pred'] for b in batch], dtype=xp.float32),
                target_log_probs=target_distribs.log_prob(actions),
                advs=advs,
                vs_teacher=xp.array(
                    [b['v_teacher'] for b in batch], dtype=xp.float32),
                )
Example #7
    def act_and_train(self, obs, reward):
        if hasattr(self.model, 'obs_filter'):
            xp = self.xp
            b_state = batch_states([obs], xp, self.phi)
            self.model.obs_filter.experience(b_state)

        action, v = self._act(obs)

        # Update stats
        self.average_v += ((1 - self.average_v_decay) *
                           (v[0] - self.average_v))

        if self.last_state is not None:
            self.last_episode.append({
                'state': self.last_state,
                'action': self.last_action,
                'reward': reward,
                'v_pred': self.last_v,
                'next_state': obs,
                'next_v_pred': v,
                'nonterminal': 1.0
            })
        self.last_state = obs
        self.last_action = action
        self.last_v = v

        self._train()
        return action
Example #8
def batch_experiences(experiences, xp, phi, batch_states=batch_states):

    return {
        'state': batch_states(
            [elem['state'] for elem in experiences], xp, phi),
        'action': xp.asarray([elem['action'] for elem in experiences]),
        'reward': xp.asarray(
            [elem['reward'] for elem in experiences], dtype=np.float32),
        'next_state': batch_states(
            [elem['next_state'] for elem in experiences], xp, phi),
        'next_action': xp.asarray(
            [elem['next_action'] for elem in experiences]),
        'is_state_terminal': xp.asarray(
            [elem['is_state_terminal'] for elem in experiences],
            dtype=np.float32)}
Example #9
def sample_from_policy(env, model, obs_normalizer):
    xp = np
    phi = lambda x: x

    states = []
    actions = []

    obs = env.reset().astype('float32')  # Initial state
    states.append(obs)
    done = False
    while not done:
        b_state = batch_states([obs], xp, phi)
        b_state = obs_normalizer(b_state, update=False)

        with chainer.using_config('train', False), chainer.no_backprop_mode():
            action_distrib, _ = model(b_state)
            action = chainer.cuda.to_cpu(action_distrib.sample().array)[0]

        actions.append(action)

        new_obs, _, done, _ = env.step(action)
        obs = new_obs.astype('float32')

        if not done:
            states.append(obs)

    return states, actions
Example #10
def sample_probs_and_actions_from_policy(env,
                                         model,
                                         obs_normalizer,
                                         initial_state=None):
    xp = np
    phi = lambda x: x

    probs = []
    actions = []

    obs = env.reset().astype('float32')  # Initial state

    if initial_state is not None:
        env.state = initial_state
        obs = np.array(initial_state).astype('float32')

    done = False
    while not done:
        b_state = batch_states([obs], xp, phi)

        if obs_normalizer:
            b_state = obs_normalizer(b_state, update=False)

        with chainer.using_config('train', False), chainer.no_backprop_mode():
            action_distrib, _ = model(b_state)
            action = chainer.cuda.to_cpu(action_distrib.sample().array)[0]

        probs.append(action_distrib.all_prob.data[0][-1])
        actions.append(action)
        new_obs, _, done, _ = env.step(action)
        obs = new_obs.astype('float32')

    return probs, actions
Example #11
    def act_and_train(self, state, reward):

        xp = self.xp
        b_state = batch_states([state], xp, self.phi)

        if self.obs_normalizer:
            b_state = self.obs_normalizer(b_state, update=False)

        # action_distrib will be recomputed when computing gradients
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            action_distrib = self.policy(b_state)
            action = chainer.cuda.to_cpu(action_distrib.sample().data)[0]
            self.entropy_record.append(float(action_distrib.entropy.data))

        self.logger.debug('action_distrib: %s', action_distrib)
        self.logger.debug('action: %s', action)

        if self.last_state is not None:
            self.last_episode.append({
                'state': self.last_state,
                'action': self.last_action,
                'reward': reward,
                'next_state': state,
                'nonterminal': 1.0,
            })
        self.last_state = state
        self.last_action = action

        self._update_if_dataset_is_ready()

        return action
Example #12
    def _act(self, state):
        xp = self.xp
        with chainer.using_config('train', False):
            b_state = batch_states([state], xp, self.phi)
            with chainer.no_backprop_mode():
                action_distrib, v = self.model(b_state)
                action = action_distrib.sample()
            return cuda.to_cpu(action.data)[0], cuda.to_cpu(v.data)[0]
Example #13
    def act(self, state):
        xp = self.xp
        b_state = batch_states([state], xp, self.phi)
        if self.obs_normalizer:
            b_state = self.obs_normalizer(b_state, update=False)
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            action_distrib = self.policy(b_state)
            if self.act_deterministically:
                action = chainer.cuda.to_cpu(
                    action_distrib.most_probable.data)[0]
            else:
                action = chainer.cuda.to_cpu(action_distrib.sample().data)[0]
        return action
Example #14
    def _update_policy(self, dataset):
        """Update the policy using a given dataset.

        The policy is updated via CG and line search.
        """

        assert 'state' in dataset[0]
        assert 'action' in dataset[0]
        assert 'adv' in dataset[0]

        # Use full-batch
        xp = self.policy.xp
        states = batch_states([b['state'] for b in dataset], xp, self.phi)
        if self.obs_normalizer:
            states = self.obs_normalizer(states, update=False)
        actions = xp.array([b['action'] for b in dataset])
        advs = xp.array([b['adv'] for b in dataset], dtype=np.float32)
        if self.standardize_advantages:
            mean_advs = xp.mean(advs)
            std_advs = xp.std(advs)
            advs = (advs - mean_advs) / (std_advs + 1e-8)

        # Recompute action distributions for batch backprop
        action_distrib = self.policy(states)

        log_prob_old = xp.array(
            [transition['log_prob'] for transition in dataset],
            dtype=np.float32)

        gain = self._compute_gain(
            log_prob=action_distrib.log_prob(actions),
            log_prob_old=log_prob_old,
            entropy=action_distrib.entropy,
            advs=advs)

        # Distribution to compute KL div against
        action_distrib_old = action_distrib.copy()

        full_step = self._compute_kl_constrained_step(
            action_distrib=action_distrib,
            action_distrib_old=action_distrib_old,
            gain=gain)

        self._line_search(
            full_step=full_step,
            dataset=dataset,
            advs=advs,
            action_distrib_old=action_distrib_old,
            gain=gain)
Example #15
def batch_trajectory(trajectory, xp, phi, gamma, batch_states=batch_states):
    batch_tr = {
        'state':
        batch_states([elem['state'] for elem in trajectory], xp, phi),
        'action':
        np.asarray([elem['action'] for elem in trajectory], dtype=np.int32),
        'reward':
        np.asarray([elem['reward'] for elem in trajectory], dtype=np.float32),
        'is_state_terminal':
        np.asarray([elem['is_state_terminal'] for elem in trajectory],
                   dtype=np.float32),
        'embedding': [elem['embedding'] for elem in trajectory]
    }

    return batch_tr
Example #16
File: eva.py Project: dkuyoshi/SimpleEVA
    def backup_store_if_necessary(self, embedding, t):
        if self.model.lambdas == 0 or self.model.lambdas == 1:
            return
        if (t % self.periodic_steps
                == 0) and (self.t >= self.replay_buffer.capacity):
            self.replay_buffer.update_embedding()
            trajectories = self.replay_buffer.lookup(embedding)
            batch_trajectory = [{
                'state':
                batch_states([elem[0]['state']
                              for elem in traject], self.xp, self.phi),
                'action': [elem[0]['action'] for elem in traject],
                'reward': [elem[0]['reward'] for elem in traject],
                'embedding': [elem[0]['embedding'] for elem in traject]
            } for traject in trajectories]

            qnp, embeddings = self._trajectory_centric_planning(
                batch_trajectory)
            self.value_buffer.store(embeddings, qnp)
Example #17
    def _update_vf(self, dataset):
        """Update the value function using a given dataset.

        The value function is updated via SGD to minimize TD(lambda) errors.
        """

        xp = self.vf.xp

        assert 'state' in dataset[0]
        assert 'v_teacher' in dataset[0]

        dataset_iter = chainer.iterators.SerialIterator(
            dataset, self.vf_batch_size)

        while dataset_iter.epoch < self.vf_epochs:
            batch = dataset_iter.__next__()
            states = batch_states([b['state'] for b in batch], xp, self.phi)
            if self.obs_normalizer:
                states = self.obs_normalizer(states, update=False)
            vs_teacher = xp.array(
                [b['v_teacher'] for b in batch], dtype=xp.float32)
            vs_pred = self.vf(states)
            vf_loss = F.mean_squared_error(vs_pred, vs_teacher[..., None])
            self.vf_optimizer.update(lambda: vf_loss)
Example #18
    def _update_obs_normalizer(self, dataset):
        assert self.obs_normalizer
        states = batch_states(
            [b['state'] for b in dataset], self.obs_normalizer.xp, self.phi)
        self.obs_normalizer.experience(states)
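_update_obs_normalizer feeds raw states into the observation normalizer so that later calls such as obs_normalizer(states, update=False), seen in the snippets above, can standardize inputs with running statistics. A minimal sketch of such a normalizer in plain NumPy (an illustration of the interface used here, not ChainerRL's EmpiricalNormalization link):

import numpy as np


class RunningObsNormalizer:
    """Keeps running mean/variance of observations and standardizes batches."""

    def __init__(self, shape, eps=1e-2):
        self.xp = np  # the snippets above read obs_normalizer.xp
        self.count = 0
        self.mean = np.zeros(shape, dtype=np.float32)
        self.var = np.ones(shape, dtype=np.float32)
        self.eps = eps

    def experience(self, batch):
        # Welford-style update of the running mean and variance.
        for x in np.asarray(batch, dtype=np.float32):
            self.count += 1
            delta = x - self.mean
            self.mean += delta / self.count
            self.var += (delta * (x - self.mean) - self.var) / self.count

    def __call__(self, batch, update=True):
        if update:
            self.experience(batch)
        batch = np.asarray(batch, dtype=np.float32)
        return (batch - self.mean) / np.sqrt(self.var + self.eps)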