def sample_uniform(self, sequence_size):
    """
    Uniformly samples a sequence of successive frames of size `sequence_size` or less
    (~off-policy rollout).

    Args:
        sequence_size:  maximum sample size.

    Returns:
        instance of Rollout of size <= sequence_size.
    """
    start_pos = np.random.randint(0, self._history_size - sequence_size - 1)

    # Shift by one if we hit a terminal frame:
    if self._frames[start_pos]['terminal']:
        start_pos += 1  # assuming that there are no successive terminal frames

    sampled_rollout = Rollout()

    for i in range(sequence_size):
        frame = self._frames[start_pos + i]
        sampled_rollout.add(frame)
        if frame['terminal']:
            # It's ok to return fewer than `sequence_size` frames if a terminal frame is encountered:
            break

    return sampled_rollout
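# NOTE (illustrative sketch): the uniform sampling above picks a random start index into the frame
# history and truncates at episode boundaries. The toy `_ToyRollout` container and `frames` history
# below are stand-ins for illustration only, not the library's Rollout/Memory API.
import numpy as np

class _ToyRollout(list):
    # Minimal stand-in for the Rollout container:
    def add(self, frame):
        self.append(frame)

def _toy_sample_uniform(frames, sequence_size):
    # Pick a start index that leaves room for a full sequence:
    start = np.random.randint(0, len(frames) - sequence_size - 1)
    if frames[start]['terminal']:
        start += 1  # skip a terminal start, assuming no two terminals in a row
    rollout = _ToyRollout()
    for frame in frames[start:start + sequence_size]:
        rollout.add(frame)
        if frame['terminal']:
            break  # truncate at episode boundary
    return rollout

# Toy history of 10 frames with a single terminal frame in the middle:
_frames = [{'id': i, 'terminal': i == 4} for i in range(10)]
print([f['id'] for f in _toy_sample_uniform(_frames, 3)])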
def env_runner(sess, env, policy, task, rollout_length, summary_writer, episode_summary_freq,
               env_render_freq, atari_test, ep_summary, memory_config):
    """
    The logic of the thread runner. In brief, it constantly runs the policy and, once the rollout
    reaches a certain length, the thread runner appends all the collected data to the queue.

    Args:
        env:                    environment instance
        policy:                 policy instance
        task:                   int
        rollout_length:         int
        episode_summary_freq:   int
        env_render_freq:        int
        atari_test:             bool, Atari or BTgym
        ep_summary:             dict of tf.summary op and placeholders
        memory_config:          replay memory configuration dictionary

    Yields:
        collected data as a dictionary of on_policy, off_policy rollouts and episode statistics.
    """
    if memory_config is not None:
        memory = memory_config['class_ref'](**memory_config['kwargs'])

    else:
        memory = _DummyMemory()

    # Pass sample config to environment:
    last_state = env.reset(**policy.get_sample_config())
    last_context = policy.get_initial_features(state=last_state)
    length = 0
    local_episode = 0
    reward_sum = 0
    last_action = np.zeros(env.action_space.n)
    last_action[0] = 1
    last_reward = 0.0
    last_action_reward = np.concatenate([last_action, np.asarray([last_reward])], axis=-1)

    # Summary averages accumulators:
    total_r = []
    cpu_time = []
    final_value = []
    total_steps = []
    total_steps_atari = []

    ep_stat = None
    test_ep_stat = None
    render_stat = None

    while True:
        terminal_end = False
        rollout = Rollout()

        action, value_, context = policy.act(last_state, last_context, last_action_reward)

        # argmax to convert from one-hot:
        state, reward, terminal, info = env.step(action.argmax())

        # Partially collect first experience of rollout:
        last_experience = {
            'position': {'episode': local_episode, 'step': length},
            'state': last_state,
            'action': action,
            'reward': reward,
            'value': value_,
            'terminal': terminal,
            'context': last_context,
            'last_action_reward': last_action_reward,
        }
        # Execute user-defined callbacks to policy, if any:
        for key, callback in policy.callback.items():
            last_experience[key] = callback(**locals())

        length += 1
        reward_sum += reward
        last_state = state
        last_context = context
        last_action = action
        last_reward = reward
        last_action_reward = np.concatenate([last_action, np.asarray([last_reward])], axis=-1)

        for roll_step in range(1, rollout_length):
            if not terminal:
                # Continue adding experiences to rollout:
                action, value_, context = policy.act(last_state, last_context, last_action_reward)

                # Argmax to convert from one-hot:
                state, reward, terminal, info = env.step(action.argmax())

                # Partially collect next experience:
                experience = {
                    'position': {'episode': local_episode, 'step': length},
                    'state': last_state,
                    'action': action,
                    'reward': reward,
                    'value': value_,
                    'terminal': terminal,
                    'context': last_context,
                    'last_action_reward': last_action_reward,
                }
                for key, callback in policy.callback.items():
                    experience[key] = callback(**locals())

                # Bootstrap to complete and push previous experience:
                last_experience['r'] = value_
                rollout.add(last_experience)
                memory.add(last_experience)

                # Housekeeping:
                length += 1
                reward_sum += reward
                last_state = state
                last_context = context
                last_action = action
                last_reward = reward
                last_action_reward = np.concatenate([last_action, np.asarray([last_reward])], axis=-1)
                last_experience = experience

            if terminal:
                # Finished episode within last taken step.
                terminal_end = True

                # All environment-specific summaries are collected here, since only
                # the runner is allowed to interact with the environment.

                # Accumulate values for averaging:
                total_r += [reward_sum]
                total_steps_atari += [length]
                if not atari_test:
                    episode_stat = env.get_stat()  # get episode statistic
                    last_i = info[-1]  # pull most recent info
                    cpu_time += [episode_stat['runtime'].total_seconds()]
                    final_value += [last_i['broker_value']]
                    total_steps += [episode_stat['length']]

                # Episode statistics:
                try:
                    # Was it a test episode (`type` in metadata is non-zero)?
                    if not atari_test and state['metadata']['type']:
                        is_test_episode = True

                    else:
                        is_test_episode = False

                except KeyError:
                    is_test_episode = False

                if is_test_episode:
                    test_ep_stat = dict(
                        total_r=total_r[-1],
                        final_value=final_value[-1],
                        steps=total_steps[-1]
                    )
                else:
                    if local_episode % episode_summary_freq == 0:
                        if not atari_test:
                            # BTgym:
                            ep_stat = dict(
                                total_r=np.average(total_r),
                                cpu_time=np.average(cpu_time),
                                final_value=np.average(final_value),
                                steps=np.average(total_steps)
                            )
                        else:
                            # Atari:
                            ep_stat = dict(
                                total_r=np.average(total_r),
                                steps=np.average(total_steps_atari)
                            )
                        total_r = []
                        cpu_time = []
                        final_value = []
                        total_steps = []
                        total_steps_atari = []

                if task == 0 and local_episode % env_render_freq == 0:
                    if not atari_test:
                        # Render environment (chief worker only, and not in Atari test mode):
                        render_stat = {
                            mode: env.render(mode)[None, :] for mode in env.render_modes
                        }
                    else:
                        # Atari:
                        render_stat = dict(render_atari=state['external'][None, :] * 255)

                # New episode:
                last_state = env.reset(**policy.get_sample_config())
                last_context = policy.get_initial_features(state=last_state, context=last_context)
                length = 0
                reward_sum = 0
                last_action = np.zeros(env.action_space.n)
                last_action[0] = 1
                last_reward = 0.0
                last_action_reward = np.concatenate([last_action, np.asarray([last_reward])], axis=-1)

                # Increment global and local episode counts:
                sess.run(policy.inc_episode)
                local_episode += 1
                break

        # After rolling `rollout_length` steps or less (if `terminal` was encountered),
        # complete the final experience of the rollout:
        if not terminal_end:
            # Bootstrap:
            last_experience['r'] = np.asarray(
                [policy.get_value(last_state, last_context, last_action_reward)]
            )
        else:
            last_experience['r'] = np.asarray([0.0])

        rollout.add(last_experience)

        # Only training rollouts are added to replay memory:
        try:
            # Was it a test episode (`type` in metadata is non-zero)?
            if not atari_test and last_experience['state']['metadata']['type']:
                is_test = True

            else:
                is_test = False

        except KeyError:
            is_test = False

        if not is_test:
            memory.add(last_experience)

        # Once we have enough experience and the memory can be sampled, yield the collected data
        # and have the ThreadRunner place it on a queue:
        if memory.is_full():
            data = dict(
                on_policy=rollout,
                off_policy=memory.sample_uniform(sequence_size=rollout_length),
                off_policy_rp=memory.sample_priority(exact_size=True),
                ep_summary=ep_stat,
                test_ep_summary=test_ep_stat,
                render_summary=render_stat,
            )
            yield data

            ep_stat = None
            test_ep_stat = None
            render_stat = None
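# NOTE (illustrative sketch): the generator above is meant to be drained by a separate thread that
# puts each yielded data dict on a bounded queue for the trainer. The names `run_thread`,
# `rollout_queue` and `data_gen` below are assumptions for illustration, not the library's actual
# ThreadRunner API.
import queue
import threading

def run_thread(data_generator, rollout_queue):
    # Drain the generator onto a bounded queue; put() blocks when the trainer
    # falls behind, which throttles the runner thread.
    for data in data_generator:
        rollout_queue.put(data, block=True)

rollout_queue = queue.Queue(maxsize=5)
# data_gen = env_runner(sess, env, policy, task=0, ...)  # constructed elsewhere
# threading.Thread(target=run_thread, args=(data_gen, rollout_queue), daemon=True).start()
# data = rollout_queue.get()  # trainer side: dict with 'on_policy', 'off_policy', 'off_policy_rp', ...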
def env_runner(sess, env, policy, task, rollout_length, summary_writer, episode_summary_freq,
               env_render_freq, test, ep_summary):
    """
    The logic of the thread runner. In brief, it constantly runs the policy and, once the rollout
    reaches a certain length, the thread runner appends the rollout to the queue.

    Args:
        env:                    environment instance
        policy:                 policy instance
        task:                   int
        rollout_length:         int
        episode_summary_freq:   int
        env_render_freq:        int
        test:                   bool, Atari or BTgym
        ep_summary:             tf.summary

    Yields:
        rollout instance
    """
    last_state = env.reset()
    if not test:
        last_state = last_state['model_input']

    last_context = policy.get_initial_features()
    length = 0
    local_episode = 0
    rewards = 0
    last_action = np.zeros(env.action_space.n)
    last_action[0] = 1
    last_reward = 0.0
    last_action_reward = np.concatenate([last_action, np.asarray([last_reward])], axis=-1)

    # Summary averages accumulators:
    total_r = 0
    cpu_time = 0
    final_value = 0
    total_steps = 0
    total_steps_atari = 0

    while True:
        terminal_end = False
        rollout = Rollout()

        action, value_, context = policy.act(last_state, last_context, last_action_reward)

        # argmax to convert from one-hot:
        state, reward, terminal, info = env.step(action.argmax())

        # Partially collect first experience of rollout:
        last_experience = {
            'position': {'episode': local_episode, 'step': length},
            'state': last_state,
            'action': action,
            'reward': reward,
            'value': value_,
            'terminal': terminal,
            'context': last_context,
            'last_action_reward': last_action_reward,
        }
        # Execute user-defined callbacks to policy, if any:
        for key, callback in policy.callback.items():
            last_experience[key] = callback(**locals())

        length += 1
        rewards += reward
        last_state = state
        last_context = context
        last_action = action
        last_reward = reward
        last_action_reward = np.concatenate([last_action, np.asarray([last_reward])], axis=-1)

        for roll_step in range(1, rollout_length):
            if not terminal:
                # Continue adding experiences to rollout:
                action, value_, context = policy.act(last_state, last_context, last_action_reward)

                # Argmax to convert from one-hot:
                state, reward, terminal, info = env.step(action.argmax())

                # Partially collect next experience:
                experience = {
                    'position': {'episode': local_episode, 'step': length},
                    'state': last_state,
                    'action': action,
                    'reward': reward,
                    'value': value_,
                    'terminal': terminal,
                    'context': last_context,
                    'last_action_reward': last_action_reward,
                }
                for key, callback in policy.callback.items():
                    experience[key] = callback(**locals())

                # Bootstrap to complete and push previous experience:
                last_experience['r'] = value_
                rollout.add(last_experience)

                # Housekeeping:
                length += 1
                rewards += reward
                last_state = state
                last_context = context
                last_action = action
                last_reward = reward
                last_action_reward = np.concatenate([last_action, np.asarray([last_reward])], axis=-1)
                last_experience = experience

            if terminal:
                # Finished episode within last taken step.
                terminal_end = True

                # All environment-specific summaries are collected here, since only
                # the runner is allowed to interact with the environment.

                # Accumulate values for averaging:
                total_r += rewards
                total_steps_atari += length
                if not test:
                    episode_stat = env.get_stat()  # get episode statistic
                    last_i = info[0]  # pull most recent info
                    cpu_time += episode_stat['runtime'].total_seconds()
                    final_value += last_i['broker_value']
                    total_steps += episode_stat['length']

                # Episode statistic:
                if local_episode % episode_summary_freq == 0:
                    if not test:
                        # BTgym:
                        fetched_episode_stat = sess.run(
                            ep_summary['stat_op'],
                            feed_dict={
                                ep_summary['total_r_pl']: total_r / episode_summary_freq,
                                ep_summary['cpu_time_pl']: cpu_time / episode_summary_freq,
                                ep_summary['final_value_pl']: final_value / episode_summary_freq,
                                ep_summary['steps_pl']: total_steps / episode_summary_freq
                            }
                        )
                    else:
                        # Atari:
                        fetched_episode_stat = sess.run(
                            ep_summary['test_stat_op'],
                            feed_dict={
                                ep_summary['total_r_pl']: total_r / episode_summary_freq,
                                ep_summary['steps_pl']: total_steps_atari / episode_summary_freq
                            }
                        )
                    summary_writer.add_summary(fetched_episode_stat, sess.run(policy.global_episode))
                    summary_writer.flush()
                    total_r = 0
                    cpu_time = 0
                    final_value = 0
                    total_steps = 0
                    total_steps_atari = 0

                if task == 0 and local_episode % env_render_freq == 0:
                    if not test:
                        # Render environment (chief worker only, and not in Atari test mode):
                        renderings = sess.run(
                            ep_summary['render_op'],
                            feed_dict={
                                ep_summary['render_human_pl']: env.render('human')[None, :],
                                ep_summary['render_model_input_pl']: env.render('model_input')[None, :],
                                ep_summary['render_episode_pl']: env.render('episode')[None, :],
                            }
                        )
                    else:
                        # Atari:
                        renderings = sess.run(
                            ep_summary['test_render_op'],
                            feed_dict={
                                ep_summary['render_atari_pl']: state[None, :] * 255
                            }
                        )
                    summary_writer.add_summary(renderings, sess.run(policy.global_episode))
                    summary_writer.flush()

                # New episode:
                last_state = env.reset()
                last_context = policy.get_initial_features()
                length = 0
                rewards = 0
                last_action = np.zeros(env.action_space.n)
                last_action[0] = 1
                last_reward = 0.0
                last_action_reward = np.concatenate([last_action, np.asarray([last_reward])], axis=-1)

                # Increment global and local episode counts:
                sess.run(policy.inc_episode)
                local_episode += 1
                break

        # After rolling `rollout_length` steps or less (if `terminal` was encountered),
        # complete the final experience of the rollout:
        if not terminal_end:
            # Bootstrap:
            last_experience['r'] = np.asarray(
                [policy.get_value(last_state, last_context, last_action_reward)]
            )
        else:
            last_experience['r'] = np.asarray([0.0])

        rollout.add(last_experience)

        # Once we have enough experience, yield it, and have the ThreadRunner place it on a queue:
        yield rollout
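# NOTE (illustrative): both runners repeatedly rebuild `last_action_reward` by concatenating a
# one-hot action with the scalar last reward; this is the extra policy input fed as a[-1], r[-1].
# A minimal shape check, assuming a discrete action space of size n_actions:
import numpy as np

n_actions = 4  # stand-in for env.action_space.n
_last_action = np.zeros(n_actions)
_last_action[0] = 1  # one-hot of the last taken (or initial) action
_last_reward = 0.0
_last_action_reward = np.concatenate([_last_action, np.asarray([_last_reward])], axis=-1)
assert _last_action_reward.shape == (n_actions + 1,)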
def process(self, sess):
    """
    Grabs an on_policy_rollout that's been produced by the thread runner,
    samples off_policy rollout[s] from replay memory and updates the parameters.
    The update is then sent to the parameter server.
    """
    sess.run(self.sync)  # copy weights from shared to local

    # Get and process on_policy_rollout for A3C train step:
    on_policy_rollout = self.pull_batch_from_queue()
    on_policy_batch = on_policy_rollout.process(gamma=self.model_gamma, gae_lambda=self.model_gae_lambda)

    # Feeder for on-policy A3C loss estimation graph; passes LSTM context:
    feed_dict = {
        pl: value for pl, value in zip(
            self.local_network.a3c_lstm_state_pl_flatten,
            flatten_nested(on_policy_batch.features)
        )
    }
    feed_dict.update({
        self.local_network.a3c_state_in: on_policy_batch.si,
        self.local_network.a3c_a_r_in: on_policy_batch.last_ar,
        self.a3c_act_target: on_policy_batch.a,
        self.a3c_adv_target: on_policy_batch.adv,
        self.a3c_r_target: on_policy_batch.r,
        self.local_network.train_phase: True,
    })

    if self.use_off_policy_a3c or self.use_pixel_control or self.use_value_replay:
        # Get sample from replay memory:
        if self.use_rebalanced_replay:
            off_policy_sample = self.memory.sample_priority(
                self.replay_rollout_length,
                skewness=self.rebalance_skewness,
                exact_size=False
            )
        else:
            off_policy_sample = self.memory.sample_uniform(self.replay_rollout_length)

        off_policy_rollout = Rollout()
        off_policy_rollout.add_memory_sample(off_policy_sample)
        off_policy_batch = off_policy_rollout.process(gamma=self.model_gamma, gae_lambda=self.model_gae_lambda)

        # Feeder for off-policy A3C loss estimation graph:
        off_policy_feeder = {
            pl: value for pl, value in zip(
                self.local_network.off_a3c_lstm_state_pl_flatten,
                flatten_nested(off_policy_batch.features)
            )
        }
        off_policy_feeder.update({
            self.local_network.off_a3c_state_in: off_policy_batch.si,
            self.local_network.off_a3c_a_r_in: off_policy_batch.last_ar,
            self.off_policy_act_target: off_policy_batch.a,
            self.off_policy_adv_target: off_policy_batch.adv,
            self.off_policy_r_target: off_policy_batch.r,
        })
        feed_dict.update(off_policy_feeder)

    # Update with reward prediction subgraph:
    if self.use_reward_prediction:
        # Rebalanced 50/50 sample for RP:
        rp_sample = self.memory.sample_priority(self.rp_sequence_size, skewness=2, exact_size=True)
        feed_dict.update(self.process_rp(rp_sample))

    # Pixel control:
    if self.use_pixel_control:
        feed_dict.update(self.process_pc(off_policy_batch))

    # Value replay:
    if self.use_value_replay:
        feed_dict.update(self.process_vr(off_policy_batch))

    if self.use_memory:
        # Save on_policy_rollout to replay memory:
        self.memory.add_rollout(on_policy_rollout)

    # Every worker writes model summaries:
    should_compute_summary = self.local_steps % self.model_summary_freq == 0

    if should_compute_summary:
        fetches = [self.model_summary_op, self.train_op, self.global_step]
    else:
        fetches = [self.train_op, self.global_step]

    # And finally...
    fetched = sess.run(fetches, feed_dict=feed_dict)

    if should_compute_summary:
        self.summary_writer.add_summary(tf.Summary.FromString(fetched[0]), fetched[-1])
        self.summary_writer.flush()

    self.local_steps += 1
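# NOTE (illustrative sketch): `Rollout.process(gamma, gae_lambda)` above is expected to turn a rollout
# into a batch with advantage and return targets. The helper below sketches standard Generalized
# Advantage Estimation under that assumption; the library's actual implementation may differ in detail.
import numpy as np

def gae_advantages(rewards, values, value_next, gamma=0.99, gae_lambda=1.0):
    # Append bootstrap value V(s_T) and compute TD residuals:
    values_ext = np.append(values, value_next)
    deltas = rewards + gamma * values_ext[1:] - values_ext[:-1]
    # Discounted, lambda-weighted sum of residuals, computed backwards in time:
    advantages = np.zeros_like(deltas)
    gae = 0.0
    for t in reversed(range(len(deltas))):
        gae = deltas[t] + gamma * gae_lambda * gae
        advantages[t] = gae
    returns = advantages + values  # value targets ('r' in the batch)
    return advantages, returns

# Example: 3-step rollout with a single terminal reward and zero bootstrap value:
adv, ret = gae_advantages(np.array([0.0, 0.0, 1.0]), np.array([0.5, 0.6, 0.7]), value_next=0.0)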
def env_runner(sess, env, policy, task, num_local_steps, summary_writer, episode_summary_freq,
               env_render_freq, test, ep_summary):
    """
    The logic of the thread runner. In brief, it constantly runs the policy and, once the rollout
    reaches a certain length, the thread runner appends the rollout to the queue.
    """
    last_state = env.reset()
    if not test:
        last_state = last_state['model_input']

    last_features = policy.get_a3c_initial_features()
    length = 0
    local_episode = 0
    rewards = 0
    last_action = np.zeros(env.action_space.n)
    last_action[0] = 1
    last_reward = 0.0
    last_action_reward = np.concatenate([last_action, np.asarray([last_reward])], axis=-1)

    # Summary averages accumulators:
    total_r = 0
    cpu_time = 0
    final_value = 0
    total_steps = 0
    total_steps_atari = 0

    while True:
        terminal_end = False
        rollout = Rollout()

        # Partially collect first experience of rollout:
        action, value_, features = policy.a3c_act(last_state, last_features, last_action_reward)

        # argmax to convert from one-hot:
        state, reward, terminal, info = env.step(action.argmax())
        if not test:
            state = state['model_input']

        # Estimate `pixel_change`:
        pixel_change = policy.get_pc_target(state, last_state)

        # Collect the experience:
        frame_position = {'episode': local_episode, 'step': length}
        last_experience = dict(
            position=frame_position,
            state=last_state,
            action=action,
            reward=reward,
            value=value_,
            terminal=terminal,
            features=last_features,
            pixel_change=pixel_change,
            last_action_reward=last_action_reward,  # as a[-1]
        )
        length += 1
        rewards += reward
        last_state = state
        last_features = features
        last_action = action
        last_reward = reward
        last_action_reward = np.concatenate([last_action, np.asarray([last_reward])], axis=-1)

        for roll_step in range(1, num_local_steps):
            if not terminal:
                # Continue adding experiences to rollout:
                action, value_, features = policy.a3c_act(last_state, last_features, last_action_reward)

                # argmax to convert from one-hot:
                state, reward, terminal, info = env.step(action.argmax())
                if not test:
                    state = state['model_input']

                pixel_change = policy.get_pc_target(state, last_state)

                # Partially collect next experience:
                frame_position = {'episode': local_episode, 'step': length}
                experience = dict(
                    position=frame_position,
                    state=last_state,
                    action=action,
                    reward=reward,
                    value=value_,
                    terminal=terminal,
                    features=last_features,
                    pixel_change=pixel_change,
                    last_action_reward=last_action_reward,
                )
                # Complete and push previous experience:
                last_experience['value_next'] = value_
                rollout.add(**last_experience)

                # Housekeeping; note that last_action_reward is not refreshed here,
                # unlike in the newer runner above:
                length += 1
                rewards += reward
                last_state = state
                last_features = features
                last_action = action
                last_reward = reward
                last_experience = experience

            if terminal:
                # Finished episode within last taken step.
                terminal_end = True

                # All environment-related summaries are collected here, since only
                # the runner is allowed to interact with the environment.

                # Accumulate values for averaging:
                total_r += rewards
                total_steps_atari += length
                if not test:
                    episode_stat = env.get_stat()  # get episode statistic
                    last_i = info[0]  # pull most recent info
                    cpu_time += episode_stat['runtime'].total_seconds()
                    final_value += last_i['broker_value']
                    total_steps += episode_stat['length']

                # Episode statistic:
                if local_episode % episode_summary_freq == 0:
                    if not test:
                        # BTgym:
                        fetched_episode_stat = sess.run(
                            ep_summary['stat_op'],
                            feed_dict={
                                ep_summary['total_r_pl']: total_r / episode_summary_freq,
                                ep_summary['cpu_time_pl']: cpu_time / episode_summary_freq,
                                ep_summary['final_value_pl']: final_value / episode_summary_freq,
                                ep_summary['steps_pl']: total_steps / episode_summary_freq
                            }
                        )
                    else:
                        # Atari:
                        fetched_episode_stat = sess.run(
                            ep_summary['test_stat_op'],
                            feed_dict={
                                ep_summary['total_r_pl']: total_r / episode_summary_freq,
                                ep_summary['steps_pl']: total_steps_atari / episode_summary_freq
                            }
                        )
                    summary_writer.add_summary(fetched_episode_stat, sess.run(policy.global_episode))
                    summary_writer.flush()
                    total_r = 0
                    cpu_time = 0
                    final_value = 0
                    total_steps = 0
                    total_steps_atari = 0

                if task == 0 and local_episode % env_render_freq == 0:
                    if not test:
                        # Render environment (chief worker only, and not in Atari test mode):
                        renderings = sess.run(
                            ep_summary['render_op'],
                            feed_dict={
                                ep_summary['render_human_pl']: env.render('human')[None, :],
                                ep_summary['render_model_input_pl']: env.render('model_input')[None, :],
                                ep_summary['render_episode_pl']: env.render('episode')[None, :],
                            }
                        )
                    else:
                        # Atari:
                        renderings = sess.run(
                            ep_summary['test_render_op'],
                            feed_dict={
                                ep_summary['render_atari_pl']: state[None, :] * 255
                            }
                        )
                    summary_writer.add_summary(renderings, sess.run(policy.global_episode))
                    summary_writer.flush()

                # New episode:
                last_state = env.reset()
                if not test:
                    last_state = last_state['model_input']

                last_features = policy.get_a3c_initial_features()
                length = 0
                rewards = 0
                last_action = np.zeros(env.action_space.n)
                last_action[0] = 1
                last_reward = 0.0

                # Increment global and local episode counts:
                sess.run(policy.inc_episode)
                local_episode += 1
                break

        # After rolling `num_local_steps` or less (if `terminal` was encountered),
        # complete the final experience of the rollout:
        if not terminal_end:
            last_experience['value_next'] = np.asarray(
                [policy.get_a3c_value(last_state, last_features, last_action_reward)]
            )
        else:
            last_experience['value_next'] = np.asarray([0.0])

        rollout.add(**last_experience)

        # Once we have enough experience, yield it, and have the ThreadRunner place it on a queue:
        yield rollout
def process(self, sess):
    """
    Grabs an on_policy_rollout that's been produced by the thread runner,
    samples off_policy rollout[s] from replay memory and updates the parameters.
    The update is then sent to the parameter server.
    """
    # Copy weights from shared to local new_policy:
    sess.run(self.sync)

    # Get and process rollout for on-policy train step:
    on_policy_rollout = self.pull_batch_from_queue()
    on_policy_batch = on_policy_rollout.process(gamma=self.model_gamma, gae_lambda=self.model_gae_lambda)

    # Feeder for on-policy AAC loss estimation graph:
    feed_dict = {
        pl: value for pl, value in zip(
            self.local_network.on_lstm_state_pl_flatten,
            flatten_nested(on_policy_batch['context'])
        )
    }
    feed_dict.update({
        self.local_network.on_state_in: on_policy_batch['state'],
        self.local_network.on_a_r_in: on_policy_batch['last_action_reward'],
        self.on_pi_act_target: on_policy_batch['action'],
        self.on_pi_adv_target: on_policy_batch['advantage'],
        self.on_pi_r_target: on_policy_batch['r'],
        self.local_network.train_phase: True,
    })

    if self.use_off_policy_aac or self.use_pixel_control or self.use_value_replay:
        # Get sample from replay memory:
        if self.use_rebalanced_replay:
            off_policy_sample = self.memory.sample_priority(
                self.replay_rollout_length,
                skewness=self.rebalance_skewness,
                exact_size=False
            )
        else:
            off_policy_sample = self.memory.sample_uniform(self.replay_rollout_length)

        off_policy_rollout = Rollout()
        off_policy_rollout.add_memory_sample(off_policy_sample)
        off_policy_batch = off_policy_rollout.process(gamma=self.model_gamma, gae_lambda=self.model_gae_lambda)

        # Feeder for off-policy AAC loss estimation graph:
        off_policy_feeder = {
            pl: value for pl, value in zip(
                self.local_network.off_lstm_state_pl_flatten,
                flatten_nested(off_policy_batch['context'])
            )
        }
        off_policy_feeder.update({
            self.local_network.off_state_in: off_policy_batch['state'],
            self.local_network.off_a_r_in: off_policy_batch['last_action_reward'],
            self.off_pi_act_target: off_policy_batch['action'],
            self.off_pi_adv_target: off_policy_batch['advantage'],
            self.off_pi_r_target: off_policy_batch['r'],
        })
        feed_dict.update(off_policy_feeder)

    # Update with reward prediction subgraph:
    if self.use_reward_prediction:
        # Rebalanced 50/50 sample for RP:
        rp_sample = self.memory.sample_priority(self.rp_sequence_size, skewness=2, exact_size=True)
        feed_dict.update(self.process_rp(rp_sample))

    # Pixel control:
    if self.use_pixel_control:
        feed_dict.update(self.process_pc(off_policy_batch))

    # Value replay:
    if self.use_value_replay:
        feed_dict.update(self.process_vr(off_policy_batch))

    if self.use_memory:
        # Save on_policy_rollout to replay memory:
        self.memory.add_rollout(on_policy_rollout)

    # Every worker writes model summaries:
    should_compute_summary = self.local_steps % self.model_summary_freq == 0

    if should_compute_summary:
        fetches = [self.train_op, self.model_summary_op, self.inc_step]
    else:
        fetches = [self.train_op, self.inc_step]

    fetched = sess.run(fetches, feed_dict=feed_dict)

    if should_compute_summary:
        self.summary_writer.add_summary(tf.Summary.FromString(fetched[-2]), fetched[-1])
        self.summary_writer.flush()

    self.local_steps += 1
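# NOTE (illustrative sketch): the feeders above zip a flat list of LSTM state placeholders with
# `flatten_nested(batch['context'])`. A depth-first flattening helper along the lines sketched
# below would produce a matching flat list of leaves; this is an assumption about its behaviour,
# not the library's actual implementation.
def _flatten_nested_sketch(struct):
    # Recursively flatten nested dicts/lists/tuples into a flat list of leaf values:
    if isinstance(struct, dict):
        items = struct.values()
    elif isinstance(struct, (list, tuple)):
        items = struct
    else:
        return [struct]
    flat = []
    for item in items:
        flat.extend(_flatten_nested_sketch(item))
    return flat

# E.g. an LSTM context of (c, h) pairs per layer flattens to [c0, h0, c1, h1]:
assert _flatten_nested_sketch(((1, 2), (3, 4))) == [1, 2, 3, 4]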
def _sample_priority(self, size=None, exact_size=False, skewness=2, sample_attempts=100):
    """
    Implements rebalanced replay. Samples a sequence of successive frames from a distribution
    skewed by the reward of the last sampled frame.

    Args:
        size:               sample size, must be <= self.max_sample_size;
        exact_size:         whether to accept a sample of size less than `size`, or re-sample to get
                            a sample of exact size (used for the reward prediction task);
        skewness:           int >= 1, sampling probability denominator, such that the probability of
                            sampling a sequence whose last frame has non-zero reward is
                            p[non_zero] = 1 / skewness;
        sample_attempts:    if exact_size=True, sets the number of re-sampling attempts to get a sample
                            of continuous experiences (no `terminal` frames inside except the last one);
                            if this number is reached, the sample is returned 'as is'.

    Returns:
        instance of Rollout().
    """
    if size is None:
        size = self.priority_sample_size

    if size > self.max_sample_size:
        size = self.max_sample_size

    # Toss skewed coin:
    if np.random.randint(int(skewness)) == 0:
        from_zero = False

    else:
        from_zero = True

    if len(self._zero_reward_indices) == 0:
        # Zero-reward container was empty:
        from_zero = False

    elif len(self._non_zero_reward_indices) == 0:
        # Non-zero-reward container was empty:
        from_zero = True

    # Try to sample a sequence of the given length from one episode.
    # Take a maximum of `sample_attempts`; if no luck (e.g. too short episodes and/or
    # too big a sampling size) -> return an inconsistent sample and issue a warning.
    check_sequence = True
    for attempt in range(sample_attempts):
        if from_zero:
            index = np.random.randint(len(self._zero_reward_indices))
            end_frame_index = self._zero_reward_indices[index]

        else:
            index = np.random.randint(len(self._non_zero_reward_indices))
            end_frame_index = self._non_zero_reward_indices[index]

        start_frame_index = end_frame_index - size + 1
        raw_start_frame_index = start_frame_index - self._top_frame_index

        sampled_rollout = Rollout()
        is_full = True

        if attempt == sample_attempts - 1:
            check_sequence = False
            self.log.warning(
                'Memory_{}: failed to sample {} successive frames, sampled as is.'.format(self.task, size)
            )

        for i in range(size - 1):
            frame = self._frames[raw_start_frame_index + i]
            sampled_rollout.add(frame)
            if check_sequence:
                if frame['terminal']:
                    if exact_size:
                        is_full = False
                    break

        # Last frame can be terminal anyway:
        frame = self._frames[raw_start_frame_index + size - 1]
        sampled_rollout.add(frame)

        if is_full:
            break

    return sampled_rollout
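# NOTE (illustrative): the 'skewed coin' above selects a sequence ending in a non-zero-reward frame
# with probability 1/skewness and a zero-reward ending otherwise. A quick empirical check of that
# skew; names below are for illustration only:
import numpy as np

def _toss_skewed_coin(skewness=2):
    # True -> sample a sequence ending with zero reward (the `from_zero` branch):
    return np.random.randint(int(skewness)) != 0

_skewness = 4
_draws = [_toss_skewed_coin(_skewness) for _ in range(100000)]
# Fraction of non-zero-reward endings should be close to 1/skewness:
print('p(non_zero) ~ {:.3f}, expected {:.3f}'.format(1.0 - np.mean(_draws), 1.0 / _skewness))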