Example #1
    def sample_uniform(self, sequence_size):
        """
        Uniformly samples a sequence of up to `sequence_size` successive frames (~off-policy rollout).

        Args:
            sequence_size:  maximum sample size.
        Returns:
            instance of Rollout of size <= sequence_size.
        """
        start_pos = np.random.randint(0,
                                      self._history_size - sequence_size - 1)
        # Shift by one if hit terminal frame:
        if self._frames[start_pos]['terminal']:
            start_pos += 1  # assuming that there are no successive terminal frames.

        sampled_rollout = Rollout()

        for i in range(sequence_size):
            frame = self._frames[start_pos + i]
            sampled_rollout.add(frame)
            if frame['terminal']:
                break  # it's ok to return less than `sequence_size` frames if `terminal` frame encountered.

        return sampled_rollout
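
The `Rollout` container itself is not shown in these examples. As a rough, hypothetical sketch consistent with how it is used in Examples #1-#3 (dict-like frames appended via `add`; the library's actual class may differ):

class RolloutSketch(dict):
    """Accumulates experience frames as lists keyed by frame fields."""

    def add(self, frame):
        # `frame` is a dict of experience fields (state, action, reward, terminal, ...).
        for key, value in frame.items():
            self.setdefault(key, []).append(value)

    def __len__(self):
        # Every per-key list has the same length; report it via any one of them.
        return len(next(iter(self.values()))) if self else 0

# Usage consistent with sample_uniform() above:
rollout = RolloutSketch()
rollout.add({'state': 0, 'reward': 0.0, 'terminal': False})
rollout.add({'state': 1, 'reward': 1.0, 'terminal': True})
assert len(rollout) == 2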
Example #2
def env_runner(sess, env, policy, task, rollout_length, summary_writer,
               episode_summary_freq, env_render_freq, atari_test, ep_summary,
               memory_config):
    """
    The logic of the thread runner.
    In brief, it constantly keeps on running
    the policy, and as long as the rollout exceeds a certain length, the thread
    runner appends all the collected data to the queue.

    Args:
        env:                    environment instance
        policy:                 policy instance
        task:                   int
        rollout_length:         int
        episode_summary_freq:   int
        env_render_freq:        int
        atari_test:             bool, Atari or BTGyn
        ep_summary:             dict of tf.summary op and placeholders
        memory_config:          replay memory configuration dictionary

    Yelds:
        collected data as dictionary of on_policy, off_policy rollouts and episode statistics.
    """
    if memory_config is not None:
        memory = memory_config['class_ref'](**memory_config['kwargs'])

    else:
        memory = _DummyMemory()
    # Pass sample config to environment:
    last_state = env.reset(**policy.get_sample_config())
    last_context = policy.get_initial_features(state=last_state)
    length = 0
    local_episode = 0
    reward_sum = 0
    last_action = np.zeros(env.action_space.n)
    last_action[0] = 1
    last_reward = 0.0
    last_action_reward = np.concatenate(
        [last_action, np.asarray([last_reward])], axis=-1)

    # Summary averages accumulators:
    total_r = []
    cpu_time = []
    final_value = []
    total_steps = []
    total_steps_atari = []

    ep_stat = None
    test_ep_stat = None
    render_stat = None

    while True:
        terminal_end = False
        rollout = Rollout()

        action, value_, context = policy.act(last_state, last_context,
                                             last_action_reward)

        # argmax to convert from one-hot:
        state, reward, terminal, info = env.step(action.argmax())

        # Partially collect first experience of rollout:
        last_experience = {
            'position': {
                'episode': local_episode,
                'step': length
            },
            'state': last_state,
            'action': action,
            'reward': reward,
            'value': value_,
            'terminal': terminal,
            'context': last_context,
            'last_action_reward': last_action_reward,
        }
        # Execute user-defined callbacks to policy, if any:
        for key, callback in policy.callback.items():
            last_experience[key] = callback(**locals())

        length += 1
        reward_sum += reward
        last_state = state
        last_context = context
        last_action = action
        last_reward = reward
        last_action_reward = np.concatenate(
            [last_action, np.asarray([last_reward])], axis=-1)

        for roll_step in range(1, rollout_length):
            if not terminal:
                # Continue adding experiences to rollout:
                action, value_, context = policy.act(last_state, last_context,
                                                     last_action_reward)

                # Argmax to convert from one-hot:
                state, reward, terminal, info = env.step(action.argmax())
                #if not atari_test:
                #        state = state['model_input']

                # Partially collect next experience:
                experience = {
                    'position': {
                        'episode': local_episode,
                        'step': length
                    },
                    'state': last_state,
                    'action': action,
                    'reward': reward,
                    'value': value_,
                    'terminal': terminal,
                    'context': last_context,
                    'last_action_reward': last_action_reward,
                    #'pixel_change': 0 #policy.get_pc_target(state, last_state),
                }
                for key, callback in policy.callback.items():
                    experience[key] = callback(**locals())

                # Bootstrap to complete and push previous experience:
                last_experience['r'] = value_
                rollout.add(last_experience)
                memory.add(last_experience)

                # Housekeeping:
                length += 1
                reward_sum += reward
                last_state = state
                last_context = context
                last_action = action
                last_reward = reward
                last_action_reward = np.concatenate(
                    [last_action, np.asarray([last_reward])], axis=-1)
                last_experience = experience

            if terminal:
                # Finished episode within last taken step:
                terminal_end = True
                # All environment-specific summaries are computed here because
                # only the runner is allowed to interact with the environment.
                # Accumulate values for averaging:
                total_r += [reward_sum]
                total_steps_atari += [length]
                if not atari_test:
                    episode_stat = env.get_stat()  # get episode statistic
                    last_i = info[-1]  # pull most recent info
                    cpu_time += [episode_stat['runtime'].total_seconds()]
                    final_value += [last_i['broker_value']]
                    total_steps += [episode_stat['length']]
                #print('last_episode.metadata:', state['metadata'])

                # Episode statistics:
                try:
                    # Was it a test episode (`type` in metadata is non-zero)?
                    if not atari_test and state['metadata']['type']:
                        is_test_episode = True

                    else:
                        is_test_episode = False

                except KeyError:
                    is_test_episode = False

                if is_test_episode:
                    #print(task, total_r)
                    test_ep_stat = dict(total_r=total_r[-1],
                                        final_value=final_value[-1],
                                        steps=total_steps[-1])
                else:
                    if local_episode % episode_summary_freq == 0:
                        if not atari_test:
                            # BTgym:
                            ep_stat = dict(total_r=np.average(total_r),
                                           cpu_time=np.average(cpu_time),
                                           final_value=np.average(final_value),
                                           steps=np.average(total_steps))
                        else:
                            # Atari:
                            ep_stat = dict(total_r=np.average(total_r),
                                           steps=np.average(total_steps_atari))
                        total_r = []
                        cpu_time = []
                        final_value = []
                        total_steps = []
                        total_steps_atari = []

                if task == 0 and local_episode % env_render_freq == 0:
                    if not atari_test:
                        # Render environment (chief worker only, and not in Atari test mode):
                        render_stat = {
                            mode: env.render(mode)[None, :]
                            for mode in env.render_modes
                        }
                    else:
                        # Atari:
                        render_stat = dict(
                            render_atari=state['external'][None, :] * 255)

                # New episode:
                last_state = env.reset(**policy.get_sample_config())
                last_context = policy.get_initial_features(
                    state=last_state, context=last_context)
                length = 0
                reward_sum = 0
                last_action = np.zeros(env.action_space.n)
                last_action[0] = 1
                last_reward = 0.0
                last_action_reward = np.concatenate(
                    [last_action, np.asarray([last_reward])], axis=-1)

                # Increment global and local episode counts:
                sess.run(policy.inc_episode)
                local_episode += 1
                break

        # After rolling `rollout_length` or less (if got `terminal`)
        # complete final experience of the rollout:
        if not terminal_end:
            # Bootstrap:
            last_experience['r'] = np.asarray([
                policy.get_value(last_state, last_context, last_action_reward)
            ])

        else:
            last_experience['r'] = np.asarray([0.0])

        rollout.add(last_experience)

        # Only training rollouts are added to replay memory:
        try:
            # Was it a test episode (`type` in metadata is non-zero)?
            if not atari_test and last_experience['state']['metadata']['type']:
                is_test = True

            else:
                is_test = False

        except KeyError:
            is_test = False

        if not is_test:
            memory.add(last_experience)

        # Once we have enough experience and memory can be sampled, yield it,
        # and have the ThreadRunner place it on a queue:
        if memory.is_full():
            data = dict(
                on_policy=rollout,
                off_policy=memory.sample_uniform(sequence_size=rollout_length),
                off_policy_rp=memory.sample_priority(exact_size=True),
                ep_summary=ep_stat,
                test_ep_summary=test_ep_stat,
                render_summary=render_stat,
            )
            yield data

            ep_stat = None
            test_ep_stat = None
            render_stat = None
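
For context, a minimal sketch of how a thread runner might drain the env_runner generator above onto a bounded queue (a hypothetical wrapper; the framework's actual runner thread is not shown here):

import queue
import threading

class RunnerThreadSketch(threading.Thread):
    def __init__(self, runner_generator, max_queue_size=5):
        super().__init__(daemon=True)
        self.runner_generator = runner_generator  # e.g. the env_runner(...) generator
        self.queue = queue.Queue(maxsize=max_queue_size)

    def run(self):
        for data in self.runner_generator:
            # Blocks while the queue is full, throttling data collection
            # until the training process pulls the next batch.
            self.queue.put(data, block=True)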
Example #3
def env_runner(sess, env, policy, task, rollout_length, summary_writer,
               episode_summary_freq, env_render_freq, test, ep_summary):
    """The logic of the thread runner.
    In brief, it constantly keeps on running
    the policy, and as long as the rollout exceeds a certain length, the thread
    runner appends the rollout to the queue.

    Args:
        env:                    environment instance
        policy:                 policy instance
        task:                   int
        rollout_length:         int
        episode_summary_freq:   int
        env_render_freq:        int
        test:                   Atari or BTGyn
        ep_summary:             tf.summary

    Yelds:
        rollout instance
    """
    last_state = env.reset()
    if not test:
        last_state = last_state['model_input']

    last_context = policy.get_initial_features()
    length = 0
    local_episode = 0
    rewards = 0
    last_action = np.zeros(env.action_space.n)
    last_action[0] = 1
    last_reward = 0.0
    last_action_reward = np.concatenate(
        [last_action, np.asarray([last_reward])], axis=-1)

    # Summary averages accumulators:
    total_r = 0
    cpu_time = 0
    final_value = 0
    total_steps = 0
    total_steps_atari = 0

    while True:
        terminal_end = False
        rollout = Rollout()

        action, value_, context = policy.act(last_state, last_context,
                                             last_action_reward)

        # argmax to convert from one-hot:
        state, reward, terminal, info = env.step(action.argmax())
        #if not test:
        #    state = state['model_input']

        # Partially collect first experience of rollout:
        last_experience = {
            'position': {
                'episode': local_episode,
                'step': length
            },
            'state': last_state,
            'action': action,
            'reward': reward,
            'value': value_,
            'terminal': terminal,
            'context': last_context,
            'last_action_reward': last_action_reward,
            #'pixel_change': 0 #policy.get_pc_target(state, last_state),
        }
        # Execute user-defined callbacks to policy, if any:
        for key, callback in policy.callback.items():
            last_experience[key] = callback(**locals())

        length += 1
        rewards += reward
        last_state = state
        last_context = context
        last_action = action
        last_reward = reward
        last_action_reward = np.concatenate(
            [last_action, np.asarray([last_reward])], axis=-1)

        for roll_step in range(1, rollout_length):
            if not terminal:
                # Continue adding experiences to rollout:
                action, value_, context = policy.act(last_state, last_context,
                                                     last_action_reward)

                # Argmax to convert from one-hot:
                state, reward, terminal, info = env.step(action.argmax())
                #if not test:
                #        state = state['model_input']

                # Partially collect next experience:
                experience = {
                    'position': {
                        'episode': local_episode,
                        'step': length
                    },
                    'state': last_state,
                    'action': action,
                    'reward': reward,
                    'value': value_,
                    'terminal': terminal,
                    'context': last_context,
                    'last_action_reward': last_action_reward,
                    #'pixel_change': 0 #policy.get_pc_target(state, last_state),
                }
                for key, callback in policy.callback.items():
                    experience[key] = callback(**locals())

                # Bootstrap to complete and push previous experience:
                last_experience['r'] = value_
                rollout.add(last_experience)

                # Housekeeping:
                length += 1
                rewards += reward
                last_state = state
                last_context = context
                last_action = action
                last_reward = reward
                last_action_reward = np.concatenate(
                    [last_action, np.asarray([last_reward])], axis=-1)
                last_experience = experience

            if terminal:
                # Finished episode within last taken step:
                terminal_end = True
                # All environment-specific summaries are computed here because
                # only the runner is allowed to interact with the environment.
                # Accumulate values for averaging:
                total_r += rewards
                total_steps_atari += length
                if not test:
                    episode_stat = env.get_stat()  # get episode statistic
                    last_i = info[0]  # pull most recent info
                    cpu_time += episode_stat['runtime'].total_seconds()
                    final_value += last_i['broker_value']
                    total_steps += episode_stat['length']

                # Episode statistic:
                if local_episode % episode_summary_freq == 0:
                    if not test:
                        # BTgym:
                        fetched_episode_stat = sess.run(
                            ep_summary['stat_op'],
                            feed_dict={
                                ep_summary['total_r_pl']:
                                total_r / episode_summary_freq,
                                ep_summary['cpu_time_pl']:
                                cpu_time / episode_summary_freq,
                                ep_summary['final_value_pl']:
                                final_value / episode_summary_freq,
                                ep_summary['steps_pl']:
                                total_steps / episode_summary_freq
                            })
                    else:
                        # Atari:
                        fetched_episode_stat = sess.run(
                            ep_summary['test_stat_op'],
                            feed_dict={
                                ep_summary['total_r_pl']:
                                total_r / episode_summary_freq,
                                ep_summary['steps_pl']:
                                total_steps_atari / episode_summary_freq
                            })
                    summary_writer.add_summary(fetched_episode_stat,
                                               sess.run(policy.global_episode))
                    summary_writer.flush()
                    total_r = 0
                    cpu_time = 0
                    final_value = 0
                    total_steps = 0
                    total_steps_atari = 0

                if task == 0 and local_episode % env_render_freq == 0:
                    if not test:
                        # Render environment (chief worker only, and not in atari test mode):
                        renderings = sess.run(
                            ep_summary['render_op'],
                            feed_dict={
                                ep_summary['render_human_pl']:
                                env.render('human')[None, :],
                                ep_summary['render_model_input_pl']:
                                env.render('model_input')[None, :],
                                ep_summary['render_episode_pl']:
                                env.render('episode')[None, :],
                            })
                    else:
                        # Atari:
                        renderings = sess.run(
                            ep_summary['test_render_op'],
                            feed_dict={
                                ep_summary['render_atari_pl']:
                                state[None, :] * 255
                            })

                    summary_writer.add_summary(renderings,
                                               sess.run(policy.global_episode))
                    summary_writer.flush()

                # New episode:
                last_state = env.reset()
                #if not test:
                #    last_state = last_state['model_input']

                last_context = policy.get_initial_features()
                length = 0
                rewards = 0
                last_action = np.zeros(env.action_space.n)
                last_action[0] = 1
                last_reward = 0.0
                last_action_reward = np.concatenate(
                    [last_action, np.asarray([last_reward])], axis=-1)

                # Increment global and local episode counts:
                sess.run(policy.inc_episode)
                local_episode += 1
                break

        # After rolling `rollout_length` or less (if got `terminal`)
        # complete final experience of the rollout:
        if not terminal_end:
            # Bootstrap:
            last_experience['r'] = np.asarray([
                policy.get_value(last_state, last_context, last_action_reward)
            ])

        else:
            last_experience['r'] = np.asarray([0.0])

        rollout.add(last_experience)

        # Once we have enough experience, yield it, and have the ThreadRunner place it on a queue:
        yield rollout
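
The structure of the `ep_summary` dict is only implied by its usage above. A hedged sketch of how such a dict could be assembled, assuming the TF 1.x graph API used throughout these examples (key names taken from the feed_dicts above; the render placeholders and ops are omitted for brevity):

import tensorflow as tf

def make_ep_summary_sketch():
    ep_summary = {
        'total_r_pl': tf.placeholder(tf.float32, name='total_r'),
        'cpu_time_pl': tf.placeholder(tf.float32, name='cpu_time'),
        'final_value_pl': tf.placeholder(tf.float32, name='final_value'),
        'steps_pl': tf.placeholder(tf.float32, name='steps'),
    }
    # BTgym episode statistics:
    ep_summary['stat_op'] = tf.summary.merge([
        tf.summary.scalar('episode/total_reward', ep_summary['total_r_pl']),
        tf.summary.scalar('episode/cpu_time_sec', ep_summary['cpu_time_pl']),
        tf.summary.scalar('episode/final_value', ep_summary['final_value_pl']),
        tf.summary.scalar('episode/steps', ep_summary['steps_pl']),
    ])
    # Atari test episode statistics:
    ep_summary['test_stat_op'] = tf.summary.merge([
        tf.summary.scalar('test_episode/total_reward', ep_summary['total_r_pl']),
        tf.summary.scalar('test_episode/steps', ep_summary['steps_pl']),
    ])
    return ep_summary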
Example #4
    def process(self, sess):
        """
        Grabs a on_policy_rollout that's been produced by the thread runner,
        samples off_policy rollout[s] from replay memory and updates the parameters.
        The update is then sent to the parameter server.
        """
        sess.run(self.sync)  # copy weights from shared to local

        # Get and process on_policy_rollout for A3C train step:
        on_policy_rollout = self.pull_batch_from_queue()
        on_policy_batch = on_policy_rollout.process(
            gamma=self.model_gamma, gae_lambda=self.model_gae_lambda)

        # Feeder for on-policy A3C loss estimation graph:
        feed_dict = {
            pl: value
            for pl, value in zip(self.local_network.a3c_lstm_state_pl_flatten,
                                 flatten_nested(on_policy_batch.features))
        }  # ..passes lstm context
        feed_dict.update({
            self.local_network.a3c_state_in: on_policy_batch.si,
            self.local_network.a3c_a_r_in: on_policy_batch.last_ar,
            self.a3c_act_target: on_policy_batch.a,
            self.a3c_adv_target: on_policy_batch.adv,
            self.a3c_r_target: on_policy_batch.r,
            self.local_network.train_phase: True,
        })

        if self.use_off_policy_a3c or self.use_pixel_control or self.use_value_replay:
            # Get sample from replay memory:
            if self.use_rebalanced_replay:
                off_policy_sample = self.memory.sample_priority(
                    self.replay_rollout_length,
                    skewness=self.rebalance_skewness,
                    exact_size=False)
            else:
                off_policy_sample = self.memory.sample_uniform(
                    self.replay_rollout_length)

            off_policy_rollout = Rollout()
            off_policy_rollout.add_memory_sample(off_policy_sample)
            off_policy_batch = off_policy_rollout.process(
                gamma=self.model_gamma, gae_lambda=self.model_gae_lambda)

            # Feeder for off-policy A3C loss estimation graph:
            off_policy_feeder = {
                pl: value
                for pl, value in zip(
                    self.local_network.off_a3c_lstm_state_pl_flatten,
                    flatten_nested(off_policy_batch.features))
            }
            off_policy_feeder.update({
                self.local_network.off_a3c_state_in:
                off_policy_batch.si,
                self.local_network.off_a3c_a_r_in:
                off_policy_batch.last_ar,
                self.off_policy_act_target:
                off_policy_batch.a,
                self.off_policy_adv_target:
                off_policy_batch.adv,
                self.off_policy_r_target:
                off_policy_batch.r,
            })
            feed_dict.update(off_policy_feeder)

        # Update with reward prediction subgraph:
        if self.use_reward_prediction:
            # Rebalanced 50/50 sample for RP:
            rp_sample = self.memory.sample_priority(self.rp_sequence_size,
                                                    skewness=2,
                                                    exact_size=True)
            feed_dict.update(self.process_rp(rp_sample))

        # Pixel control ...
        if self.use_pixel_control:
            feed_dict.update(self.process_pc(off_policy_batch))

        # VR...
        if self.use_value_replay:
            feed_dict.update(self.process_vr(off_policy_batch))

        if self.use_memory:
            # Save on_policy_rollout to replay memory:
            self.memory.add_rollout(on_policy_rollout)

        # Every worker writes model summaries:
        should_compute_summary =\
            self.local_steps % self.model_summary_freq == 0   # self.task == 0 and

        if should_compute_summary:
            fetches = [self.model_summary_op, self.train_op, self.global_step]
        else:
            fetches = [self.train_op, self.global_step]

        # And finally...
        fetched = sess.run(fetches, feed_dict=feed_dict)

        if should_compute_summary:
            self.summary_writer.add_summary(tf.Summary.FromString(fetched[0]),
                                            fetched[-1])
            self.summary_writer.flush()

        self.local_steps += 1
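
Rollout.process(gamma, gae_lambda) is assumed above to produce the advantage and return targets fed into the loss graph. A minimal sketch of a standard GAE computation such a method presumably performs (an assumption; the framework's exact implementation is not shown), where `bootstrap_value` corresponds to the final `r` bootstrap set by the runner:

import numpy as np

def gae_sketch(rewards, values, bootstrap_value, gamma, gae_lambda):
    """Returns (advantages, discounted_returns) for one rollout."""
    rewards = np.asarray(rewards, dtype=np.float64)
    values_ext = np.append(np.asarray(values, dtype=np.float64), bootstrap_value)
    # TD residuals: delta_t = r_t + gamma * V_{t+1} - V_t
    deltas = rewards + gamma * values_ext[1:] - values_ext[:-1]
    advantages = np.zeros_like(rewards)
    returns = np.zeros_like(rewards)
    gae, ret = 0.0, float(bootstrap_value)
    for t in reversed(range(len(rewards))):
        gae = deltas[t] + gamma * gae_lambda * gae      # GAE recursion
        ret = rewards[t] + gamma * ret                  # bootstrapped return
        advantages[t] = gae
        returns[t] = ret
    return advantages, returns

# Example:
adv, ret = gae_sketch(rewards=[0.0, 0.0, 1.0], values=[0.5, 0.4, 0.3],
                      bootstrap_value=0.0, gamma=0.99, gae_lambda=0.95)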
Example #5
def env_runner(sess, env, policy, task, num_local_steps, summary_writer,
               episode_summary_freq, env_render_freq, test, ep_summary):
    """
    The logic of the thread runner.  In brief, it constantly keeps on running
    the policy, and as long as the rollout exceeds a certain length, the thread
    runner appends the rollout to the queue.
    """
    last_state = env.reset()
    if not test:
        last_state = last_state['model_input']

    last_features = policy.get_a3c_initial_features()
    length = 0
    local_episode = 0
    rewards = 0
    last_action = np.zeros(env.action_space.n)
    last_action[0] = 1
    last_reward = 0.0
    last_action_reward = np.concatenate(
        [last_action, np.asarray([last_reward])], axis=-1)

    # Summary averages accumulators:
    total_r = 0
    cpu_time = 0
    final_value = 0
    total_steps = 0
    total_steps_atari = 0

    while True:
        terminal_end = False
        rollout = Rollout()

        # Partially collect first experience of rollout:
        action, value_, features = policy.a3c_act(last_state, last_features,
                                                  last_action_reward)

        # argmax to convert from one-hot:
        state, reward, terminal, info = env.step(action.argmax())
        if not test:
            state = state['model_input']
        # Estimate `pixel_change`:
        pixel_change = policy.get_pc_target(state, last_state)

        # Collect the experience:
        frame_position = {'episode': local_episode, 'step': length}
        last_experience = dict(
            position=frame_position,
            state=last_state,
            action=action,
            reward=reward,
            value=value_,
            terminal=terminal,
            features=last_features,
            pixel_change=pixel_change,
            last_action_reward=last_action_reward,  # as a[-1]
        )
        length += 1
        rewards += reward
        last_state = state
        last_features = features
        last_action = action
        last_reward = reward
        last_action_reward = np.concatenate(
            [last_action, np.asarray([last_reward])], axis=-1)

        for roll_step in range(1, num_local_steps):
            if not terminal:
                # Continue adding experiences to rollout:
                action, value_, features = policy.a3c_act(
                    last_state, last_features, last_action_reward)

                # argmax to convert from one-hot:
                state, reward, terminal, info = env.step(action.argmax())
                if not test:
                    state = state['model_input']
                pixel_change = policy.get_pc_target(state, last_state)

                # Partially collect next experience:
                frame_position = {'episode': local_episode, 'step': length}
                experience = dict(
                    position=frame_position,
                    state=last_state,
                    action=action,
                    reward=reward,
                    value=value_,
                    terminal=terminal,
                    features=last_features,
                    pixel_change=pixel_change,
                    last_action_reward=last_action_reward,
                )
                # Complete and push previous experience:
                last_experience['value_next'] = value_
                rollout.add(**last_experience)

                length += 1
                rewards += reward
                last_state = state
                last_features = features
                last_action = action
                last_reward = reward
                last_experience = experience

            if terminal:
                # Finished episode within last taken step:
                terminal_end = True
                #print("Episode finished. Sum of rewards: %d. Length: %d" % (rewards, length))

                # All environment-related summaries are computed here because
                # only the runner is allowed to interact with the environment.
                # Accumulate values for averaging:
                total_r += rewards
                total_steps_atari += length
                if not test:
                    episode_stat = env.get_stat()  # get episode statistic
                    last_i = info[0]  # pull most recent info
                    cpu_time += episode_stat['runtime'].total_seconds()
                    final_value += last_i['broker_value']
                    total_steps += episode_stat['length']

                # Episode statistic:
                if local_episode % episode_summary_freq == 0:
                    if not test:
                        # BTgym:

                        fetched_episode_stat = sess.run(
                            ep_summary['stat_op'],
                            feed_dict={
                                ep_summary['total_r_pl']:
                                total_r / episode_summary_freq,
                                ep_summary['cpu_time_pl']:
                                cpu_time / episode_summary_freq,
                                ep_summary['final_value_pl']:
                                final_value / episode_summary_freq,
                                ep_summary['steps_pl']:
                                total_steps / episode_summary_freq
                            })
                    else:
                        # Atari:
                        fetched_episode_stat = sess.run(
                            ep_summary['test_stat_op'],
                            feed_dict={
                                ep_summary['total_r_pl']:
                                total_r / episode_summary_freq,
                                ep_summary['steps_pl']:
                                total_steps_atari / episode_summary_freq
                            })
                    summary_writer.add_summary(fetched_episode_stat,
                                               sess.run(policy.global_episode))
                    summary_writer.flush()
                    total_r = 0
                    cpu_time = 0
                    final_value = 0
                    total_steps = 0
                    total_steps_atari = 0

                if task == 0 and local_episode % env_render_freq == 0:
                    if not test:
                        # Render environment (chief worker only, and not in atari test mode):
                        renderings = sess.run(
                            ep_summary['render_op'],
                            feed_dict={
                                ep_summary['render_human_pl']:
                                env.render('human')[None, :],
                                ep_summary['render_model_input_pl']:
                                env.render('model_input')[None, :],
                                ep_summary['render_episode_pl']:
                                env.render('episode')[None, :],
                            })
                    else:
                        # Atari:
                        renderings = sess.run(
                            ep_summary['test_render_op'],
                            feed_dict={
                                ep_summary['render_atari_pl']:
                                state[None, :] * 255
                            })

                    summary_writer.add_summary(renderings,
                                               sess.run(policy.global_episode))
                    summary_writer.flush()

                # New episode:
                last_state = env.reset()
                if not test:
                    last_state = last_state['model_input']

                last_features = policy.get_a3c_initial_features()
                length = 0
                rewards = 0
                last_action = np.zeros(env.action_space.n)
                last_action[0] = 1
                last_reward = 0.0

                # Increment global and local episode counts:
                sess.run(policy.inc_episode)
                local_episode += 1
                break

        # After rolling `num_local_steps` or less (if got `terminal`)
        # complete final experience of the rollout:
        if not terminal_end:
            #print('last_non_terminal_value_next_added')
            last_experience['value_next'] = np.asarray([
                policy.get_a3c_value(last_state, last_features,
                                     last_action_reward)
            ])

        else:
            #print('last_terminal_value_next_added')
            last_experience['value_next'] = np.asarray([0.0])

        rollout.add(**last_experience)

        # Once we have enough experience, yield it, and have the ThreadRunner place it on a queue:
        yield rollout
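
policy.get_pc_target(state, last_state) is not shown above. For orientation, a hedged sketch of a UNREAL-style pixel-change target it is assumed to resemble (mean absolute pixel difference pooled over non-overlapping cells; the actual policy method may differ):

import numpy as np

def pc_target_sketch(state, last_state, cell=4, crop=2):
    d = np.abs(np.asarray(state, dtype=np.float32)
               - np.asarray(last_state, dtype=np.float32))
    if d.ndim == 3:
        d = d.mean(axis=-1)               # average over colour channels
    d = d[crop:-crop, crop:-crop]         # drop a small border
    h = (d.shape[0] // cell) * cell
    w = (d.shape[1] // cell) * cell
    d = d[:h, :w].reshape(h // cell, cell, w // cell, cell)
    return d.mean(axis=(1, 3))            # one change value per spatial cell

# E.g. two 84x84x3 observations give a 20x20 pixel-change map:
pc = pc_target_sketch(np.random.rand(84, 84, 3), np.random.rand(84, 84, 3))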
Example #6
    def process(self, sess):
        """
        Grabs a on_policy_rollout that's been produced by the thread runner,
        samples off_policy rollout[s] from replay memory and updates the parameters.
        The update is then sent to the parameter server.
        """

        # Copy weights from shared to local new_policy:
        sess.run(self.sync)

        # Get and process rollout for on-policy train step:
        on_policy_rollout = self.pull_batch_from_queue()
        on_policy_batch = on_policy_rollout.process(
            gamma=self.model_gamma, gae_lambda=self.model_gae_lambda)

        # Feeder for on-policy AAC loss estimation graph:
        feed_dict = {
            pl: value
            for pl, value in zip(self.local_network.on_lstm_state_pl_flatten,
                                 flatten_nested(on_policy_batch['context']))
        }
        feed_dict.update({
            self.local_network.on_state_in:
            on_policy_batch['state'],
            self.local_network.on_a_r_in:
            on_policy_batch['last_action_reward'],
            self.on_pi_act_target:
            on_policy_batch['action'],
            self.on_pi_adv_target:
            on_policy_batch['advantage'],
            self.on_pi_r_target:
            on_policy_batch['r'],
            self.local_network.train_phase:
            True,
        })

        if self.use_off_policy_aac or self.use_pixel_control or self.use_value_replay:
            # Get sample from replay memory:
            if self.use_rebalanced_replay:
                off_policy_sample = self.memory.sample_priority(
                    self.replay_rollout_length,
                    skewness=self.rebalance_skewness,
                    exact_size=False)
            else:
                off_policy_sample = self.memory.sample_uniform(
                    self.replay_rollout_length)

            off_policy_rollout = Rollout()
            off_policy_rollout.add_memory_sample(off_policy_sample)
            off_policy_batch = off_policy_rollout.process(
                gamma=self.model_gamma, gae_lambda=self.model_gae_lambda)

            # Feeder for off-policy AAC loss estimation graph:
            off_policy_feeder = {
                pl: value
                for pl, value in zip(
                    self.local_network.off_lstm_state_pl_flatten,
                    flatten_nested(off_policy_batch['context']))
            }

            off_policy_feeder.update({
                self.local_network.off_state_in:
                off_policy_batch['state'],
                self.local_network.off_a_r_in:
                off_policy_batch['last_action_reward'],
                self.off_pi_act_target:
                off_policy_batch['action'],
                self.off_pi_adv_target:
                off_policy_batch['advantage'],
                self.off_pi_r_target:
                off_policy_batch['r'],
            })
            feed_dict.update(off_policy_feeder)

        # Update with reward prediction subgraph:
        if self.use_reward_prediction:
            # Rebalanced 50/50 sample for RP:
            rp_sample = self.memory.sample_priority(self.rp_sequence_size,
                                                    skewness=2,
                                                    exact_size=True)
            feed_dict.update(self.process_rp(rp_sample))

        # Pixel control ...
        if self.use_pixel_control:
            feed_dict.update(self.process_pc(off_policy_batch))

        # VR...
        if self.use_value_replay:
            feed_dict.update(self.process_vr(off_policy_batch))

        if self.use_memory:
            # Save on_policy_rollout to replay memory:
            self.memory.add_rollout(on_policy_rollout)

        # Every worker writes model summaries:
        should_compute_summary =\
            self.local_steps % self.model_summary_freq == 0

        if should_compute_summary:
            fetches = [self.train_op, self.model_summary_op, self.inc_step]
        else:
            fetches = [self.train_op, self.inc_step]

        fetched = sess.run(fetches, feed_dict=feed_dict)

        if should_compute_summary:
            self.summary_writer.add_summary(tf.Summary.FromString(fetched[-2]),
                                            fetched[-1])
            self.summary_writer.flush()

        self.local_steps += 1
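
flatten_nested() is used above to line nested LSTM context structures up with the flat list of state placeholders. A hedged sketch of the behaviour assumed of it (the library's own helper may differ):

def flatten_nested_sketch(struct):
    """Depth-first flatten of nested dicts/lists/tuples into a flat list of leaves."""
    if isinstance(struct, dict):
        children = struct.values()
    elif isinstance(struct, (list, tuple)):
        children = struct
    else:
        return [struct]
    flat = []
    for child in children:
        flat.extend(flatten_nested_sketch(child))
    return flat

# zip(placeholders, flatten_nested_sketch(batch['context'])) then pairs each
# LSTM state placeholder with the matching leaf array, as in the feed_dict above.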
Example #7
    def _sample_priority(self,
                         size=None,
                         exact_size=False,
                         skewness=2,
                         sample_attempts=100):
        """
        Implements rebalanced replay.
        Samples sequence of successive frames from distribution skewed by means of reward of last sample frame.

        Args:
            size:               sample size, must be <= self.max_sample_size;
            exact_size:         whether accept sample with size less than 'size'
                                or re-sample to get sample of exact size (used for reward prediction task);
            skewness:           int>=1, sampling probability denominator, such as probability of sampling sequence with
                                last frame having non-zero reward is: p[non_zero]=1/skewness;
            sample_attempts:    if exact_size=True, sets number of re-sampling attempts
                                to get sample of continuous experiences (no `Terminal` frames inside except last one);
                                if number is reached - sample returned 'as is'.
        Returns:
            instance of Rollout().
        """
        if size is None:
            size = self.priority_sample_size

        if size > self.max_sample_size:
            size = self.max_sample_size

        # Toss skewed coin:
        if np.random.randint(int(skewness)) == 0:
            from_zero = False
        else:
            from_zero = True

        if len(self._zero_reward_indices) == 0:
            # zero rewards container was empty
            from_zero = False
        elif len(self._non_zero_reward_indices) == 0:
            # non zero rewards container was empty
            from_zero = True

        # Try to sample a sequence of the given length from a single episode.
        # Make at most `sample_attempts` attempts; if unsuccessful
        # (e.g. episodes too short and/or sample size too big),
        # return an inconsistent sample and issue a warning.
        check_sequence = True
        for attempt in range(sample_attempts):
            if from_zero:
                index = np.random.randint(len(self._zero_reward_indices))
                end_frame_index = self._zero_reward_indices[index]

            else:
                index = np.random.randint(len(self._non_zero_reward_indices))
                end_frame_index = self._non_zero_reward_indices[index]

            start_frame_index = end_frame_index - size + 1
            raw_start_frame_index = start_frame_index - self._top_frame_index

            sampled_rollout = Rollout()
            is_full = True
            if attempt == sample_attempts - 1:
                check_sequence = False
                self.log.warning(
                    'Memory_{}: failed to sample {} successive frames, sampled as is.'
                    .format(self.task, size))

            for i in range(size - 1):
                frame = self._frames[raw_start_frame_index + i]
                sampled_rollout.add(frame)
                if check_sequence:
                    if frame['terminal']:
                        if exact_size:
                            is_full = False
                        #print('attempt:', attempt)
                        #print('frame.terminal:', frame['terminal'])
                        break
            # Last frame can be terminal anyway:
            frame = self._frames[raw_start_frame_index + size - 1]
            sampled_rollout.add(frame)

            if is_full:
                break

        return sampled_rollout
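
A quick empirical check of the skewed coin used above: under this scheme, a sequence ending in a non-zero-reward frame is drawn with probability roughly 1/skewness (sketch only; the fallbacks for empty index containers are ignored):

import numpy as np

def non_zero_fraction(skewness, n_trials=100_000, seed=0):
    rng = np.random.default_rng(seed)
    draws = rng.integers(0, int(skewness), size=n_trials)
    return float(np.mean(draws == 0))

print(non_zero_fraction(2))  # ~0.5
print(non_zero_fraction(4))  # ~0.25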