Example #1
    def _add_shortcuts(self, m, pairwise_distances):
        self.remove_shortcuts(m)  # first - remove all existing shortcuts

        shortcuts = self._shortcuts_distance(
            m,
            pairwise_distances,
            self.params.min_shortcut_dist,
            self.params.shortcut_window,
        )
        if len(shortcuts) <= 0:
            log.warning('Could not find any shortcuts')
            return

        random.shuffle(shortcuts)
        shortcut_risks = [s[0] for s in shortcuts]

        shortcuts_to_keep = int(self.params.shortcuts_to_keep_fraction *
                                m.num_landmarks())

        keep = min(shortcuts_to_keep, len(shortcuts))
        percentile = (keep / len(shortcuts)) * 100
        max_risk = np.percentile(shortcut_risks, percentile)
        max_risk = min(max_risk, self.params.shortcut_risk_threshold)

        log.debug('Keep shortcuts with risk <= %.3f...', max_risk)
        shortcuts = [s for s in shortcuts if s[0] <= max_risk][:keep]
        # sort according to ground truth distance for logging
        shortcuts.sort(key=lambda x: x[-1], reverse=True)

        log.debug('Kept %d shortcuts: %r...', len(shortcuts), shortcuts[:5])

        for shortcut in shortcuts:
            risk, i1, i2, d, coord_dist = shortcut
            m.add_edge(i1, i2, loop_closure=True)
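As a numeric illustration of the risk threshold above (all numbers are assumed purely for illustration), the following standalone sketch reproduces the percentile filtering step:

import numpy as np

# Assumed numbers, for illustration of the filtering logic only.
shortcut_risks = list(np.random.uniform(0.0, 1.0, size=50))
keep = 10
percentile = (keep / len(shortcut_risks)) * 100       # 20.0
max_risk = np.percentile(shortcut_risks, percentile)  # roughly the 10th-smallest risk
kept = [r for r in shortcut_risks if r <= max_risk][:keep]
assert len(kept) <= keep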
Example #2
    def _learner_load_model(self, policy_id, replacement_policy):
        log.debug('Asking learner %d to load model from %d', policy_id,
                  replacement_policy)

        load_task = (PbtTask.LOAD_MODEL, (policy_id, replacement_policy))
        learner_worker = self.learner_workers[policy_id]
        learner_worker.task_queue.put((TaskType.PBT, load_task))
Example #3
def make_voxel_env(env_name, cfg=None, env_config=None, **kwargs):
    scenario_name = env_name.split('voxel_env_')[-1].casefold()
    log.debug('Using scenario %s', scenario_name)

    if 'multitask' in scenario_name:
        if env_config is not None and 'worker_index' in env_config:
            task_idx = env_config['worker_index']
        else:
            log.warning(
                'Could not find information about task id. Use task_id=0. (It is okay if this message appears once)'
            )
            task_idx = 0

        env = make_env_multitask(
            scenario_name,
            task_idx,
            num_envs=cfg.voxel_num_envs_per_instance,
            num_agents_per_env=cfg.voxel_num_agents_per_env,
            num_simulation_threads=cfg.voxel_num_simulation_threads,
            use_vulkan=cfg.voxel_use_vulkan,
        )
    else:
        env = VoxelEnv(
            scenario_name=scenario_name,
            num_envs=cfg.voxel_num_envs_per_instance,
            num_agents_per_env=cfg.voxel_num_agents_per_env,
            num_simulation_threads=cfg.voxel_num_simulation_threads,
            use_vulkan=cfg.voxel_use_vulkan,
        )

    env = Wrapper(env, cfg.voxel_increase_team_spirit,
                  cfg.voxel_max_team_spirit_steps)
    return env
Example #4
    def print_stats(self, fps, sample_throughput, total_env_steps):
        fps_str = []
        for interval, fps_value in zip(self.avg_stats_intervals, fps):
            fps_str.append(
                f'{int(interval * self.report_interval)} sec: {fps_value:.1f}')
        fps_str = f'({", ".join(fps_str)})'

        samples_per_policy = ', '.join(
            [f'{p}: {s:.1f}' for p, s in sample_throughput.items()])

        lag_stats = self.policy_lag[0]
        lag = AttrDict()
        for key in ['min', 'avg', 'max']:
            lag[key] = lag_stats.get(f'version_diff_{key}', -1)
        policy_lag_str = f'min: {lag.min:.1f}, avg: {lag.avg:.1f}, max: {lag.max:.1f}'

        log.debug(
            'Fps is %s. Total num frames: %d. Throughput: %s. Samples: %d. Policy #0 lag: (%s)',
            fps_str,
            total_env_steps,
            samples_per_policy,
            sum(self.samples_collected),
            policy_lag_str,
        )

        if 'reward' in self.policy_avg_stats:
            policy_reward_stats = []
            for policy_id in range(self.cfg.num_policies):
                reward_stats = self.policy_avg_stats['reward'][policy_id]
                if len(reward_stats) > 0:
                    policy_reward_stats.append(
                        (policy_id, f'{np.mean(reward_stats):.3f}'))
            log.debug('Avg episode reward: %r', policy_reward_stats)
Example #5
    def _save_reward_shaping(self, policy_id):
        policy_reward_shaping_filename = policy_reward_shaping_file(self.cfg, policy_id)
        with open(policy_reward_shaping_filename, 'w') as json_file:
            log.debug('Saving policy-specific reward shaping %d to file %s',
                      policy_id, policy_reward_shaping_filename)
            json.dump(self.policy_reward_shaping[policy_id], json_file)
Example #6
def load_from_checkpoint(cfg):
    filename = cfg_file(cfg)
    if not os.path.isfile(filename):
        raise Exception(
            f'Could not load saved parameters for experiment {cfg.experiment}')

    with open(filename, 'r') as json_file:
        json_params = json.load(json_file)
        log.warning('Loading existing experiment configuration from %s',
                    filename)
        loaded_cfg = AttrDict(json_params)

    # override the parameters in config file with values passed from command line
    for key, value in cfg.cli_args.items():
        if key in loaded_cfg and loaded_cfg[key] != value:
            log.debug(
                'Overriding arg %r with value %r passed from command line',
                key, value)
            loaded_cfg[key] = value

    # incorporate extra CLI parameters that were not present in JSON file
    for key, value in vars(cfg).items():
        if key not in loaded_cfg:
            log.debug(
                'Adding new argument %r=%r that is not in the saved config file!',
                key, value)
            loaded_cfg[key] = value

    return loaded_cfg
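The AttrDict used here behaves like a plain dict whose keys are also available as attributes; a minimal sketch of such a helper (the actual class in the codebase may offer more):

class AttrDict(dict):
    # Minimal sketch: dict with attribute-style access (actual implementation may differ).
    __setattr__ = dict.__setitem__
    __getattr__ = dict.__getitem__

# Usage: AttrDict({'experiment': 'test'}).experiment == 'test'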
Example #7
    def _save(self):
        checkpoint = self._get_checkpoint_dict()
        assert checkpoint is not None

        checkpoint_dir = self.checkpoint_dir(self.cfg, self.policy_id)
        tmp_filepath = join(checkpoint_dir, '.temp_checkpoint')
        checkpoint_name = f'checkpoint_{self.train_step:09d}_{self.env_steps}.pth'
        filepath = join(checkpoint_dir, checkpoint_name)
        log.info('Saving %s...', tmp_filepath)
        torch.save(checkpoint, tmp_filepath)
        log.info('Renaming %s to %s', tmp_filepath, filepath)
        os.rename(tmp_filepath, filepath)

        while len(self.get_checkpoints(checkpoint_dir)) > self.cfg.keep_checkpoints:
            oldest_checkpoint = self.get_checkpoints(checkpoint_dir)[0]
            if os.path.isfile(oldest_checkpoint):
                log.debug('Removing %s', oldest_checkpoint)
                os.remove(oldest_checkpoint)

        if self.cfg.save_milestones_sec > 0:
            # milestones enabled
            if time.time() - self.last_milestone_time >= self.cfg.save_milestones_sec:
                milestones_dir = ensure_dir_exists(join(checkpoint_dir, 'milestones'))
                milestone_path = join(milestones_dir, f'{checkpoint_name}.milestone')
                log.debug('Saving a milestone %s', milestone_path)
                shutil.copy(filepath, milestone_path)
                self.last_milestone_time = time.time()
Example #8
    def train(self, latest_batch_of_experience, env_steps, agent):
        # latest batch of experience is not used here

        if self.params.distance_network_checkpoint is None:
            # don't train distance net if it's already provided

            self.distance_buffer.extract_data(
                self.trajectory_buffer.complete_trajectories)

            if env_steps - self._last_trained > self.params.distance_train_interval:
                if self.distance_buffer.has_enough_data():
                    self.distance.train(self.distance_buffer.buffer, env_steps,
                                        agent)
                    self._last_trained = env_steps

                    # discard old experience
                    self.distance_buffer.reset()

                    # invalidate observation features because distance network has changed
                    self.distance.obs_encoder.reset()

        if env_steps > self.params.distance_bootstrap and not self.is_initialized():
            log.debug('Curiosity is initialized @ %d steps!', env_steps)
            self.initialized = True

        self._expand_explored_region(env_steps, agent)
Example #9
    def _game_init(self, with_locking=True, max_parallel=10):
        lock_file = lock = None
        if with_locking:
            lock_file = doom_lock_file(max_parallel)
            lock = FileLock(lock_file)

        init_attempt = 0
        while True:
            init_attempt += 1
            try:
                if with_locking:
                    with lock.acquire(timeout=20):
                        self.game.init()
                else:
                    self.game.init()

                break
            except Timeout:
                if with_locking:
                    log.debug(
                        'Another process currently holds the lock %s, attempt: %d',
                        lock_file,
                        init_attempt,
                    )
            except Exception as exc:
                log.warning('VizDoom game.init() threw an exception %r. Terminate process...', exc)
                from envs.env_utils import EnvCriticalError
                raise EnvCriticalError()
Example #10
    def test_buffer_performance(self):
        small_buffer = Buffer()
        small_buffer.add_many(obs=np.zeros([1000, 84, 84, 3], dtype=np.uint8))

        buffer = Buffer()

        t = Timing()

        with t.timeit('add'):
            for i in range(100):
                buffer.add_buff(small_buffer)

        huge_buffer = Buffer()
        with t.timeit('add_huge'):
            huge_buffer.add_buff(buffer)
            huge_buffer.add_buff(buffer)

        with t.timeit('single_add_small'):
            huge_buffer.add_buff(small_buffer)

        with t.timeit('clear_and_add'):
            huge_buffer.clear()
            huge_buffer.add_buff(buffer)
            huge_buffer.add_buff(buffer)

        with t.timeit('shuffle_and_add'):
            huge_buffer.clear()
            huge_buffer.add_buff(buffer)
            huge_buffer.add_buff(small_buffer)
            with t.timeit('shuffle'):
                huge_buffer.shuffle_data()

        log.debug('Timing: %s', t)
Example #11
    def train_feed_dict(self, env, data, params, use_gpu):
        num_batches = len(data.obs) // params.batch_size
        g = self.setup_graph(env, params, use_dataset=False)

        config = tf.ConfigProto(device_count={'GPU': 100 if use_gpu else 0})
        with tf.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())

            for _ in range(params.ppo_epochs):
                epoch_starts = time.time()
                for i in range(num_batches):
                    start, end = i * params.batch_size, (i + 1) * params.batch_size
                    kl, _ = sess.run(
                        [g.objectives.sample_kl, g.train_op],
                        feed_dict={
                            g.ph_observations: data.obs[start:end],
                            g.ph_actions: data.act[start:end],
                            g.ph_old_actions_probs: data.old_prob[start:end],
                            g.ph_advantages: data.adv[start:end],
                            g.ph_returns: data.ret[start:end],
                        })
                    del kl
                time_per_epoch = time.time() - epoch_starts

                log.debug(
                    'Feed dict gpu %r: took %.3f seconds per epoch (%d batches, %d samples)',
                    use_gpu,
                    time_per_epoch,
                    num_batches,
                    len(data.obs),
                )

        tf.reset_default_graph()
        gc.collect()
Example #12
def main():
    args, params = parse_args_tmax(AgentTMAX.Params)
    env_id = args.env

    global key_to_action
    if 'dmlab' in env_id:
        from utils.envs.dmlab import play_dmlab
        key_to_action = play_dmlab.key_to_action
    elif 'atari' in env_id:
        key_to_action = atari_utils.key_to_action
    elif 'doom' in env_id:
        key_to_action = doom_utils.key_to_action
    else:
        raise Exception('Unknown env')

    try:
        show_map = args.show_automap
    except AttributeError:
        show_map = False

    # start keypress listener (to pause/resume execution or exit)
    def start_listener():
        with Listener(on_press=on_press, on_release=on_release) as listener:
            listener.join()

    listener_thread = Thread(target=start_listener)
    listener_thread.start()

    status = enjoy(params, args.env, show_automap=show_map)

    log.debug('Press ESC to exit...')
    listener_thread.join()

    return status
Example #13
def set_gpus_for_process(process_idx,
                         num_gpus_per_process,
                         process_type,
                         gpu_mask=None):
    available_gpus = get_available_gpus()
    if gpu_mask is not None:
        assert len(available_gpus) >= len(gpu_mask)
        available_gpus = [available_gpus[g] for g in gpu_mask]
    num_gpus = len(available_gpus)
    gpus_to_use = []

    if num_gpus == 0:
        os.environ[CUDA_ENVVAR] = ''
        log.debug('Not using GPUs for %s process %d', process_type,
                  process_idx)
    else:
        first_gpu_idx = process_idx * num_gpus_per_process
        for i in range(num_gpus_per_process):
            index_mod_num_gpus = (first_gpu_idx + i) % num_gpus
            gpus_to_use.append(available_gpus[index_mod_num_gpus])

        os.environ[CUDA_ENVVAR] = ','.join([str(g) for g in gpus_to_use])
        log.info(
            'Set environment var %s to %r for %s process %d',
            CUDA_ENVVAR,
            os.environ[CUDA_ENVVAR],
            process_type,
            process_idx,
        )
        log.debug('Visible devices: %r', torch.cuda.device_count())

    return gpus_to_use
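To illustrate the round-robin GPU assignment above, the short sketch below walks four worker processes over two visible GPUs with one GPU per process (all values assumed for illustration):

available_gpus = [0, 1]        # assumed: two visible devices
num_gpus_per_process = 1
for process_idx in range(4):
    first_gpu_idx = process_idx * num_gpus_per_process
    gpus_to_use = [available_gpus[(first_gpu_idx + i) % len(available_gpus)]
                   for i in range(num_gpus_per_process)]
    print(process_idx, gpus_to_use)  # 0 -> [0], 1 -> [1], 2 -> [0], 3 -> [1]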
Example #14
    def add_trajectory_to_dense_map(self, existing_map, traj):
        t = Timing()

        m = existing_map
        m.new_episode()  # just in case

        # index map from trajectory frame to graph node idx
        node_idx = [-1] * len(traj)
        # first observation is always the same (we start from the same initial state)
        node_idx[0] = 0

        with t.timeit('create_initial_map'):
            self._add_simple_path_to_map(m, traj, node_idx)

        # precalculate feature vectors for the distance network
        with t.timeit('cache_feature_vectors'):
            all_observations = [
                m.get_observation(node) for node in m.graph.nodes
            ]
            obs_embeddings = self._calc_embeddings(all_observations)

        # with t.add_time('pairwise_distances'):
        #     pairwise_distances = self._calc_pairwise_distances(obs_embeddings)

        # TODO: so far no shortcuts
        # with t.timeit('loop_closures'):
        #     self._add_shortcuts(m, pairwise_distances)

        log.debug('Add trajectory to map, timing: %s', t)
        return m
Example #15
def main():
    experiments_dir = '/home/alex/all/projects/sample-factory/train_dir'

    all_experiment_dirs_list = [join(experiments_dir, v['dir']) for k, v in EXPERIMENTS.items()]

    for experiment_dir in all_experiment_dirs_list:
        log.debug('Experiment dir: %s', experiment_dir)

    log.debug('Total: %d', len(all_experiment_dirs_list))

    for env, details in EXPERIMENTS.items():
        env_dir = details['dir']
        env_dir = join(experiments_dir, env_dir)
        event_files = Path(env_dir).rglob('*.tfevents.*')
        event_files = list(event_files)
        log.info('Event files: %r', event_files)

        env_dirs = set()
        for event_file in event_files:
            env_dirs.add(os.path.dirname(event_file))

        EXPERIMENTS[env]['dirs'] = sorted(list(env_dirs))
        log.info('Env dirs for env %s is %r', env, env_dirs)

    EXPERIMENT_GROUPS = (('dmlab30',),)

    for group_i, exp_group in enumerate(EXPERIMENT_GROUPS):
        fig, ax = plt.subplots(1, 1)
        ax = [ax]

        count = 0
        for env in exp_group:
            experiments = EXPERIMENTS[env]['dirs']
            aggregate(env, experiments, count, ax[count])
            count += 1

        # handles, labels = ax[-1].get_legend_handles_labels()
        # lgd = fig.legend(handles, labels, bbox_to_anchor=(0.1, 0.88, 0.8, 0.2), loc='lower left', ncol=4, mode="expand", prop={'size': 6})
        # lgd.set_in_layout(True)

        # zhehui
        # plt.show()
        # plot_name = f'{env}_{key.replace("/", " ")}'
        # plt.tight_layout()
        # plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=1, wspace=0)
        # plt.subplots_adjust(wspace=0.12, hspace=0.15)

        plt.tight_layout(rect=(0, 0, 1.0, 0.9))

        plt.margins(0, 0)
        plot_name = 'dmlab30'
        plt.savefig(
            os.path.join(os.getcwd(), f'../final_plots/reward_{plot_name}.pdf'),
            format='pdf',
            bbox_inches='tight',
            pad_inches=0,
        )
        # plt.savefig(os.path.join(os.getcwd(), f'../final_plots/reward_{plot_name}.pdf'), format='pdf', bbox_extra_artists=(lgd,))

    return 0
Example #16
    def _broadcast_model_weights(self):
        state_dict = self.dqn.main.state_dict()
        policy_version = self.train_step
        log.debug('Broadcast model weights for model version %d', policy_version)
        model_state = (policy_version, state_dict)
        for q in self.policy_worker_queues:
            q.put((TaskType.INIT_MODEL, model_state))
Example #17
    def all_accumulators_have_this_key(key):
        for scalar_accumulator in scalar_accumulators:
            if key not in scalar_accumulator.Keys():
                log.debug('Not all of the accumulators have key %s', key)
                return False

        return True
Example #18
    def predict(self, imagined_action_lists):
        start = time.time()
        assert len(imagined_action_lists) == self.num_envs
        imagined_action_lists = np.split(np.array(imagined_action_lists),
                                         self.num_workers)
        for worker, imagined_action_list in zip(self.workers,
                                                imagined_action_lists):
            worker.task_queue.put(
                (imagined_action_list, MsgType.STEP_IMAGINED))

        observations = []
        rewards = []
        dones = []
        for worker in self.workers:
            worker.task_queue.join()
            results_per_worker = safe_get(
                worker.result_queue,
                timeout=1.0,
                msg='Took a surprisingly long time to predict the future, retrying...',
            )

            assert len(results_per_worker) == len(imagined_action_lists[0])
            for result in results_per_worker:
                o, r, d, _ = zip(*result)
                observations.append(o)
                rewards.append(r)
                dones.append(d)
            worker.result_queue.task_done()

        if self._verbose:
            log.debug('Prediction step took %.4f s', time.time() - start)
        return observations, rewards, dones
Example #19
def main():
    args, params = parse_args_tmax(AgentTMAX.Params)
    env_id = args.env

    global key_to_action
    if 'dmlab' in env_id:
        from utils.envs.dmlab import play_dmlab
        key_to_action = play_dmlab.key_to_action
    elif 'atari' in env_id:
        key_to_action = atari_utils.key_to_action
    elif 'doom' in env_id:
        key_to_action = doom_utils.key_to_action
    else:
        raise Exception('Unknown env')

    # start keypress listener (to pause/resume execution or exit)
    def start_listener():
        with Listener(on_press=on_press, on_release=on_release) as listener:
            listener.join()

    listener_thread = Thread(target=start_listener)
    listener_thread.start()

    status = build_graph(params, args.env)

    if not terminate:
        log.debug('Press ESC to exit...')
    listener_thread.join()

    return status
Example #20
    def forward_pass(device_type):
        env_name = 'atari_breakout'
        cfg = default_cfg(algo='appooc', env=env_name)
        cfg.actor_critic_share_weights = True
        cfg.hidden_size = 128
        cfg.use_rnn = True
        cfg.env_framestack = 4

        env = create_env(env_name, cfg=cfg)

        torch.set_num_threads(1)
        torch.backends.cudnn.benchmark = True

        actor_critic = create_actor_critic(cfg, env.observation_space, env.action_space)
        device = torch.device(device_type)
        actor_critic.to(device)

        timing = Timing()
        with timing.timeit('all'):
            batch = 128
            with timing.add_time('input'):
                # better avoid hardcoding here...
                observations = dict(obs=torch.rand([batch, 4, 84, 84]).to(device))
                rnn_states = torch.rand([batch, get_hidden_size(cfg)]).to(device)

            n = 200
            for i in range(n):
                with timing.add_time('forward'):
                    output = actor_critic(observations, rnn_states)

                log.debug('Progress %d/%d', i, n)

        log.debug('Timing: %s', timing)
Example #21
    def _save_cfg(self, policy_id):
        policy_cfg_filename = policy_cfg_file(self.cfg, policy_id)
        with open(policy_cfg_filename, 'w') as json_file:
            log.debug('Saving policy-specific configuration %d to file %s',
                      policy_id, policy_cfg_filename)
            json.dump(self.policy_cfg[policy_id], json_file)
Example #22
    def finish_initialization(self):
        """Wait until policy workers are fully initialized."""
        for policy_id, workers in self.policy_workers.items():
            for w in workers:
                log.debug('Waiting for policy worker %d-%d to finish initialization...', policy_id, w.worker_idx)
                w.init()
                log.debug('Policy worker %d-%d initialized!', policy_id, w.worker_idx)
Example #23
    def __init__(self, cfg, obs_space, timing):
        super().__init__(cfg, timing)

        self.basic_encoder = create_standard_encoder(cfg, obs_space, timing)
        self.encoder_out_size = self.basic_encoder.encoder_out_size

        # same as IMPALA paper
        self.embedding_size = 20
        self.instructions_lstm_units = 64
        self.instructions_lstm_layers = 1

        padding_idx = 0
        self.word_embedding = nn.Embedding(
            num_embeddings=DMLAB_VOCABULARY_SIZE,
            embedding_dim=self.embedding_size,
            padding_idx=padding_idx)

        self.instructions_lstm = nn.LSTM(
            input_size=self.embedding_size,
            hidden_size=self.instructions_lstm_units,
            num_layers=self.instructions_lstm_layers,
            batch_first=True,
        )

        # learnable initial state?
        # initial_hidden_values = torch.normal(0, 1, size=(self.instructions_lstm_units, ))
        # self.lstm_h0 = nn.Parameter(initial_hidden_values, requires_grad=True)
        # self.lstm_c0 = nn.Parameter(initial_hidden_values, requires_grad=True)

        self.encoder_out_size += self.instructions_lstm_units
        log.debug('Policy head output size: %r', self.encoder_out_size)

        self.cpu_device = torch.device('cpu')
Example #24
    def predict(self, imagined_action_lists):
        start = time.time()

        assert len(imagined_action_lists) == self.num_envs
        imagined_action_lists = np.split(np.array(imagined_action_lists),
                                         self.num_workers)
        for worker, imagined_action_list in zip(self.workers,
                                                imagined_action_lists):
            worker.step_queue.put((imagined_action_list, StepType.IMAGINED))

        observations = []
        rewards = []
        dones = []
        for worker in self.workers:
            worker.step_queue.join()
            results_per_worker = worker.result_queue.get()
            assert len(results_per_worker) == len(imagined_action_lists[0])
            for result in results_per_worker:
                o, r, d, _ = zip(*result)
                observations.append(o)
                rewards.append(r)
                dones.append(d)

        if self._verbose:
            log.debug('Prediction step took %.4f s', time.time() - start)
        return observations, rewards, dones
Example #25
    def _perturb_param(self, param, param_name, default_param):
        # toss a coin whether we perturb the parameter at all
        if random.random() > self.cfg.pbt_mutation_rate:
            return param

        if param != default_param and random.random() < 0.05:
            # small chance to replace parameter with a default value
            log.debug('%s changed to default value %r', param_name,
                      default_param)
            return default_param

        if param_name in SPECIAL_PERTURBATION:
            new_value = SPECIAL_PERTURBATION[param_name](param, self.cfg)
        elif type(param) is bool:
            new_value = not param
        elif isinstance(param, numbers.Number):
            perturb_amount = random.uniform(1.01, 1.5)
            new_value = perturb_float(float(param),
                                      perturb_amount=perturb_amount)
        else:
            raise RuntimeError('Unsupported parameter type')

        log.debug('Param %s changed from %.6f to %.6f', param_name, param,
                  new_value)
        return new_value
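For context, a multiplicative helper such as perturb_float could be sketched as below, scaling the value up or down by the sampled factor (an assumption; the real helper may behave differently):

import random

def perturb_float(x, perturb_amount=1.2):
    # Sketch: randomly scale the value up or down by the given factor.
    return x * perturb_amount if random.random() < 0.5 else x / perturb_amount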
Example #26
def find_available_port(start_port, increment=1000):
    port = start_port
    while port < 65535 and not is_udp_port_available(port):
        port += increment

    log.debug('Port %r is available', port)
    return port
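A possible is_udp_port_available check is sketched below; it simply tries to bind a throwaway UDP socket to the port (an assumption about the helper, which may bind differently in the actual codebase):

import socket

def is_udp_port_available(port):
    # Sketch: binding fails with OSError if the port is already taken.
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
            sock.bind(('127.0.0.1', port))
        return True
    except OSError:
        return False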
Example #27
    def _learner_update_cfg(self, policy_id):
        learner_worker = self.learner_workers[policy_id]

        log.debug('Sending learning configuration to learner %d...', policy_id)
        cfg_task = (PbtTask.UPDATE_CFG, (policy_id, self.policy_cfg[policy_id]))
        learner_worker.task_queue.put((TaskType.PBT, cfg_task))
Example #28
    def step(self, actions):
        if self.skip_frames > 1 or self.num_agents == 1:
            # not used in multi-agent mode due to VizDoom limitations
            # this means that we have only one agent (+ maybe some bots, which is why we're in multiplayer mode)
            return super().step(actions)

        self._ensure_initialized()

        actions_binary = self._convert_actions(actions)

        self.game.set_action(actions_binary)
        self.game.advance_action(1, self.update_state)
        self.timestep += 1

        if not self.update_state:
            return None, None, None, None

        state = self.game.get_state()
        reward = self.game.get_last_reward()
        done = self.game.is_episode_finished()

        if self.record_to is not None:
            # send 'stop recording' command 1 tick before the end of the episode
            # otherwise it does not get saved to disk
            if self.game.get_episode_time() + 1 == self.game.get_episode_timeout():
                log.debug('Calling stop recording command!')
                self.game.send_game_command('stop')

        observation, done, info = self._process_game_step(state, done, {})
        return observation, reward, done, info
Example #29
    def __init__(self, cfg, obs_space, timing):
        super().__init__(cfg, timing)

        obs_shape = get_obs_shape(obs_space)
        input_ch = obs_shape.obs[0]
        log.debug('Num input channels: %d', input_ch)

        if cfg.encoder_subtype == 'convnet_simple':
            conv_filters = [[input_ch, 32, 8, 4], [32, 64, 4, 2],
                            [64, 128, 3, 2]]
        elif cfg.encoder_subtype == 'convnet_impala':
            conv_filters = [[input_ch, 16, 8, 4], [16, 32, 4, 2]]
        elif cfg.encoder_subtype == 'minigrid_convnet_tiny':
            conv_filters = [[3, 16, 3, 1], [16, 32, 2, 1], [32, 64, 2, 1]]
        else:
            raise NotImplementedError(f'Unknown encoder {cfg.encoder_subtype}')

        activation = nonlinearity(self.cfg)
        fc_layer_size = fc_after_encoder_size(self.cfg)
        encoder_extra_fc_layers = self.cfg.encoder_extra_fc_layers

        enc = self.ConvEncoderImpl(activation, conv_filters, fc_layer_size,
                                   encoder_extra_fc_layers, obs_shape)
        self.enc = torch.jit.script(enc)

        self.encoder_out_size = calc_num_elements(self.enc, obs_shape.obs)
        log.debug('Encoder output size: %r', self.encoder_out_size)
Example #30
def make_dmlab_env_impl(spec, cfg, env_config, **kwargs):
    skip_frames = cfg.env_frameskip

    gpu_idx = 0
    if len(cfg.dmlab_gpus) > 0:
        if kwargs.get('env_config') is not None:
            vector_index = kwargs['env_config']['vector_index']
            gpu_idx = cfg.dmlab_gpus[vector_index % len(cfg.dmlab_gpus)]
            log.debug('Using GPU %d for DMLab rendering!', gpu_idx)

    task_id = get_task_id(env_config, spec, cfg)
    level = task_id_to_level(task_id, spec)
    log.debug('%r level %s task id %d', env_config, level, task_id)

    env = DmlabGymEnv(
        task_id, level, skip_frames, cfg.res_w, cfg.res_h, cfg.dmlab_throughput_benchmark, cfg.dmlab_renderer,
        get_dataset_path(cfg), cfg.dmlab_with_instructions, cfg.dmlab_extended_action_set,
        cfg.dmlab_use_level_cache, cfg.dmlab_level_cache_path,
        gpu_idx, spec.extra_cfg,
    )

    if env_config and 'env_id' in env_config:
        env.seed(env_config['env_id'])

    if 'record_to' in cfg and cfg.record_to is not None:
        env = RecordingWrapper(env, cfg.record_to, 0)

    if cfg.pixel_format == 'CHW':
        env = PixelFormatChwWrapper(env)

    env = DmlabRewardShapingWrapper(env)
    return env