Example #1
File: dqn_actor.py Project: xlnwel/d2rl
        def _learning(self, actor):
            while not self.dataset.good_to_learn():
                time.sleep(1)
            pwc('Learner starts learning...', color='blue')

            to_log = Every(self.LOG_PERIOD, self.LOG_PERIOD)
            train_step = 0
            start_time = time.time()
            start_train_step = train_step
            start_env_step = self._env_step
            while True:
                self.learn_log(train_step)
                train_step += self.N_UPDATES
                if train_step % self.SYNC_PERIOD == 0:
                    self.distribute_weights(actor)
                if to_log(train_step):
                    duration = time.time() - start_time
                    self.store(
                        fps=(self._env_step - start_env_step) / duration,
                        tps=(train_step - start_train_step) / duration)
                    start_env_step = self._env_step
                    self.log(self._env_step)
                    self.save(self._env_step, print_terminal_info=False)
                    start_train_step = train_step
                    start_time = time.time()
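Every snippet in this listing gates periodic work (logging, checkpointing, weight syncing, evaluation) with the Every helper, constructed as Every(period) or Every(period, start) and then called with a step counter. As a reading aid, here is a minimal sketch of such a scheduler inferred purely from those call sites; the actual class in xlnwel/d2rl may be implemented differently.

class Every:
    """Minimal sketch of a step-based trigger, inferred from usage in these
    examples; not the actual xlnwel/d2rl implementation."""

    def __init__(self, period, start=0):
        self._period = period  # fire roughly once every `period` steps
        self._next = start     # first step at which to fire

    def __call__(self, step):
        if self._period is None:
            return False
        if step >= self._next:
            # advance past the current step so that large step increments
            # still yield a single trigger per call
            while self._next <= step:
                self._next += self._period
            return True
        return False

Under this reading, Every(LOG_PERIOD, LOG_PERIOD) suppresses the trigger at step 0, while Every(EVAL_PERIOD) fires on its first call.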
Example #2
File: train.py Project: xlnwel/d2rl
def train(agent, env, eval_env, replay):
    collect_fn = pkg.import_module('agent', algo=agent.name).collect
    collect = functools.partial(collect_fn, replay)

    _, step = replay.count_episodes()
    step = max(agent.env_step, step)

    runner = Runner(env, agent, step=step)

    def random_actor(*args, **kwargs):
        prev_action = random_actor.prev_action
        random_actor.prev_action = action = env.random_action()
        return action, {'prev_action': prev_action}
    random_actor.prev_action = np.zeros_like(env.random_action()) \
        if isinstance(env.random_action(), np.ndarray) else 0
    while not replay.good_to_learn():
        step = runner.run(action_selector=random_actor, step_fn=collect)

    to_log = Every(agent.LOG_PERIOD)
    to_eval = Every(agent.EVAL_PERIOD)
    print('Training starts...')
    while step < int(agent.MAX_STEPS):
        start_step = step
        start_t = time.time()
        agent.learn_log(step)
        step = runner.run(step_fn=collect, nsteps=agent.TRAIN_PERIOD)
        duration = time.time() - start_t
        agent.store(fps=(step - start_step) / duration,
                    tps=agent.N_UPDATES / duration)

        if to_eval(step):
            with TempStore(agent.get_states, agent.reset_states):
                score, epslen, video = evaluate(eval_env,
                                                agent,
                                                record=agent.RECORD,
                                                size=(64, 64))
                if agent.RECORD:
                    video_summary(f'{agent.name}/sim', video, step=step)
                agent.store(eval_score=score, eval_epslen=epslen)

        if to_log(step):
            agent.log(step)
            agent.save()
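Example #2 (and several later examples) wraps evaluation in TempStore(agent.get_states, agent.reset_states). Judging only from these call sites, it appears to preserve the agent's recurrent state across the evaluation rollout. The sketch below is a guess at that behavior under assumed semantics (snapshot on entry, restore on exit); the real TempStore, and the exact signature of reset_states, may differ.

import contextlib

@contextlib.contextmanager
def temp_store(get_states, set_states):
    # Hypothetical stand-in for TempStore: snapshot some state on entry
    # and put it back on exit so the wrapped block cannot disturb it.
    # The actual TempStore in xlnwel/d2rl may behave differently
    # (e.g. reset_states may take no arguments).
    saved = get_states()
    try:
        yield saved
    finally:
        set_states(saved)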
Example #3
    def _add_attributes(self, env, dataset):
        """ Adds attributes to Agent """
        self._sample_timer = Timer('sample')
        self._learn_timer = Timer('train')

        self._return_stats = getattr(self, '_return_stats', False)

        self.RECORD = getattr(self, 'RECORD', False)
        self.N_EVAL_EPISODES = getattr(self, 'N_EVAL_EPISODES', 1)

        # interval between calls to self._summary
        self._to_summary = Every(self.LOG_PERIOD, self.LOG_PERIOD)
Example #4
        def __init__(self, actor_id, model_fn, config, model_config,
                     env_config):
            config_actor('Actor', config)

            self._id = actor_id

            self._n_envvecs = env_config['n_envvecs']
            self._n_envs = env_config['n_envs']
            env = create_env(env_config)

            models = model_fn(model_config, env)

            super().__init__(name=f'Actor_{actor_id}',
                             config=config,
                             models=models,
                             dataset=None,
                             env=env)

            # number of workers per actor
            self._wpa = self._n_workers // self._n_actors

            self._action_batch = int(self._n_workers * self._n_envvecs *
                                     self._action_frac)
            if 'act_eps' in config:
                act_eps = compute_act_eps(config['act_eps_type'],
                                          config['act_eps'], None,
                                          config['n_workers'],
                                          self._n_envvecs * self._n_envs)
                self._act_eps_mapping = act_eps.reshape(
                    config['n_workers'], self._n_envvecs, self._n_envs)
                print(self.name, self._act_eps_mapping)
            else:
                self._act_eps_mapping = None

            # agent's state
            if 'rnn' in self.model:
                self._state_mapping = collections.defaultdict(
                    lambda: self.model.get_initial_state(batch_size=env.n_envs,
                                                         dtype=self._dtype))
                self._prev_action_mapping = collections.defaultdict(
                    lambda: tf.zeros(
                        (env.n_envs, *self._action_shape), self._dtype))

            if not hasattr(self, '_pull_names'):
                self._pull_names = [
                    k for k in self.model.keys() if 'target' not in k
                ]

            self._to_sync = Every(self.SYNC_PERIOD) \
                if getattr(self, 'SYNC_PERIOD', None) else None
Example #5
File: actor.py Project: xlnwel/d2rl
        def run(self, learner, monitor):
            step = 0
            if getattr(self, 'RECORD_PERIOD', False):
                # how often to record videos
                to_record = Every(self.RECORD_PERIOD)
            else:
                to_record = lambda x: False

            while True:
                step += 1
                weights = self._pull_weights(learner)
                self.model.set_weights(weights)
                self._run(record=to_record(step))
                self._send_episodic_info(monitor)
Example #6
    def _setup_target_net_sync(self):
        self._to_sync = Every(self._target_update_period) \
            if hasattr(self, '_target_update_period') else None
Example #7
    def _add_attributes(self, env, dataset):
        super()._add_attributes(env, dataset)
        self._to_log_images = Every(self.LOG_PERIOD)
        self._setup_memory_state_record()
Example #8
def train(agent, env, eval_env, buffer):
    collect_fn = pkg.import_module('agent', algo=agent.name).collect
    collect = functools.partial(collect_fn, buffer)

    step = agent.env_step
    runner = Runner(env, agent, step=step, nsteps=agent.N_STEPS)
    exp_buffer = get_expert_data(f'{buffer.DATA_PATH}-{env.name}')

    if step == 0 and agent.is_obs_normalized:
        print('Start to initialize running stats...')
        for _ in range(10):
            runner.run(action_selector=env.random_action, step_fn=collect)
            agent.update_obs_rms(np.concatenate(buffer['obs']))
            agent.update_reward_rms(buffer['reward'], buffer['discount'])
            buffer.reset()
        buffer.clear()
        agent.save(print_terminal_info=True)

    runner.step = step
    # print("Initial running stats:", *[f'{k:.4g}' for k in agent.get_running_stats() if k])
    to_log = Every(agent.LOG_PERIOD, agent.LOG_PERIOD)
    to_eval = Every(agent.EVAL_PERIOD)
    rt = Timer('run')
    tt = Timer('train')
    et = Timer('eval')
    lt = Timer('log')
    print('Training starts...')
    while step < agent.MAX_STEPS:
        start_env_step = agent.env_step
        agent.before_run(env)
        with rt:
            step = runner.run(step_fn=collect)
        agent.store(fps=(step - start_env_step) / rt.last())
        buffer.reshape_to_sample()
        agent.disc_learn_log(exp_buffer)
        buffer.compute_reward_with_func(agent.compute_reward)
        buffer.reshape_to_store()

        # NOTE: normalizing rewards here may introduce some inconsistency
        # if normalized rewards are fed as an input to the network.
        # One can reconcile this by moving normalization to collect
        # or by feeding the network unnormalized rewards.
        # The latter is adopted in our implementation.
        # However, the following line currently doesn't store
        # a copy of the unnormalized rewards.
        agent.update_reward_rms(buffer['reward'], buffer['discount'])
        buffer.update('reward',
                      agent.normalize_reward(buffer['reward']),
                      field='all')
        agent.record_last_env_output(runner.env_output)
        value = agent.compute_value()
        buffer.finish(value)

        start_train_step = agent.train_step
        with tt:
            agent.learn_log(step)
        agent.store(tps=(agent.train_step - start_train_step) / tt.last())
        buffer.reset()

        if to_eval(agent.train_step) or step > agent.MAX_STEPS:
            with TempStore(agent.get_states, agent.reset_states):
                with et:
                    eval_score, eval_epslen, video = evaluate(
                        eval_env,
                        agent,
                        n=agent.N_EVAL_EPISODES,
                        record=agent.RECORD,
                        size=(64, 64))
                if agent.RECORD:
                    video_summary(f'{agent.name}/sim', video, step=step)
                agent.store(eval_score=eval_score, eval_epslen=eval_epslen)

        if to_log(agent.train_step) and agent.contains_stats('score'):
            with lt:
                agent.store(
                    **{
                        'train_step': agent.train_step,
                        'time/run': rt.total(),
                        'time/train': tt.total(),
                        'time/eval': et.total(),
                        'time/log': lt.total(),
                        'time/run_mean': rt.average(),
                        'time/train_mean': tt.average(),
                        'time/eval_mean': et.average(),
                        'time/log_mean': lt.average(),
                    })
                agent.log(step)
                agent.save()
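Examples #8 through #11 time each phase with Timer objects that are used both as context managers and through last(), total(), and average(). The sketch below is consistent with that usage; it is an inference from the call sites, not the project's own Timer (which, for instance, is also instantiated without a with-block in Example #3).

import time

class Timer:
    """Minimal sketch of a phase timer, inferred from the call sites in these
    examples; the real Timer in xlnwel/d2rl may differ."""

    def __init__(self, name):
        self._name = name
        self._durations = []

    def __enter__(self):
        self._start = time.time()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self._durations.append(time.time() - self._start)

    def last(self):
        # duration of the most recent timed block
        return self._durations[-1]

    def total(self):
        return sum(self._durations)

    def average(self):
        return self.total() / len(self._durations) if self._durations else 0.0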
Example #9
File: train.py Project: xlnwel/d2rl
def train(agent, env, eval_env, buffer):
    def collect(env, step, reset, next_obs, **kwargs):
        buffer.add(**kwargs)

    step = agent.env_step
    runner = Runner(env, agent, step=step, nsteps=agent.N_STEPS)
    actsel = lambda *args, **kwargs: np.random.randint(
        0, env.action_dim, size=env.n_envs)
    if not agent.rnd_rms_restored():
        print('Start to initialize observation running stats...')
        for _ in range(50):
            runner.run(action_selector=actsel, step_fn=collect)
            agent.update_obs_rms(buffer['obs'])
            buffer.reset()
        buffer.clear()
        agent.save()
        runner.step = step

    to_log = Every(agent.LOG_PERIOD, agent.LOG_PERIOD)
    to_eval = Every(agent.EVAL_PERIOD)
    print('Training starts...')
    while step < agent.MAX_STEPS:
        start_env_step = agent.env_step
        with Timer('env') as rt:
            step = runner.run(step_fn=collect)
        agent.store(fps=(step - start_env_step) / rt.last())

        agent.record_last_env_output(runner.env_output)
        value_int, value_ext = agent.compute_value()
        obs = buffer.get_obs(runner.env_output.obs)
        assert obs.shape[:2] == (env.n_envs, agent.N_STEPS + 1)
        assert obs.dtype == np.uint8
        agent.update_obs_rms(obs[:, :-1])
        norm_obs = agent.normalize_obs(obs)
        # compute intrinsic reward from the next normalized obs
        reward_int = agent.compute_int_reward(norm_obs[:, 1:])
        agent.update_int_reward_rms(reward_int)
        reward_int = agent.normalize_int_reward(reward_int)
        buffer.finish(reward_int, norm_obs[:, :-1], value_int, value_ext)
        agent.store(
            reward_int_max=np.max(reward_int),
            reward_int_min=np.min(reward_int),
            reward_int=np.mean(reward_int),
            reward_int_std=np.std(reward_int),
        )

        start_train_step = agent.train_step
        with Timer('train') as tt:
            agent.learn_log(step)
        agent.store(tps=(agent.train_step - start_train_step) / tt.last())
        buffer.reset()

        if to_eval(agent.train_step):
            with TempStore(agent.get_states, agent.reset_states):
                scores, epslens, video = evaluate(eval_env,
                                                  agent,
                                                  record=True,
                                                  video_len=4500)
                video_summary(f'{agent.name}/sim', video, step=step)
                if eval_env.n_envs == 1:
                    rews_int, rews_ext = agent.retrieve_eval_rewards()
                    assert len(rews_ext) == len(rews_int) == video.shape[1], (
                        len(rews_ext), len(rews_int), video.shape[1])
                    n = 10
                    idxes_int = rews_int.argsort()[::-1][:n]
                    idxes_ext = rews_ext.argsort()[::-1][:n]
                    assert idxes_int.shape == idxes_ext.shape, (
                        idxes_int.shape, idxes_ext.shape)

                    imgs_int = video[0, idxes_int]
                    imgs_ext = video[0, idxes_ext]
                    rews_int = rews_int[idxes_int]
                    rews_ext = rews_ext[idxes_ext]
                    terms = {
                        **{
                            f'eval/reward_int_{i}': rews_int[i]
                            for i in range(0, n)
                        },
                        **{
                            f'eval/reward_ext_{i}': rews_ext[i]
                            for i in range(0, n)
                        },
                    }
                    agent.store(**terms)
                    imgs = np.concatenate([imgs_int[:n], imgs_ext[:n]], 0)
                    image_summary(f'{agent.name}/img', imgs, step=step)

                    # info = eval_env.info()[0]
                    # episode = info.get('episode', {'visited_rooms': 1})
                    # agent.store(visited_rooms_max=len(episode['visited_rooms']))
                    agent.histogram_summary(
                        {'eval/action': agent.retrieve_eval_actions()},
                        step=step)
                agent.store(eval_score=scores, eval_epslen=epslens)

        if to_log(agent.train_step) and agent.contains_stats('score'):
            agent.store(
                **{
                    'episodes': runner.episodes,
                    'train_step': agent.train_step,
                    'time/run': rt.total(),
                    'time/train': tt.total()
                })
            agent.log(step)
            agent.save()
Example #10
File: train.py Project: xlnwel/d2rl
def train(agent, env, eval_env, replay):
    collect_fn = pkg.import_module('agent', algo=agent.name).collect
    collect = functools.partial(collect_fn, replay)

    em = pkg.import_module(env.name.split("_")[0], pkg='env')
    info_func = em.info_func if hasattr(em, 'info_func') else None

    env_step = agent.env_step
    runner = Runner(env,
                    agent,
                    step=env_step,
                    run_mode=RunMode.TRAJ,
                    info_func=info_func)
    agent.TRAIN_PERIOD = env.max_episode_steps
    while not replay.good_to_learn():
        env_step = runner.run(step_fn=collect)
        replay.finish_episodes()

    to_eval = Every(agent.EVAL_PERIOD)
    to_log = Every(agent.LOG_PERIOD, agent.LOG_PERIOD)
    to_record = Every(agent.EVAL_PERIOD * 10)
    rt = Timer('run')
    tt = Timer('train')
    # et = Timer('eval')
    lt = Timer('log')
    print('Training starts...')
    while env_step <= int(agent.MAX_STEPS):
        with rt:
            env_step = runner.run(step_fn=collect)
        replay.finish_episodes()
        assert np.all(runner.env_output.reset), \
            (runner.env_output.reset, env.info().get('score', 0), env.info().get('epslen', 0))
        with tt:
            agent.learn_log(env_step)

        # if to_eval(env_step):
        #     with TempStore(agent.get_states, agent.reset_states):
        #         with et:
        #             record = agent.RECORD and to_record(env_step)
        #             eval_score, eval_epslen, video = evaluate(
        #                 eval_env, agent, n=agent.N_EVAL_EPISODES,
        #                 record=agent.RECORD, size=(64, 64))
        #             if record:
        #                 video_summary(f'{agent.name}/sim', video, step=env_step)
        #             agent.store(
        #                 eval_score=eval_score,
        #                 eval_epslen=eval_epslen)

        if to_log(env_step):
            with lt:
                fps = agent.TRAIN_PERIOD / rt.average()
                tps = agent.N_UPDATES / tt.average()

                agent.store(
                    env_step=agent.env_step,
                    train_step=agent.train_step,
                    fps=fps,
                    tps=tps,
                )
                agent.store(
                    **{
                        'train_step': agent.train_step,
                        'time/run': rt.total(),
                        'time/train': tt.total(),
                        # 'time/eval': et.total(),
                        'time/log': lt.total(),
                        'time/run_mean': rt.average(),
                        'time/train_mean': tt.average(),
                        # 'time/eval_mean': et.average(),
                        'time/log_mean': lt.average(),
                    })
                agent.log(env_step)
                agent.save()
Example #11
File: train.py Project: xlnwel/d2rl
def train(agent, env, eval_env, replay):
    collect_fn = pkg.import_module('agent', algo=agent.name).collect
    collect = functools.partial(collect_fn, replay)

    env_step = agent.env_step
    runner = Runner(env, agent, step=env_step, nsteps=agent.TRAIN_PERIOD)
    while not replay.good_to_learn():
        env_step = runner.run(
            # NOTE: random actions below make a huge difference for MuJoCo tasks.
            # By default, we don't use them, as it's not conventional practice.
            # action_selector=env.random_action,
            step_fn=collect)

    to_eval = Every(agent.EVAL_PERIOD)
    to_log = Every(agent.LOG_PERIOD, agent.LOG_PERIOD)
    to_record = Every(agent.EVAL_PERIOD * 10)
    rt = Timer('run')
    tt = Timer('train')
    et = Timer('eval')
    lt = Timer('log')
    print('Training starts...')
    while env_step <= int(agent.MAX_STEPS):
        with rt:
            env_step = runner.run(step_fn=collect)
        with tt:
            agent.learn_log(env_step)

        if to_eval(env_step):
            with TempStore(agent.get_states, agent.reset_states):
                with et:
                    record = agent.RECORD and to_record(env_step)
                    eval_score, eval_epslen, video = evaluate(
                        eval_env,
                        agent,
                        n=agent.N_EVAL_EPISODES,
                        record=agent.RECORD,
                        size=(64, 64))
                    if record:
                        video_summary(f'{agent.name}/sim',
                                      video,
                                      step=env_step)
                    agent.store(eval_score=eval_score, eval_epslen=eval_epslen)

        if to_log(env_step):
            with lt:
                fps = agent.TRAIN_PERIOD / rt.average()
                tps = agent.N_UPDATES / tt.average()

                agent.store(
                    env_step=agent.env_step,
                    train_step=agent.train_step,
                    fps=fps,
                    tps=tps,
                )
                agent.store(
                    **{
                        'train_step': agent.train_step,
                        'time/run': rt.total(),
                        'time/train': tt.total(),
                        'time/eval': et.total(),
                        'time/log': lt.total(),
                        'time/run_mean': rt.average(),
                        'time/train_mean': tt.average(),
                        'time/eval_mean': et.average(),
                        'time/log_mean': lt.average(),
                    })
                agent.log(env_step)
                agent.save()
Example #12
File: agent.py Project: xlnwel/d2rl
    def _add_attributes(self, env, dataset):
        super()._add_attributes(env, dataset)
        self._to_summary_value = Every(self.LOG_PERIOD, self.LOG_PERIOD)