Example #1
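A continuous-control training loop, apparently DDPG (results are saved under a ddpg_ prefix), with an RND-style intrinsic-motivation module. Each episode the agent acts with Gaussian exploration noise, extrinsic and intrinsic rewards and forward-model errors are accumulated, and the per-episode statistics are saved with numpy.save at the end of the run.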
    def run_rnd_model(self, agent, trial):
        config = self._config
        trial = trial + config.shift

        step_limit = int(config.steps * 1e6)
        steps = 0

        steps_per_episode = []
        train_fm_errors = []
        train_ext_rewards = []
        train_int_rewards = []
        reward_avg = RunningAverageWindow(100)
        step_avg = RunningAverageWindow(100)

        bar = ProgressBar(step_limit, max_width=40)
        exploration = GaussianExploration(
            config.sigma, 0.01, config.steps * config.exploration_time * 1e6)

        while steps < step_limit:
            state0 = torch.tensor(self._env.reset(),
                                  dtype=torch.float32).unsqueeze(0)
            done = False
            train_ext_reward = 0
            train_int_reward = 0
            train_steps = 0

            while not done:
                action0 = exploration.explore(agent.get_action(state0))
                next_state, reward, done, _ = self._env.step(
                    action0.squeeze(0).numpy())
                reward = self.transform_reward(reward)
                state1 = torch.tensor(next_state,
                                      dtype=torch.float32).unsqueeze(0)
                reward = torch.tensor([reward],
                                      dtype=torch.float32).unsqueeze(0)
                mask = torch.tensor([done], dtype=torch.float32).unsqueeze(0)

                agent.train(state0, action0, state1, reward, mask)
                train_steps += 1

                train_ext_reward += reward.item()
                train_int_reward += agent.motivation.reward(state0).item()
                train_fm_error = agent.motivation.error(state0).item()
                train_fm_errors.append(train_fm_error)

                state0 = state1

            steps += train_steps
            if steps > step_limit:
                train_steps -= steps - step_limit
            bar.numerator = steps
            exploration.update(steps)

            reward_avg.update(train_ext_reward)
            step_avg.update(train_steps)
            steps_per_episode.append(train_steps)
            train_ext_rewards.append(train_ext_reward)
            train_int_rewards.append(train_int_reward)

            print(
                'Run {0:d} step {1:d} sigma {2:f} training [ext. reward {3:f} int. reward {4:f} steps {5:d}] avg. ext. reward {6:f} avg. steps {7:f}'
                .format(trial, steps, exploration.sigma, train_ext_reward,
                        train_int_reward, train_steps, reward_avg.value(),
                        step_avg.value()))
            print(bar)

        agent.save('./models/{0:s}_{1}_{2:d}'.format(self._env_name,
                                                     config.model, trial))

        print('Saving data...')
        save_data = {
            'steps': numpy.array(steps_per_episode),
            're': numpy.array(train_ext_rewards),
            'ri': numpy.array(train_int_rewards),
            'fme': numpy.array(train_fm_errors[:step_limit])
        }
        numpy.save(
            'ddpg_{0}_{1}_{2:d}'.format(config.name, config.model, trial),
            save_data)
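The examples rely on a few small utilities that are not shown here. The sketch below is a minimal reconstruction of RunningAverageWindow and GaussianExploration inferred purely from how they are called in these snippets; the constructor arguments, the update/value and explore/update/sigma interfaces, and the linearly decaying sigma are assumptions, and the project's real implementations may differ. ProgressBar, with its numerator attribute, max_width argument and print(bar) usage, matches the interface of the third-party etaprogress package.

import numpy
import torch


class RunningAverageWindow:
    # Sliding-window average of the last `window` values; size > 1 keeps a vector per entry.
    def __init__(self, window, size=1):
        self._buffer = numpy.zeros((window, size), dtype=numpy.float32)
        self._index = 0
        self._count = 0

    def update(self, value):
        self._buffer[self._index] = value
        self._index = (self._index + 1) % self._buffer.shape[0]
        self._count = min(self._count + 1, self._buffer.shape[0])

    def value(self):
        window = self._buffer[:self._count] if self._count else self._buffer[:1]
        # Scalar windows return a numpy scalar (works with both '{:f}' formatting and .item()),
        # vector windows return a 1-D array (used with numpy.array2string elsewhere in the examples).
        return window.mean() if window.shape[1] == 1 else window.mean(axis=0)


class GaussianExploration:
    # Additive Gaussian action noise; sigma is assumed to decay linearly to sigma_min over decay_steps.
    def __init__(self, sigma, sigma_min, decay_steps):
        self.sigma = sigma
        self._sigma_max = sigma
        self._sigma_min = sigma_min
        self._decay_steps = decay_steps

    def explore(self, action):
        # Perturb the deterministic action with Gaussian noise scaled by the current sigma.
        return action + torch.randn_like(action) * self.sigma

    def update(self, steps):
        # Anneal sigma linearly with the number of environment steps taken so far.
        progress = min(steps / self._decay_steps, 1.0)
        self.sigma = self._sigma_max - (self._sigma_max - self._sigma_min) * progress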
Example #2
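The same DDPG-style loop, but the intrinsic reward comes from a combined forward and inverse model, and both error streams are logged per step. After training, the visited states are subsampled, sorted by distance, and used to build Euclidean distance matrices in observation space and in the encoder's latent space, which are saved alongside the reward curves.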
    def run_forward_inverse_model(self, agent, trial):
        config = self._config
        trial = trial + config.shift

        step_limit = int(config.steps * 1e6)
        steps = 0

        states = []

        steps_per_episode = []
        train_fm_errors = []
        train_im_errors = []
        train_ext_rewards = []
        train_int_rewards = []
        reward_avg = RunningAverageWindow(100)
        step_avg = RunningAverageWindow(100)

        bar = ProgressBar(step_limit, max_width=40)
        exploration = GaussianExploration(
            config.sigma, 0.01, config.steps * config.exploration_time * 1e6)

        while steps < step_limit:
            state0 = torch.tensor(self._env.reset(),
                                  dtype=torch.float32).unsqueeze(0)
            done = False
            train_ext_reward = 0
            train_int_reward = 0
            train_steps = 0

            while not done:
                train_steps += 1
                states.append(state0.squeeze(0))
                action0 = exploration.explore(agent.get_action(state0))
                next_state, reward, done, _ = self._env.step(
                    action0.squeeze(0).numpy())
                reward = self.transform_reward(reward)
                state1 = torch.tensor(next_state,
                                      dtype=torch.float32).unsqueeze(0)

                agent.train(state0, action0, state1, reward, done)

                train_ext_reward += reward
                train_int_reward += agent.motivation.reward(
                    state0, action0, state1).item()
                train_fm_error, train_im_error = agent.motivation.error(
                    state0, action0, state1)
                train_fm_errors.append(train_fm_error.item())
                train_im_errors.append(train_im_error.item())

                state0 = state1

            steps += train_steps
            if steps > step_limit:
                train_steps -= steps - step_limit
            bar.numerator = steps
            exploration.update(steps)

            reward_avg.update(train_ext_reward)
            step_avg.update(train_steps)
            steps_per_episode.append(train_steps)
            train_ext_rewards.append(train_ext_reward)
            train_int_rewards.append(train_int_reward)

            print(
                'Run {0:d} step {1:d} sigma {2:f} training [ext. reward {3:f} int. reward {4:f} steps {5:d}] avg. ext. reward {6:f} avg. steps {7:f}'
                .format(trial, steps, exploration.sigma, train_ext_reward,
                        train_int_reward, train_steps, reward_avg.value(),
                        step_avg.value()))
            print(bar)

        agent.save('./models/{0:s}_{1}_{2:d}'.format(self._env_name,
                                                     config.model, trial))

        print('Calculating distance matrices')
        states = self.generate_states(torch.stack(states[:step_limit]), 500)
        state_dist = cdist(states.flatten(1), states.flatten(1), 'euclidean')
        index_list = numpy.argsort(numpy.linalg.norm(state_dist, axis=1))
        states = states[index_list]
        state_dist = cdist(states.flatten(1), states.flatten(1), 'euclidean')
        latent_states = agent.network.encoder(states).detach()
        latent_dist = torch.cdist(latent_states, latent_states)

        print('Saving data...')
        save_data = {
            'steps': numpy.array(steps_per_episode),
            're': numpy.array(train_ext_rewards),
            'ri': numpy.array(train_int_rewards),
            'fme': numpy.array(train_fm_errors[:step_limit]),
            'ime': numpy.array(train_im_errors[:step_limit]),
            'sdm': state_dist,
            'ldm': latent_dist.numpy()
        }
        numpy.save(
            'ddpg_{0}_{1}_{2:d}'.format(config.name, config.model, trial),
            save_data)
Example #3
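A variant that draws the intrinsic reward from a VAE-based forward model taken from the agent's motivation module and additionally tracks the VAE loss per episode. Depending on configuration flags, it can also record visited states (generate_states) or, at the start of every episode, collect action/value/prediction-error/reward statistics on a fixed set of stored states (collect_stats) and save them to separate files.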
    def run_vae_forward_model(self, agent, trial):
        config = self._config
        trial = trial + config.shift
        forward_model = agent.get_motivation_module()
        vae = forward_model.get_fm_network()

        step_limit = int(config.steps * 1e6)
        steps = 0

        states = None
        if config.check('generate_states'):
            states = []
        if config.check('collect_stats'):
            states = torch.tensor(numpy.load('./{0:s}_states.npy'.format(
                self._env_name)),
                                  dtype=torch.float32)

        action_list = []
        value_list = []
        fm_error_list = []
        reward_list = []

        steps_per_episode = []
        train_fm_errors = []
        train_ext_rewards = []
        train_int_rewards = []
        train_vae_losses = []
        reward_avg = RunningAverageWindow(100)
        step_avg = RunningAverageWindow(100)

        bar = ProgressBar(step_limit, max_width=40)
        exploration = GaussianExploration(
            config.sigma, 0.01, config.steps * config.exploration_time * 1e6)

        while steps < step_limit:
            if config.check('collect_stats'):
                actions, values, fm_errors, rewards = self.fm_activations(
                    self._env, agent, forward_model, states)
                action_list.append(actions)
                value_list.append(values)
                fm_error_list.append(fm_errors)
                reward_list.append(rewards)

            state0 = torch.tensor(self._env.reset(),
                                  dtype=torch.float32).unsqueeze(0)
            done = False
            train_ext_reward = 0
            train_int_reward = 0
            train_vae_loss = 0
            train_steps = 0

            while not done:
                train_steps += 1
                if config.check('generate_states'):
                    states.append(state0.numpy())
                action0 = exploration.explore(agent.get_action(state0))
                next_state, reward, done, _ = self._env.step(
                    action0.squeeze(0).numpy())
                reward = self.transform_reward(reward)
                state1 = torch.tensor(next_state,
                                      dtype=torch.float32).unsqueeze(0)

                agent.train(state0, action0, state1, reward, done)
                forward_model.train(state0, action0, state1)

                train_ext_reward += reward
                train_int_reward += forward_model.reward(
                    state0, action0, state1).item()
                train_vae_loss += vae.loss_function(state0, action0,
                                                    state1).item()
                train_fm_error = forward_model.error(state0, action0,
                                                     state1).item()
                train_fm_errors.append(train_fm_error)

                state0 = state1

            steps += train_steps
            if steps > step_limit:
                train_steps -= steps - step_limit
            bar.numerator = steps
            exploration.update(steps)

            reward_avg.update(train_ext_reward)
            step_avg.update(train_steps)
            steps_per_episode.append(train_steps)
            train_ext_rewards.append(train_ext_reward)
            train_int_rewards.append(train_int_reward)
            train_vae_losses.append(train_vae_loss)

            print(
                'Run {0} step {1:d} sigma {2:f} training [ext. reward {3:f} int. reward {4:f} VAE loss {5:f} steps {6:d}] avg. ext. reward {7:f} avg. steps {8:f}'
                .format(trial, steps, exploration.sigma, train_ext_reward,
                        train_int_reward, train_vae_loss, train_steps,
                        reward_avg.value(), step_avg.value()))
            print(bar)

        agent.save('./models/{0:s}_{1}_{2:d}'.format(self._env_name,
                                                     config.model, trial))

        print('Saving data...')
        save_data = {
            'steps': numpy.array(steps_per_episode),
            're': numpy.array(train_ext_rewards),
            'ri': numpy.array(train_int_rewards),
            'fme': numpy.array(train_fm_errors[:step_limit]),
            'vl': numpy.array(train_vae_losses),
        }
        numpy.save(
            'ddpg_{0}_{1}_{2:d}'.format(config.name, config.model, trial),
            save_data)

        if config.check('generate_states'):
            self.generate_states(states)

        if config.check('collect_stats'):
            action_list = torch.stack(action_list)
            value_list = torch.stack(value_list)
            fm_error_list = torch.stack(fm_error_list)
            reward_list = torch.stack(reward_list)

            numpy.save(
                'ddpg_{0}_{1}_{2:d}_actions'.format(config.name, config.model,
                                                    trial), action_list)
            numpy.save(
                'ddpg_{0}_{1}_{2:d}_values'.format(config.name, config.model,
                                                   trial), value_list)
            numpy.save(
                'ddpg_{0}_{1}_{2:d}_prediction_errors'.format(
                    config.name, config.model, trial), fm_error_list)
            numpy.save(
                'ddpg_{0}_{1}_{2:d}_rewards'.format(config.name, config.model,
                                                    trial), reward_list)
Example #4
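A PPO baseline (results are saved under a ppo_ prefix) stepping n_env environments in parallel: episode rewards and lengths are accumulated per environment, finished environments are reset individually, and only per-episode step counts and extrinsic rewards are saved.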
    def run_baseline(self, agent, trial):
        config = self._config
        n_env = config.n_env
        trial = trial + config.shift
        step_counter = StepCounter(int(config.steps * 1e6))

        steps_per_episode = []
        train_ext_rewards = []
        train_ext_reward = numpy.zeros((n_env, 1), dtype=numpy.float32)
        train_steps = numpy.zeros((n_env, 1), dtype=numpy.int32)
        reward_avg = RunningAverageWindow(100)
        # time_avg = RunningAverageWindow(100)

        s = numpy.zeros((n_env,) + self._env.observation_space.shape, dtype=numpy.float32)
        for i in range(n_env):
            s[i] = self._env.reset(i)

        state0 = self.process_state(s)

        while step_counter.running():
            with torch.no_grad():
                value, action0, probs0 = agent.get_action(state0)

            # start = time.time()
            next_state, reward, done, info = self._env.step(agent.convert_action(action0.cpu()))
            # end = time.time()
            # time_avg.update(end - start)
            # print('Duration {0:.3f}s'.format(time_avg.value()))

            train_steps += 1
            train_ext_reward += reward

            env_indices = numpy.nonzero(numpy.squeeze(done, axis=1))[0]

            for i in env_indices:
                if step_counter.steps + train_steps[i] > step_counter.limit:
                    train_steps[i] = step_counter.limit - step_counter.steps
                step_counter.update(train_steps[i].item())

                # append scalars, not views, so the later reset to 0 does not overwrite the logged values
                steps_per_episode.append(train_steps[i].item())
                train_ext_rewards.append(train_ext_reward[i].item())
                reward_avg.update(train_ext_reward[i].item())

                print('Run {0:d} step {1:d} training [ext. reward {2:f} steps {3:d} avg. reward {4:f}]'.format(trial, step_counter.steps, train_ext_reward[i].item(), train_steps[i].item(), reward_avg.value()))
                step_counter.print()

                train_ext_reward[i] = 0
                train_steps[i] = 0

                next_state[i] = self._env.reset(i)

            state1 = self.process_state(next_state)
            reward = torch.tensor(reward, dtype=torch.float32)
            mask = 1 - torch.tensor(done, dtype=torch.float32)

            agent.train(state0, value, action0, probs0, state1, reward, mask)

            # s only holds the initial resets, so advance with the freshly processed next state
            state0 = state1

        agent.save('./models/{0:s}_{1}_{2:d}'.format(self._env_name, config.model, trial))

        print('Saving data...')
        save_data = {
            'steps': numpy.array(steps_per_episode),
            're': numpy.array(train_ext_rewards)
        }
        numpy.save('ppo_{0}_{1}_{2:d}'.format(config.name, config.model, trial), save_data)
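This and the following example also use a StepCounter helper that is not shown. A minimal sketch consistent with the calls made here (running(), update(), the steps and limit attributes and print()) follows; the progress output format is a guess.

class StepCounter:
    # Tracks the total number of environment steps against a fixed limit.
    def __init__(self, limit):
        self.limit = limit
        self.steps = 0

    def running(self):
        return self.steps < self.limit

    def update(self, steps):
        self.steps += steps

    def print(self):
        # Simple textual progress report; the original formatting is unknown.
        print('Steps {0:d}/{1:d} ({2:.2f}%)'.format(
            self.steps, self.limit, 100.0 * self.steps / self.limit))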
Example #5
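A set of PPO training loops from the same runner: run_forward_model steps n_env environments in worker threads and combines the extrinsic reward with a forward-model intrinsic reward; it is followed by single-environment variants run_baseline, run_dop_model (which also records how often each of four policy heads is chosen) and run_rnd_model (RND intrinsic reward).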
    def run_forward_model(self, agent, trial):
        config = self._config
        n_env = config.n_env
        trial = trial + config.shift
        step_counter = StepCounter(int(config.steps * 1e6))
        reward_avg = RunningAverageWindow(100)
        time_avg = RunningAverageWindow(100)

        train_ext_rewards = {'raw': [], 'train': []}
        train_ext_reward = {'raw': [0] * n_env, 'train': [0] * n_env}
        train_int_rewards = []
        train_int_reward = [0] * n_env
        train_fm_errors = []
        train_fm_error = [[] for _ in range(n_env)]
        train_steps = [0] * n_env

        s = numpy.zeros((n_env,) + self._input_shape, dtype=numpy.float32)
        a = [None] * n_env
        ns = numpy.zeros((n_env,) + self._input_shape, dtype=numpy.float32)
        r = numpy.zeros((n_env, 1), dtype=numpy.float32)
        d = numpy.zeros((n_env, 1), dtype=numpy.float32)

        inputs = []
        for i in range(n_env):
            inputs.append(
                (i, trial, agent, a, train_ext_reward, train_int_reward, train_fm_error, train_steps, s, ns, r, d, step_counter, train_ext_rewards, train_int_rewards, train_fm_errors, reward_avg))

        for i in range(n_env):
            s[i] = self._env_list[i].reset()

        state0 = self.process_state(s)

        while step_counter.running():
            value, action0, probs0 = agent.get_action(state0)

            for i in range(n_env):
                a[i] = agent.convert_action(action0[i])

            # start = time.time()
            with ThreadPoolExecutor(max_workers=config.num_threads) as executor:
                executor.map(self.one_step_forward_model, inputs)

            # end = time.time()
            # time_avg.update(end - start)
            # print('Duration {0:.3f}s'.format(time_avg.value()))

            state1 = self.process_state(ns)

            fm_error = agent.motivation.error(state0, action0, state1)
            fm_reward = agent.motivation.reward(error=fm_error)

            for i in range(n_env):
                train_int_reward[i] += fm_reward[i].item()
                train_fm_error[i].append(fm_error[i].item())

            reward = torch.stack([torch.tensor(r, dtype=torch.float32), fm_reward.cpu()], dim=1).squeeze(-1)
            done = torch.tensor(d, dtype=torch.float32)

            agent.train(state0, value, action0, probs0, state1, reward, done)

            state0 = self.process_state(s)

        agent.save('./models/{0:s}_{1}_{2:d}'.format(self._env_name, config.model, trial))

        print('Saving data...')
        save_data = {
            're': numpy.array(train_ext_rewards['train']),
            're_raw': numpy.array(train_ext_rewards['raw']),
            'ri': numpy.array(train_int_rewards),
            'fme': numpy.array(train_fm_errors[:step_counter.limit])
        }
        numpy.save('ppo_{0}_{1}_{2:d}'.format(config.name, config.model, trial), save_data)

    def run_baseline(self, agent, trial):
        config = self._config
        trial = trial + config.shift
        step_limit = int(config.steps * 1e6)
        steps = 0
        bar = ProgressBar(step_limit, max_width=80)

        steps_per_episode = []
        train_ext_rewards = []
        reward_avg = RunningAverageWindow(100)
        prob_avg = RunningAverageWindow(1000, 4)
        value_avg = RunningAverageWindow(1000)

        while steps < step_limit:
            state0 = self.process_state(self._env.reset())
            done = False
            train_ext_reward = 0
            train_steps = 0

            while not done:
                value, action0, probs0 = agent.get_action(state0)
                value_avg.update(value.numpy())
                prob_avg.update(probs0.numpy())
                next_state, reward, done, info = self._env.step(
                    agent.convert_action(action0.cpu()))

                if isinstance(reward, numpy.ndarray):
                    reward = reward[0]
                reward = torch.tensor([reward],
                                      dtype=torch.float32).unsqueeze(-1)

                state1 = self.process_state(next_state)
                mask = torch.tensor([1], dtype=torch.float32)
                if done:
                    mask[0] = 0
                mask = mask.unsqueeze(-1)
                agent.train(state0, value, action0, probs0, state1, reward,
                            mask)
                state0 = state1

                if info is not None and 'raw_score' in info:
                    train_ext_reward += info['raw_score']
                else:
                    train_ext_reward += reward.item()
                # train_ext_reward += reward
                train_steps += 1

            if steps + train_steps > step_limit:
                train_steps = step_limit - steps
            steps += train_steps
            bar.numerator = steps

            steps_per_episode.append(train_steps)
            train_ext_rewards.append(train_ext_reward)
            reward_avg.update(train_ext_reward)

            print(
                'Run {0:d} step {1:d} training [ext. reward {2:f} steps {3:d} mean reward {4:f}] prob {5:s} value {6:f}'
                .format(trial, steps, train_ext_reward, train_steps,
                        reward_avg.value().item(),
                        numpy.array2string(prob_avg.value()),
                        value_avg.value().item()))
            print(bar)

        agent.save('./models/{0:s}_{1}_{2:d}'.format(self._env_name,
                                                     config.model, trial))

        print('Saving data...')
        save_data = {
            'steps': numpy.array(steps_per_episode),
            're': numpy.array(train_ext_rewards)
        }
        numpy.save(
            'ppo_{0}_{1}_{2:d}'.format(config.name, config.model, trial),
            save_data)

    def run_dop_model(self, agent, trial):
        config = self._config
        trial = trial + config.shift

        step_limit = int(config.steps * 1e6)
        steps = 0

        steps_per_episode = []
        train_fm_errors = []
        train_ext_rewards = []
        train_int_rewards = []
        train_head_index = []

        bar = ProgressBar(step_limit, max_width=80)
        reward_avg = RunningAverageWindow(100)

        while steps < step_limit:
            head_index_density = numpy.zeros(4)
            state0 = self.process_state(self._env.reset())
            done = False
            train_ext_reward = 0
            train_int_reward = 0
            train_error = 0
            train_steps = 0

            while not done:
                train_steps += 1
                value, action0, probs0 = agent.get_action(state0)
                agent.motivation.update_state_average(state0)
                action0, head_index = action0
                next_state, reward, done, info = self._env.step(
                    agent.convert_action(action0))
                state1 = self.process_state(next_state)
                reward = torch.tensor([reward],
                                      dtype=torch.float32).unsqueeze(0)
                mask = torch.tensor([1], dtype=torch.float32)
                if done:
                    mask[0] = 0

                agent.train(state0, value, action0, probs0, state1, reward,
                            mask)

                train_ext_reward += reward.item()
                train_int_reward += agent.motivation.reward(state0,
                                                            probs0).item()
                train_fm_error = agent.motivation.error(state0, probs0).item()
                train_error += train_fm_error
                train_fm_errors.append(train_fm_error)
                head_index_density[head_index.item()] += 1

                state0 = state1

            steps += train_steps
            if steps > step_limit:
                train_steps -= steps - step_limit
            bar.numerator = steps

            steps_per_episode.append(train_steps)
            train_ext_rewards.append(train_ext_reward)
            train_int_rewards.append(train_int_reward)
            train_head_index.append(head_index_density)
            reward_avg.update(train_ext_reward)

            print(
                'Run {0:d} step {1:d} training [ext. reward {2:f} error {3:f} steps {4:d} ({5:f} err/step) mean reward {6:f} density {7:s}]'
                .format(trial, steps, train_ext_reward, train_error,
                        train_steps, train_error / train_steps,
                        reward_avg.value(),
                        numpy.array2string(head_index_density)))
            print(bar)

        agent.save('./models/{0:s}_{1}_{2:d}'.format(self._env_name,
                                                     config.model, trial))

        print('Saving data...')
        save_data = {
            'steps': numpy.array(steps_per_episode),
            're': numpy.array(train_ext_rewards),
            'ri': numpy.array(train_int_rewards),
            'fme': numpy.array(train_fm_errors[:step_limit]),
            'hid': numpy.stack(train_head_index)
        }
        numpy.save(
            'ppo_{0}_{1}_{2:d}'.format(config.name, config.model, trial),
            save_data)

    def run_rnd_model(self, agent, trial):
        config = self._config
        trial = trial + config.shift

        step_limit = int(config.steps * 1e6)
        steps = 0

        steps_per_episode = []
        train_fm_errors = []
        train_ext_rewards = []
        train_int_rewards = []

        bar = ProgressBar(step_limit, max_width=80)
        reward_avg = RunningAverageWindow(100)

        while steps < step_limit:
            state0 = self.process_state(self._env.reset())
            done = False
            train_ext_reward = 0
            train_int_reward = 0
            train_steps = 0

            while not done:
                train_steps += 1
                value, action0, probs0 = agent.get_action(state0)
                agent.motivation.update_state_average(state0)
                next_state, reward, done, info = self._env.step(
                    agent.convert_action(action0.cpu()))
                state1 = self.process_state(next_state)
                ext_reward = torch.tensor([reward],
                                          dtype=torch.float32).unsqueeze(0)
                int_reward = agent.motivation.reward(state0).cpu()
                reward = torch.cat([ext_reward, int_reward], dim=1)
                mask = torch.tensor([1], dtype=torch.float32)
                if done:
                    mask[0] = 0
                mask = mask.unsqueeze(-1)

                agent.train(state0, value, action0, probs0, state1, reward,
                            mask)

                train_ext_reward += ext_reward.item()
                train_int_reward += int_reward.item()
                train_fm_error = agent.motivation.error(state0).item()
                train_fm_errors.append(train_fm_error)

                state0 = state1

            steps += train_steps
            if steps > step_limit:
                train_steps -= steps - step_limit
            bar.numerator = steps

            steps_per_episode.append(train_steps)
            train_ext_rewards.append(train_ext_reward)
            train_int_rewards.append(train_int_reward)
            reward_avg.update(train_ext_reward)

            print(
                'Run {0:d} step {1:d} training [ext. reward {2:f} int. reward {3:f} steps {4:d} ({5:f})  mean reward {6:f}]'
                .format(trial, steps, train_ext_reward, train_int_reward,
                        train_steps, train_int_reward / train_steps,
                        reward_avg.value()))
            print(bar)

        agent.save('./models/{0:s}_{1}_{2:d}'.format(self._env_name,
                                                     config.model, trial))

        print('Saving data...')
        save_data = {
            'steps': numpy.array(steps_per_episode),
            're': numpy.array(train_ext_rewards),
            'ri': numpy.array(train_int_rewards),
            'fme': numpy.array(train_fm_errors[:step_limit])
        }
        numpy.save(
            'ppo_{0}_{1}_{2:d}'.format(config.name, config.model, trial),
            save_data)