Example #1
        class Agent:
            def __init__(self, initial_state, target):
                self.env = THORDiscreteEnvironment(
                    scene_name=scene_name,
                    initial_state_id=initial_state,
                    terminal_state_id=target,
                    h5_file_path=(lambda scene: parent.config["h5_file_path"].replace("{scene}", scene_name))
                )

                self.env.reset()
                self.net = net
    def run(self):
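        # Evaluate each scene-specific policy head on every target listed in TASK_LIST.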
        scene_stats = dict()
        resultData = []
        for scene_scope, items in TASK_LIST.items():
            scene_net = self.scene_nets[scene_scope]
            scene_stats[scene_scope] = list()
            for task_scope in items:
                env = THORDiscreteEnvironment(
                    scene_name=scene_scope,
                    h5_file_path=(lambda scene: self.config.get("h5_file_path", "D:\\datasets\\visual_navigation_precomputed\\{scene}.h5").replace('{scene}', scene)),
                    terminal_state_id=int(task_scope)
                )

                ep_rewards = []
                ep_lengths = []
                ep_collisions = []
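                # Roll out NUM_EVAL_EPISODES evaluation episodes for this target.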
                for i_episode in range(NUM_EVAL_EPISODES):
                    env.reset()
                    terminal = False
                    ep_reward = 0
                    ep_collision = 0
                    ep_t = 0
                    while not terminal:
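                        # Fetch precomputed ResNet features for the current view and the goal view, then query the actor-critic.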
                        state = torch.Tensor(env.render(mode='resnet_features'))
                        target = torch.Tensor(env.render_target(mode='resnet_features'))
                        (policy, value,) = scene_net.forward(self.shared_net.forward((state, target,)))

                        with torch.no_grad():
                            action = F.softmax(policy, dim=0).multinomial(1).data.numpy()[0]
                        env.step(action)
                        terminal = env.is_terminal

                        if ep_t == 10000: break
                        if env.collided: ep_collision += 1
                        ep_reward += env.reward
                        ep_t += 1

                    ep_lengths.append(ep_t)
                    ep_rewards.append(ep_reward)
                    ep_collisions.append(ep_collision)
                    if VERBOSE: print("episode #{} ends after {} steps".format(i_episode, ep_t))

                print('evaluation: %s %s' % (scene_scope, task_scope))
                print('mean episode reward: %.2f' % np.mean(ep_rewards))
                print('mean episode length: %.2f' % np.mean(ep_lengths))
                print('mean episode collision: %.2f' % np.mean(ep_collisions))
                scene_stats[scene_scope].extend(ep_lengths)
                resultData.append((scene_scope, str(task_scope), np.mean(ep_rewards), np.mean(ep_lengths), np.mean(ep_collisions),))

        print('\nResults (average trajectory length):')
        for scene_scope in scene_stats:
            print('%s: %.2f steps'%(scene_scope, np.mean(scene_stats[scene_scope])))
        
        if 'csv_file' in self.config and self.config['csv_file'] is not None:
            export_to_csv(resultData, self.config['csv_file'])
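
# Hypothetical sketch of the export_to_csv helper used above; the project's real
# implementation is not shown in these examples. It only assumes that each entry
# of resultData is a flat tuple of scene, task and averaged metrics.
import csv

def export_to_csv(rows, csv_file_path):
    # Write one CSV row per (scene, task) evaluation result.
    with open(csv_file_path, 'w', newline='') as f:
        csv.writer(f).writerows(rows)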
Example #3
    def _initialize_thread(self):
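        # Bind the scene into the HDF5 path, build the environment, and create this
        # worker's local actor-critic (shared backbone + scene-specific head).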
        h5_file_path = self.init_args.get('h5_file_path')
        # self.logger = logging.getLogger('agent')
        # self.logger.setLevel(logging.INFO)
        self.init_args['h5_file_path'] = lambda scene: h5_file_path.replace('{scene}', scene)
        self.env = THORDiscreteEnvironment(self.scene, **self.init_args)
        self.gamma: float = self.init_args.get('gamma', 0.99)
        self.grad_norm: float = self.init_args.get('grad_norm', 40.0)
        entropy_beta: float = self.init_args.get('entropy_beta', 0.01)
        self.max_t: int = self.init_args.get('max_t', 1)  # TODO: 5
        self.local_t = 0
        self.action_space_size = self.get_action_space_size()

        self.criterion = ActorCriticLoss(entropy_beta)
        self.policy_network = nn.Sequential(SharedNetwork(), SceneSpecificNetwork(self.get_action_space_size()))

        # Initialize the episode
        self._reset_episode()
        self._sync_network()
Example #4
        class Agent:
            def __init__(self, initial_state, target):
                self.env = THORDiscreteEnvironment(
                    scene_name=scene_name,
                    initial_state_id=initial_state,
                    terminal_state_id=target,
                    h5_file_path=(lambda scene: parent.config["h5_file_path"].replace("{scene}", scene_name))
                )

                self.env.reset()
                self.net = net

            @staticmethod
            def get_parameters():
                return net.parameters()

            def act(self):
                with torch.no_grad():
                    state = torch.Tensor(self.env.render(mode='resnet_features')).to(parent.device)
                    target = torch.Tensor(self.env.render_target(mode='resnet_features')).to(parent.device)
                    (policy, value,) = net.forward((state, target,))
                    action = F.softmax(policy, dim=0).multinomial(1).cpu().data.numpy()[0]

                self.env.step(action)
                return (self.env.is_terminal, self.env.collided, self.env.reward)
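
        # Hypothetical usage inside the enclosing scope (scene_name, net and parent
        # come from the surrounding method in the original code):
        #   agent = Agent(initial_state=some_state_id, target=some_target_id)
        #   terminal, collided, reward = agent.act()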
class TrainingThread(mp.Process):
    def __init__(self, id: int, network: torch.nn.Module, saver, optimizer,
                 scene: str, **kwargs):

        super(TrainingThread, self).__init__()

        # Initialize the environment
        self.env = None
        self.init_args = kwargs
        self.scene = scene
        self.saver = saver
        self.local_backbone_network = SharedNetwork()
        self.id = id

        self.master_network = network
        self.optimizer = optimizer

    def _sync_network(self):
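        # Pull the latest weights from the shared master network into this worker's local copy.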
        self.policy_network.load_state_dict(self.master_network.state_dict())

    def _ensure_shared_grads(self):
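        # Standard A3C gradient hand-off: point the shared parameters' gradients at this
        # worker's local gradients, unless they have already been populated.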
        for param, shared_param in zip(self.policy_network.parameters(),
                                       self.master_network.parameters()):
            if shared_param.grad is not None:
                return
            shared_param._grad = param.grad

    def get_action_space_size(self):
        return len(self.env.actions)

    def _initialize_thread(self):
        h5_file_path = self.init_args.get('h5_file_path')
        # self.logger = logging.getLogger('agent')
        # self.logger.setLevel(logging.INFO)
        self.init_args['h5_file_path'] = lambda scene: h5_file_path.replace(
            '{scene}', scene)
        self.env = THORDiscreteEnvironment(self.scene, **self.init_args)
        self.gamma: float = self.init_args.get('gamma', 0.99)
        self.grad_norm: float = self.init_args.get('grad_norm', 40.0)
        entropy_beta: float = self.init_args.get('entropy_beta', 0.01)
        self.max_t: int = self.init_args.get('max_t', 1)  # TODO: 5
        self.local_t = 0
        self.action_space_size = self.get_action_space_size()

        self.criterion = ActorCriticLoss(entropy_beta)
        self.policy_network = nn.Sequential(
            SharedNetwork(),
            SceneSpecificNetwork(self.get_action_space_size()))

        # Initialize the episode
        self._reset_episode()
        self._sync_network()

    def _reset_episode(self):
        self.episode_reward = 0
        self.episode_length = 0
        self.episode_max_q = -np.inf
        self.env.reset()

    def _forward_explore(self):
        # Does the evaluation end naturally?
        is_terminal = False
        terminal_end = False

        results = {"policy": [], "value": []}
        rollout_path = {"state": [], "action": [], "rewards": [], "done": []}

        # Plays out one game to end or max_t
        for t in range(self.max_t):
            state = {
                "current": self.env.render('resnet_features'),
                "goal": self.env.render_target('resnet_features'),
            }

            x_processed = torch.from_numpy(state["current"])
            goal_processed = torch.from_numpy(state["goal"])

            (policy, value) = self.policy_network((
                x_processed,
                goal_processed,
            ))

            # Store raw network output to use in backprop
            results["policy"].append(policy)
            results["value"].append(value)

            with torch.no_grad():
                action = F.softmax(policy, dim=0).multinomial(1).item()

            policy = policy.data.numpy()
            value = value.data.numpy()

            # Makes the step in the environment
            self.env.step(action)

            # Receives the game reward
            is_terminal = self.env.is_terminal

            # ad-hoc reward for navigation
            reward = 10.0 if is_terminal else -0.01

            # Max episode length
            if self.episode_length > 5e3: is_terminal = True

            # Update episode stats
            self.episode_length += 1
            self.episode_reward += reward
            self.episode_max_q = max(self.episode_max_q, np.max(value))

            # clip reward
            reward = np.clip(reward, -1, 1)

            # Increase local time
            self.local_t += 1

            rollout_path["state"].append(state)
            rollout_path["action"].append(action)
            rollout_path["rewards"].append(reward)
            rollout_path["done"].append(is_terminal)

            if is_terminal:
                # TODO: add logging
                print('playout finished')
                print(f'episode length: {self.episode_length}')
                print(f'episode reward: {self.episode_reward}')
                print(f'episode max_q: {self.episode_max_q}')

                terminal_end = True
                self._reset_episode()
                break

        if terminal_end:
            return 0.0, results, rollout_path
        else:
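            # The rollout hit max_t without terminating: bootstrap the return from the
            # critic's value estimate of the current state.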
            x_processed = torch.from_numpy(self.env.render('resnet_features'))
            goal_processed = torch.from_numpy(
                self.env.render_target('resnet_features'))

            (_, value) = self.policy_network((
                x_processed,
                goal_processed,
            ))
            return value.data.item(), results, rollout_path

    def _optimize_path(self, playout_reward: float, results, rollout_path):
        policy_batch = []
        value_batch = []
        action_batch = []
        temporary_difference_batch = []
        playout_reward_batch = []

        for i in reversed(range(len(results["value"]))):
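            # Accumulate the discounted return backwards through the rollout and form
            # the TD-error advantage used by the actor-critic loss.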
            reward = rollout_path["rewards"][i]
            value = results["value"][i]
            action = rollout_path["action"][i]

            playout_reward = reward + self.gamma * playout_reward
            temporary_difference = playout_reward - value.data.item()

            policy_batch.append(results['policy'][i])
            value_batch.append(results['value'][i])
            action_batch.append(action)
            temporary_difference_batch.append(temporary_difference)
            playout_reward_batch.append(playout_reward)

        policy_batch = torch.stack(policy_batch, 0)
        value_batch = torch.stack(value_batch, 0)
        action_batch = torch.from_numpy(np.array(action_batch, dtype=np.int64))
        temporary_difference_batch = torch.from_numpy(
            np.array(temporary_difference_batch, dtype=np.float32))
        playout_reward_batch = torch.from_numpy(
            np.array(playout_reward_batch, dtype=np.float32))

        # Compute loss
        loss = self.criterion.forward(policy_batch, value_batch, action_batch,
                                      temporary_difference_batch,
                                      playout_reward_batch)
        loss = loss.sum()

        loss_value = loss.detach().numpy()
        self.optimizer.optimize(loss, self.policy_network.parameters(),
                                self.master_network.parameters())

    def run(self, master=None):
        signal.signal(signal.SIGINT, signal.SIG_IGN)
        print(f'Thread {self.id} ready')

        # We need to silence all errors on new process
        h5py._errors.silence_errors()
        self._initialize_thread()

        if master is not None:
            print(f'Master thread {self.id} started')
        else:
            print(f'Thread {self.id} started')

        try:
            self.env.reset()
            while True:
                self._sync_network()
                # Plays some samples
                playout_reward, results, rollout_path = self._forward_explore()
                # Train on collected samples
                self._optimize_path(playout_reward, results, rollout_path)

                print(f'Step finished {self.optimizer.get_global_step()}')

                # Trigger save or other
                self.saver.after_optimization()
        except Exception as e:
            print(e)
            # TODO: add logging
            #self.logger.error(e.msg)
            raise e
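
# Hypothetical launch sketch (not part of the source): one way the TrainingThread
# workers above could be started. make_saver() and make_shared_optimizer() are
# placeholders for the project's own checkpointing helper (providing
# after_optimization()) and shared optimizer (providing optimize() and
# get_global_step()); the action count, scene name and h5 path pattern are
# assumptions for illustration only.
if __name__ == '__main__':
    master_network = nn.Sequential(SharedNetwork(), SceneSpecificNetwork(4))  # 4 = assumed action count
    master_network.share_memory()             # share weights across worker processes
    saver = make_saver()                       # placeholder
    optimizer = make_shared_optimizer()        # placeholder
    workers = [
        TrainingThread(
            id=i,
            network=master_network,
            saver=saver,
            optimizer=optimizer,
            scene='bedroom_04',                    # assumed scene name
            h5_file_path='/data/thor/{scene}.h5',  # assumed path pattern
            max_t=5)
        for i in range(4)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()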
Example #6
    def run(self):
        scene_stats = dict()
        resultData = []
        for scene_scope, items in TASK_LIST.items():
            if len(self.config['test_scenes']) != 0 and scene_scope not in self.config['test_scenes']:
                continue

            scene_net = self.scene_nets[scene_scope]
            scene_stats[scene_scope] = list()
            for task_scope in items:
                env = THORDiscreteEnvironment(
                    scene_name=scene_scope,
                    h5_file_path=(lambda scene: self.config.get("h5_file_path", "D:\\datasets\\visual_navigation_precomputed\\{scene}.h5").replace('{scene}', scene)),
                    terminal_state_id=int(task_scope),
                )

                graph = env._get_graph_handle()
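                # hitting_times caps the episode length below; shortest_path_distance
                # is used to normalize trajectory lengths.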
                hitting_times = graph['hitting_times'][()]
                shortest_paths = graph['shortest_path_distance'][()]

                ep_rewards = []
                ep_lengths = []
                ep_collisions = []
                ep_normalized_lengths = []
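                # Evaluate one episode from every initial state available for this target.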
                for (i_episode, start) in enumerate(env.get_initial_states(int(task_scope))):
                    env.reset(initial_state_id = start)
                    terminal = False
                    ep_reward = 0
                    ep_collision = 0
                    ep_t = 0
                    hitting_time = hitting_times[start, int(task_scope)]
                    shortest_path = shortest_paths[start, int(task_scope)]

                    while not terminal:
                        state = torch.Tensor(env.render(mode='resnet_features'))
                        target = torch.Tensor(env.render_target(mode='resnet_features'))
                        (policy, value,) = scene_net.forward(self.shared_net.forward((state, target,)))

                        with torch.no_grad():
                            action = F.softmax(policy, dim=0).multinomial(1).data.numpy()[0]
                        env.step(action)
                        terminal = env.is_terminal

                        if ep_t == hitting_time: break
                        if env.collided: ep_collision += 1
                        ep_reward += env.reward
                        ep_t += 1

                    ep_lengths.append(ep_t)
                    ep_rewards.append(ep_reward)
                    ep_collisions.append(ep_collision)
                    ep_normalized_lengths.append(min(ep_t, hitting_time) / shortest_path)
                    if VERBOSE: print("episode #{} ends after {} steps".format(i_episode, ep_t))

                print('evaluation: %s %s' % (scene_scope, task_scope))
                print('mean episode reward: %.2f' % np.mean(ep_rewards))
                print('mean episode length: %.2f' % np.mean(ep_lengths))
                print('mean episode collision: %.2f' % np.mean(ep_collisions))
                print('mean normalized episode length: %.2f' % np.mean(ep_normalized_lengths))
                scene_stats[scene_scope].extend(ep_lengths)
                resultData.append((scene_scope, str(task_scope), np.mean(ep_rewards), np.mean(ep_lengths), np.mean(ep_collisions), np.mean(ep_normalized_lengths),))

        print('\nResults (average trajectory length):')
        for scene_scope in scene_stats:
            print('%s: %.2f steps'%(scene_scope, np.mean(scene_stats[scene_scope])))
        
        if 'csv_file' in self.config and self.config['csv_file'] is not None:
            export_to_csv(resultData, self.config['csv_file'])
def convertToStateDict(data):
    return {key: torch.Tensor(v) for (key, v) in data.items()}


shared_net.load_state_dict(convertToStateDict(data['navigation']))
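# Load each scene-specific head from its own entry in the checkpoint data.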
for key in TASK_LIST.keys():
    scene_nets[key].load_state_dict(
        convertToStateDict(data[f'navigation/{key}']))

scene_stats = dict()
for scene_scope, items in TASK_LIST.items():
    scene_net = scene_nets[scene_scope]
    scene_stats[scene_scope] = list()
    for task_scope in items:
        env = THORDiscreteEnvironment(
            scene_name=scene_scope,
            h5_file_path=(
                lambda scene:
                f"D:\\datasets\\visual_navigation_precomputed\\{scene}.h5"),
            terminal_state_id=int(task_scope))

        ep_rewards = []
        ep_lengths = []
        ep_collisions = []
        for i_episode in range(NUM_EVAL_EPISODES):
            env.reset()
            terminal = False
            ep_reward = 0
            ep_collision = 0
            ep_t = 0
            while not terminal:
                state = torch.Tensor(env.render(mode='resnet_features'))
                target = torch.Tensor(
                    env.render_target(mode='resnet_features'))