Example #1
    def test_simple_distribution(self):
        simple_action_space = gym.spaces.Discrete(3)
        simple_num_logits = calc_num_logits(simple_action_space)
        self.assertEqual(simple_num_logits, simple_action_space.n)

        simple_logits = torch.rand(self.batch_size, simple_num_logits)
        simple_action_distribution = get_action_distribution(simple_action_space, simple_logits)

        simple_actions = simple_action_distribution.sample()
        self.assertEqual(list(simple_actions.shape), [self.batch_size])
        self.assertTrue(all(0 <= a < simple_action_space.n for a in simple_actions))
Example #2
    def test_tuple_sanity_check(self):
        num_spaces, num_actions = 3, 2
        simple_space = gym.spaces.Discrete(num_actions)
        spaces = [simple_space for _ in range(num_spaces)]
        tuple_space = gym.spaces.Tuple(spaces)

        self.assertEqual(calc_num_logits(tuple_space), num_spaces * num_actions)

        simple_logits = torch.zeros(1, num_actions)
        tuple_logits = torch.zeros(1, calc_num_logits(tuple_space))

        simple_distr = get_action_distribution(simple_space, simple_logits)
        tuple_distr = get_action_distribution(tuple_space, tuple_logits)

        tuple_entropy = tuple_distr.entropy()
        self.assertEqual(tuple_entropy, simple_distr.entropy() * num_spaces)

        simple_logprob = simple_distr.log_prob(torch.ones(1))
        tuple_logprob = tuple_distr.log_prob(torch.ones(1, num_spaces))
        self.assertEqual(tuple_logprob, simple_logprob * num_spaces)
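The identities checked above follow from the independence of the sub-distributions; below is a minimal standalone sanity check using plain torch.distributions rather than the project's own distribution classes (a sketch for intuition only):

# Minimal sketch: for independent Categorical components, entropies and
# log-probabilities are additive across the components of a tuple action.
import torch
from torch.distributions import Categorical

num_spaces, num_actions = 3, 2
single = Categorical(logits=torch.zeros(1, num_actions))
parts = [Categorical(logits=torch.zeros(1, num_actions)) for _ in range(num_spaces)]

joint_entropy = sum(p.entropy() for p in parts)
assert torch.allclose(joint_entropy, single.entropy() * num_spaces)

actions = torch.ones(1, num_spaces, dtype=torch.long)
joint_log_prob = sum(p.log_prob(actions[:, i]) for i, p in enumerate(parts))
assert torch.allclose(joint_log_prob, single.log_prob(torch.ones(1, dtype=torch.long)) * num_spaces)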
Example #3
    def test_gumbel_trick(self):
        """
        We use a Gumbel noise which seems to be faster compared to using pytorch multinomial.
        Here we test that those are actually equivalent.
        """

        timing = Timing()

        torch.backends.cudnn.enabled = True
        torch.backends.cudnn.benchmark = True

        with torch.no_grad():
            action_space = gym.spaces.Discrete(8)
            num_logits = calc_num_logits(action_space)
            device_type = 'cpu'
            device = torch.device(device_type)
            logits = torch.rand(self.batch_size, num_logits, device=device) * 10.0 - 5.0

            if device_type == 'cuda':
                torch.cuda.synchronize(device)

            count_gumbel, count_multinomial = np.zeros([action_space.n]), np.zeros([action_space.n])

            # estimate probability mass by actually sampling both ways
            num_samples = 20000

            # warm-up: run the Gumbel sampling path once before timing
            action_distribution = get_action_distribution(action_space, logits)
            sample_actions_log_probs(action_distribution)
            action_distribution.sample_gumbel()

            with timing.add_time('gumbel'):
                for i in range(num_samples):
                    action_distribution = get_action_distribution(action_space, logits)
                    samples_gumbel = action_distribution.sample_gumbel()
                    count_gumbel[samples_gumbel[0]] += 1

            # warm-up for the multinomial sampling path
            action_distribution = get_action_distribution(action_space, logits)
            action_distribution.sample()

            with timing.add_time('multinomial'):
                for i in range(num_samples):
                    action_distribution = get_action_distribution(action_space, logits)
                    samples_multinomial = action_distribution.sample()
                    count_multinomial[samples_multinomial[0]] += 1

            estimated_probs_gumbel = count_gumbel / float(num_samples)
            estimated_probs_multinomial = count_multinomial / float(num_samples)

            # both estimates should approximate the same softmax distribution
            self.assertTrue(np.allclose(estimated_probs_gumbel, estimated_probs_multinomial, atol=0.05))

            log.debug('Gumbel estimated probs: %r', estimated_probs_gumbel)
            log.debug('Multinomial estimated probs: %r', estimated_probs_multinomial)
            log.debug('Sampling timing: %s', timing)
            time.sleep(0.1)  # give logging a moment to finish
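For reference, a minimal self-contained sketch of the Gumbel-max trick the test above relies on (plain torch, not the project's ActionDistribution API): adding independent Gumbel(0, 1) noise to the logits and taking the argmax yields exact samples from the corresponding categorical distribution.

# Minimal sketch of the Gumbel-max trick, independent of the project's code:
# argmax(logits + Gumbel noise) is distributed exactly as Categorical(logits).
import torch

def sample_gumbel_max(logits: torch.Tensor) -> torch.Tensor:
    # Gumbel(0, 1) noise via inverse transform sampling
    u = torch.rand_like(logits)
    gumbel_noise = -torch.log(-torch.log(u + 1e-20) + 1e-20)
    return torch.argmax(logits + gumbel_noise, dim=-1)

logits = torch.rand(4, 8) * 10.0 - 5.0
actions = sample_gumbel_max(logits)  # shape [4], one action index per batch element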
Example #4
    def __init__(self, cfg, core_out_size, action_space):
        super().__init__(cfg, action_space)

        assert not cfg.adaptive_stddev
        assert is_continuous_action_space(self.action_space), \
            'Non-adaptive stddev makes sense only for continuous action spaces'

        num_action_outputs = calc_num_logits(action_space)

        # the policy network predicts only the action means
        self.distribution_linear = nn.Linear(core_out_size,
                                             num_action_outputs // 2)

        # stddev is a stand-alone learned parameter (stored as log-stddev, one value
        # per action dimension), not an output of the network
        initial_stddev = torch.empty([num_action_outputs // 2])
        initial_stddev.fill_(math.log(self.cfg.initial_stddev))
        self.learned_stddev = nn.Parameter(initial_stddev, requires_grad=True)
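A hedged sketch of how such a head might turn its outputs into a distribution at forward time, assuming the learned parameter holds log-stddev (as filled above) and that a diagonal Normal is built from it; the project's actual forward pass may differ:

# Sketch only; `head` is a hypothetical instance of the module defined above.
import torch
from torch.distributions import Normal

def forward_sketch(head, core_output: torch.Tensor) -> Normal:
    action_means = head.distribution_linear(core_output)  # [batch, num_actions]
    stddev = head.learned_stddev.exp()                    # broadcasts over the batch
    return Normal(action_means, stddev)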
Example #5
    def test_tuple_distribution(self):
        num_spaces = random.randint(1, 4)
        spaces = [gym.spaces.Discrete(random.randint(2, 5)) for _ in range(num_spaces)]
        action_space = gym.spaces.Tuple(spaces)

        num_logits = calc_num_logits(action_space)
        logits = torch.rand(self.batch_size, num_logits)

        self.assertEqual(num_logits, sum(s.n for s in action_space.spaces))

        action_distribution = get_action_distribution(action_space, logits)

        tuple_actions = action_distribution.sample()
        self.assertEqual(list(tuple_actions.shape), [self.batch_size, num_spaces])

        log_probs = action_distribution.log_prob(tuple_actions)
        self.assertEqual(list(log_probs.shape), [self.batch_size])

        entropy = action_distribution.entropy()
        self.assertEqual(list(entropy.shape), [self.batch_size])
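For intuition, one plausible way to interpret the flat logits of a Tuple space is to split them per sub-space into independent Categorical distributions; this is only a sketch and not necessarily how get_action_distribution is implemented internally:

# Sketch: split the last logits dimension by each sub-space's n and build one
# Categorical per sub-space of the Tuple action space.
import torch
from torch.distributions import Categorical

def split_tuple_logits(tuple_space, logits: torch.Tensor):
    sizes = [space.n for space in tuple_space.spaces]
    chunks = torch.split(logits, sizes, dim=-1)
    return [Categorical(logits=chunk) for chunk in chunks]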
Example #6
    def __init__(self, cfg, action_space):
        super().__init__()
        self.k: int = cfg.cpc_forward_steps
        self.time_subsample: int = cfg.cpc_time_subsample
        self.forward_subsample: int = cfg.cpc_forward_subsample
        self.hidden_size: int = cfg.hidden_size
        self.num_actions: int = calc_num_actions(action_space)
        if isinstance(action_space, gym.spaces.Discrete):
            self.action_sizes = [action_space.n]
        else:
            self.action_sizes = [space.n for space in action_space.spaces]

        # the GRU consumes a concatenation of 32-dim embeddings, one per sub-action
        self.rnn = nn.GRU(32 * self.num_actions, cfg.hidden_size)
        # one 32-dim embedding row for every discrete choice across all sub-action spaces
        self.action_embed = nn.Embedding(calc_num_logits(action_space), 32)

        # MLP that maps a concatenation of two hidden states to a single score
        self.predictor = nn.Sequential(
            nn.Linear(2 * self.hidden_size, self.hidden_size),
            nn.ReLU(True),
            nn.Linear(self.hidden_size, self.hidden_size),
            nn.ReLU(True),
            nn.Linear(self.hidden_size, 1),
        )
Example #7
    def __init__(self, cfg, core_out_size, action_space):
        super().__init__(cfg, action_space)

        num_action_outputs = calc_num_logits(action_space)
        self.distribution_linear = nn.Linear(core_out_size, num_action_outputs)
Example #8
    def __init__(self, cfg, num_agents, obs_space, action_space):
        self.cfg = cfg
        self.num_agents = num_agents
        self.envs_per_split = cfg.num_envs_per_worker // cfg.worker_num_splits
        self.num_traj_buffers = self.calc_num_trajectory_buffers()

        num_actions = calc_num_actions(action_space)
        num_action_logits = calc_num_logits(action_space)

        hidden_size = get_hidden_size(self.cfg)

        log.debug('Allocating shared memory for trajectories')
        self.tensors = TensorDict()

        # policy inputs
        obs_dict = TensorDict()
        self.tensors['obs'] = obs_dict
        if isinstance(obs_space, spaces.Dict):
            for space_name, space in obs_space.spaces.items():
                obs_dict[space_name] = self.init_tensor(space.dtype, space.shape)
        else:
            raise Exception('Only Dict observation spaces are supported')

        # env outputs
        self.tensors['rewards'] = self.init_tensor(torch.float32, [1])
        self.tensors['dones'] = self.init_tensor(torch.bool, [1])

        # policy outputs
        policy_outputs = [
            ('actions', num_actions),
            ('action_logits', num_action_logits),
            ('log_prob_actions', 1),
            ('values', 1),
            ('policy_version', 1),
            ('rnn_states', hidden_size)
        ]

        policy_outputs = [PolicyOutput(*po) for po in policy_outputs]
        policy_outputs = sorted(policy_outputs, key=lambda policy_output: policy_output.name)

        for po in policy_outputs:
            self.tensors[po.name] = self.init_tensor(torch.float32, [po.size])

        ensure_memory_shared(self.tensors)

        # this is for performance optimization
        # indexing in numpy arrays is faster than in PyTorch tensors
        self.tensors_individual_transitions = self.tensor_dict_to_numpy(len(self.tensor_dimensions()))
        self.tensor_trajectories = self.tensor_dict_to_numpy(len(self.tensor_dimensions()) - 1)

        # create a shared tensor to indicate when the learner is done with the trajectory buffer and
        # it can be used to store the next trajectory
        traj_buffer_available_shape = [
            self.cfg.num_workers,
            self.cfg.worker_num_splits,
            self.envs_per_split,
            self.num_agents,
            self.num_traj_buffers,
        ]
        self.is_traj_tensor_available = torch.ones(traj_buffer_available_shape, dtype=torch.uint8)
        self.is_traj_tensor_available.share_memory_()
        self.is_traj_tensor_available = to_numpy(self.is_traj_tensor_available, 2)

        # copying small policy outputs (e.g. individual value predictions & action logits) to shared memory is a
        # bottleneck on the policy worker. For optimization purposes we create additional tensors to hold
        # just concatenated policy outputs. Rollout workers parse the data and add it to the trajectory buffers
        # in a proper format
        policy_outputs_combined_size = sum(po.size for po in policy_outputs)
        policy_outputs_shape = [
            self.cfg.num_workers,
            self.cfg.worker_num_splits,
            self.envs_per_split,
            self.num_agents,
            policy_outputs_combined_size,
        ]

        self.policy_outputs = policy_outputs
        self.policy_output_tensors = torch.zeros(policy_outputs_shape, dtype=torch.float32)
        self.policy_output_tensors.share_memory_()
        self.policy_output_tensors = to_numpy(self.policy_output_tensors, 4)

        self.policy_versions = torch.zeros([self.cfg.num_policies], dtype=torch.int32)
        self.policy_versions.share_memory_()

        # a list of boolean flags to be shared among components that indicate that experience collection should be
        # temporarily stopped (e.g. due to too much experience accumulated on the learner)
        self.stop_experience_collection = torch.ones([self.cfg.num_policies], dtype=torch.bool)
        self.stop_experience_collection.share_memory_()
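The comments above describe packing all per-agent policy outputs into a single flat float32 vector; the hypothetical helper below sketches how a consumer could split that vector back into named outputs using the same name-sorted order and per-output sizes (this is not the project's actual parsing code):

# Hypothetical helper: unpack one flat policy-output vector into named chunks,
# relying on the same sorted PolicyOutput list used to pack it.
import numpy as np

def split_policy_outputs(flat_vector: np.ndarray, policy_outputs):
    result, offset = {}, 0
    for po in policy_outputs:  # assumed sorted by name, matching the packing order
        result[po.name] = flat_vector[offset:offset + po.size]
        offset += po.size
    return result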
Example #9
    def __init__(self, cfg, num_agents, obs_space, action_space):
        self.cfg = cfg
        self.num_agents = num_agents
        self.envs_per_split = cfg.num_envs_per_worker // cfg.worker_num_splits
        self.num_traj_buffers = self.calc_num_trajectory_buffers()

        num_actions = calc_num_actions(action_space)
        num_action_logits = calc_num_logits(action_space)

        hidden_size = get_hidden_size(self.cfg)

        log.debug('Allocating shared memory for trajectories')
        self._tensors = TensorDict()

        # policy inputs
        obs_dict = TensorDict()
        self._tensors['obs'] = obs_dict
        if isinstance(obs_space, spaces.Dict):
            for space_name, space in obs_space.spaces.items():
                obs_dict[space_name] = self.init_tensor(
                    space.dtype, space.shape)
        else:
            raise Exception('Only Dict observation spaces are supported')

        # env outputs
        self._tensors['rewards'] = self.init_tensor(torch.float32, [1])
        self._tensors['rewards'].fill_(-42.42)  # if we're using uninitialized values it will be obvious
        self._tensors['dones'] = self.init_tensor(torch.bool, [1])
        self._tensors['dones'].fill_(True)
        self._tensors['policy_id'] = self.init_tensor(torch.int, [1])
        self._tensors['policy_id'].fill_(-1)  # -1 is an invalid policy index, experience from policy "-1" is always ignored

        # policy outputs
        policy_outputs = [('actions', num_actions),
                          ('action_logits', num_action_logits),
                          ('log_prob_actions', 1), ('values', 1),
                          ('policy_version', 1), ('rnn_states', hidden_size)]

        policy_outputs = [PolicyOutput(*po) for po in policy_outputs]
        policy_outputs = sorted(policy_outputs,
                                key=lambda policy_output: policy_output.name)

        for po in policy_outputs:
            self._tensors[po.name] = self.init_tensor(torch.float32, [po.size])

        ensure_memory_shared(self._tensors)

        # this is for performance optimization
        # indexing in numpy arrays is faster than in PyTorch tensors
        self.tensors = self.tensor_dict_to_numpy()

        # copying small policy outputs (e.g. individual value predictions & action logits) to shared memory is a
        # bottleneck on the policy worker. For optimization purposes we create additional tensors to hold
        # just concatenated policy outputs. Rollout workers parse the data and add it to the trajectory buffers
        # in a proper format
        policy_outputs_combined_size = sum(po.size for po in policy_outputs)
        policy_outputs_shape = [
            self.cfg.num_workers,
            self.cfg.worker_num_splits,
            self.envs_per_split,
            self.num_agents,
            policy_outputs_combined_size,
        ]

        self.policy_outputs = policy_outputs
        self._policy_output_tensors = torch.zeros(policy_outputs_shape,
                                                  dtype=torch.float32)
        self._policy_output_tensors.share_memory_()
        self.policy_output_tensors = self._policy_output_tensors.numpy()

        self._policy_versions = torch.zeros([self.cfg.num_policies],
                                            dtype=torch.int32)
        self._policy_versions.share_memory_()
        self.policy_versions = self._policy_versions.numpy()

        # a list of boolean flags to be shared among components that indicate that experience collection should be
        # temporarily stopped (e.g. due to too much experience accumulated on the learner)
        self._stop_experience_collection = torch.ones([self.cfg.num_policies], dtype=torch.bool)
        self._stop_experience_collection.share_memory_()
        self.stop_experience_collection = self._stop_experience_collection.numpy()

        queue_max_size_bytes = self.num_traj_buffers * 40  # 40 bytes to encode an int should be enough
        self.free_buffers_queue = faster_fifo.Queue(max_size_bytes=queue_max_size_bytes)

        # since all buffers are initially free, we add all buffer indices to the queue
        self.free_buffers_queue.put_many_nowait(
            [int(i) for i in np.arange(self.num_traj_buffers)])
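To illustrate the free-buffer queue's intended life cycle, here is a hedged sketch with hypothetical call sites (not code from the project): a worker takes a free buffer index off the queue before writing a trajectory, and the index is returned once the learner has consumed the data.

# Hypothetical call sites around the free_buffers_queue created above.
def acquire_free_buffer(shared_buffers):
    # blocks until some trajectory buffer index is available
    return shared_buffers.free_buffers_queue.get()

def release_buffer(shared_buffers, buffer_idx):
    # mark the trajectory buffer as free again
    shared_buffers.free_buffers_queue.put(int(buffer_idx))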