Example #1
    def __init__(self, cfg, core_out_size, action_space):
        super().__init__(cfg, action_space)

        self.num_action_outputs = calc_num_logits(action_space)
        self.num_options = cfg.num_options
        self.distribution_linear = nn.Linear(
            core_out_size, self.num_action_outputs * cfg.num_options)
    def test_gumbel_trick(self):
        """
        We sample using Gumbel noise (the Gumbel-max trick), which seems to be faster than PyTorch's multinomial sampling.
        Here we test that the two approaches are actually equivalent (see the standalone sketch after this method).
        """

        timing = Timing()

        torch.backends.cudnn.enabled = True
        torch.backends.cudnn.benchmark = True

        with torch.no_grad():
            action_space = gym.spaces.Discrete(8)
            num_logits = calc_num_logits(action_space)
            device_type = 'cpu'
            device = torch.device(device_type)
            logits = torch.rand(self.batch_size, num_logits,
                                device=device) * 10.0 - 5.0

            if device_type == 'cuda':
                torch.cuda.synchronize(device)

            count_gumbel, count_multinomial = np.zeros(
                [action_space.n]), np.zeros([action_space.n])

            # estimate probability mass by actually sampling both ways
            num_samples = 20000

            action_distribution = get_action_distribution(action_space, logits)
            sample_actions_log_probs(action_distribution)
            action_distribution.sample_gumbel()

            with timing.add_time('gumbel'):
                for i in range(num_samples):
                    action_distribution = get_action_distribution(
                        action_space, logits)
                    samples_gumbel = action_distribution.sample_gumbel()
                    count_gumbel[samples_gumbel[0]] += 1

            action_distribution = get_action_distribution(action_space, logits)
            action_distribution.sample()

            with timing.add_time('multinomial'):
                for i in range(num_samples):
                    action_distribution = get_action_distribution(
                        action_space, logits)
                    samples_multinomial = action_distribution.sample()
                    count_multinomial[samples_multinomial[0]] += 1

            estimated_probs_gumbel = count_gumbel / float(num_samples)
            estimated_probs_multinomial = count_multinomial / float(
                num_samples)

            log.debug('Gumbel estimated probs: %r', estimated_probs_gumbel)
            log.debug('Multinomial estimated probs: %r',
                      estimated_probs_multinomial)
            log.debug('Sampling timing: %s', timing)
            time.sleep(0.1)  # to finish logging
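
A minimal standalone sketch of the Gumbel-max trick the docstring above refers to, using plain torch rather than the project's get_action_distribution API: adding Gumbel(0, 1) noise to the logits and taking the argmax yields samples with exactly the softmax(logits) probabilities.

# standalone sketch, plain torch only (not the project's distribution classes)
import torch

torch.manual_seed(0)
logits = torch.tensor([1.0, 0.0, -1.0])
num_samples = 20000

# Gumbel-max sampling: add Gumbel(0, 1) noise to the logits and take the argmax
gumbel_noise = -torch.log(-torch.log(torch.rand(num_samples, logits.numel())))
gumbel_samples = torch.argmax(logits + gumbel_noise, dim=-1)

# reference: sample directly from the softmax probabilities
probs = torch.softmax(logits, dim=-1)
multinomial_samples = torch.multinomial(probs, num_samples, replacement=True)

# both empirical distributions should be close to softmax(logits)
print(torch.bincount(gumbel_samples, minlength=logits.numel()).float() / num_samples)
print(torch.bincount(multinomial_samples, minlength=logits.numel()).float() / num_samples)
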
    def test_tuple_sanity_check(self):
        num_spaces, num_actions = 3, 2
        simple_space = gym.spaces.Discrete(num_actions)
        spaces = [simple_space for _ in range(num_spaces)]
        tuple_space = gym.spaces.Tuple(spaces)

        self.assertEqual(calc_num_logits(tuple_space), num_spaces * num_actions)

        simple_logits = torch.zeros(1, num_actions)
        tuple_logits = torch.zeros(1, calc_num_logits(tuple_space))

        simple_distr = get_action_distribution(simple_space, simple_logits)
        tuple_distr = get_action_distribution(tuple_space, tuple_logits)

        tuple_entropy = tuple_distr.entropy()
        self.assertEqual(tuple_entropy, simple_distr.entropy() * num_spaces)

        simple_logprob = simple_distr.log_prob(torch.ones(1))
        tuple_logprob = tuple_distr.log_prob(torch.ones(1, num_spaces))
        self.assertEqual(tuple_logprob, simple_logprob * num_spaces)
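
The assertions above rely on the fact that, for a tuple of independent sub-actions, entropy and log-probability are additive across components. A standalone check of that property with plain torch.distributions (not the project's wrappers):

# entropy of a product of independent categoricals equals the sum of component entropies
import torch
from torch.distributions import Categorical

num_spaces, num_actions = 3, 2
component = Categorical(logits=torch.zeros(num_actions))

# build the joint distribution over the product space (num_actions ** num_spaces outcomes)
joint_probs = component.probs
for _ in range(num_spaces - 1):
    joint_probs = torch.outer(joint_probs, component.probs).flatten()
joint = Categorical(probs=joint_probs)

assert torch.allclose(joint.entropy(), component.entropy() * num_spaces)
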
    def test_simple_distribution(self):
        simple_action_space = gym.spaces.Discrete(3)
        simple_num_logits = calc_num_logits(simple_action_space)
        self.assertEqual(simple_num_logits, simple_action_space.n)

        simple_logits = torch.rand(self.batch_size, simple_num_logits)
        simple_action_distribution = get_action_distribution(
            simple_action_space, simple_logits)

        simple_actions = simple_action_distribution.sample()
        self.assertEqual(list(simple_actions.shape), [self.batch_size])
        self.assertTrue(
            all(0 <= a < simple_action_space.n for a in simple_actions))
Example #5
    def __init__(self, cfg, core_out_size, action_space):
        super().__init__(cfg, action_space)

        assert not cfg.adaptive_stddev
        assert is_continuous_action_space(self.action_space), \
            'Non-adaptive stddev makes sense only for continuous action spaces'

        num_action_outputs = calc_num_logits(action_space)

        # calculate only action means using the policy neural network
        self.distribution_linear = nn.Linear(core_out_size,
                                             num_action_outputs // 2)

        # stddev is a single learned parameter, stored as log-stddev (see the sketch after this example)
        initial_stddev = torch.empty([num_action_outputs // 2])
        initial_stddev.fill_(math.log(self.cfg.initial_stddev))
        self.learned_stddev = nn.Parameter(initial_stddev, requires_grad=True)
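
The example above only shows __init__; below is a hedged sketch of how the predicted means and the single learned log-stddev parameter could be combined into a Normal action distribution in a forward pass. The class and parameter names are made up for illustration, not the project's actual code.

# hypothetical head with a fixed (non-adaptive) learned stddev
import math
import torch
from torch import nn
from torch.distributions import Normal

class FixedStddevHead(nn.Module):
    def __init__(self, core_out_size, num_action_dims, initial_stddev=1.0):
        super().__init__()
        # the network predicts only the action means
        self.distribution_linear = nn.Linear(core_out_size, num_action_dims)
        # a single learned log-stddev vector, shared across all states
        self.learned_stddev = nn.Parameter(
            torch.full([num_action_dims], math.log(initial_stddev)), requires_grad=True)

    def forward(self, core_output):
        means = self.distribution_linear(core_output)
        stddevs = self.learned_stddev.exp().expand_as(means)  # log-stddev -> stddev
        return Normal(means, stddevs)

# usage
head = FixedStddevHead(core_out_size=64, num_action_dims=6)
actions = head(torch.randn(2, 64)).sample()  # shape [2, 6]
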
Example #6
    def __init__(self, cfg, action_space):
        super().__init__()
        self.k: int = cfg.cpc_forward_steps
        self.time_subsample: int = cfg.cpc_time_subsample
        self.forward_subsample: int = cfg.cpc_forward_subsample
        self.hidden_size: int = cfg.hidden_size
        self.num_actions: int = calc_num_actions(action_space)
        if isinstance(action_space, gym.spaces.Discrete):
            self.action_sizes = [action_space.n]
        else:
            self.action_sizes = [space.n for space in action_space.spaces]

        self.rnn = nn.GRU(32 * self.num_actions, cfg.hidden_size)
        self.action_embed = nn.Embedding(calc_num_logits(action_space), 32)

        self.predictor = nn.Sequential(
            nn.Linear(2 * self.hidden_size, self.hidden_size),
            nn.ReLU(True),
            nn.Linear(self.hidden_size, self.hidden_size),
            nn.ReLU(True),
            nn.Linear(self.hidden_size, 1),
        )
    def test_tuple_distribution(self):
        num_spaces = random.randint(1, 4)
        spaces = [
            gym.spaces.Discrete(random.randint(2, 5))
            for _ in range(num_spaces)
        ]
        action_space = gym.spaces.Tuple(spaces)

        num_logits = calc_num_logits(action_space)
        logits = torch.rand(self.batch_size, num_logits)

        self.assertEqual(num_logits, sum(s.n for s in action_space.spaces))

        action_distribution = get_action_distribution(action_space, logits)

        tuple_actions = action_distribution.sample()
        self.assertEqual(list(tuple_actions.shape),
                         [self.batch_size, num_spaces])

        log_probs = action_distribution.log_prob(tuple_actions)
        self.assertEqual(list(log_probs.shape), [self.batch_size])

        entropy = action_distribution.entropy()
        self.assertEqual(list(entropy.shape), [self.batch_size])
Example #8
    def __init__(self, cfg, num_agents, obs_space, action_space):
        self.cfg = cfg
        self.num_agents = num_agents
        self.envs_per_split = cfg.num_envs_per_worker // cfg.worker_num_splits
        self.num_traj_buffers = self.calc_num_trajectory_buffers()

        num_actions = calc_num_actions(action_space)
        num_action_logits = calc_num_logits(action_space)

        hidden_size = get_hidden_size(self.cfg)

        log.debug('Allocating shared memory for trajectories')
        self.tensors = TensorDict()

        # policy inputs
        obs_dict = TensorDict()
        self.tensors['obs'] = obs_dict
        if isinstance(obs_space, spaces.Dict):
            for space_name, space in obs_space.spaces.items():
                obs_dict[space_name] = self.init_tensor(space.dtype, space.shape)
        else:
            raise Exception('Only Dict observation spaces are supported')

        # env outputs
        self.tensors['rewards'] = self.init_tensor(torch.float32, [1])
        self.tensors['dones'] = self.init_tensor(torch.bool, [1])

        # policy outputs
        policy_outputs = [
            ('actions', num_actions),
            ('action_logits', num_action_logits),
            ('log_prob_actions', 1),
            ('values', 1),
            ('policy_version', 1),
            ('rnn_states', hidden_size)
        ]

        policy_outputs = [PolicyOutput(*po) for po in policy_outputs]
        policy_outputs = sorted(policy_outputs, key=lambda policy_output: policy_output.name)

        for po in policy_outputs:
            self.tensors[po.name] = self.init_tensor(torch.float32, [po.size])

        ensure_memory_shared(self.tensors)

        # performance optimization: indexing numpy arrays is faster than indexing PyTorch tensors
        # (see the sketch at the end of this example)
        self.tensors_individual_transitions = self.tensor_dict_to_numpy(len(self.tensor_dimensions()))
        self.tensor_trajectories = self.tensor_dict_to_numpy(len(self.tensor_dimensions()) - 1)

        # create a shared tensor indicating that the learner is done with a trajectory buffer,
        # i.e. the buffer can be reused to store the next trajectory
        traj_buffer_available_shape = [
            self.cfg.num_workers,
            self.cfg.worker_num_splits,
            self.envs_per_split,
            self.num_agents,
            self.num_traj_buffers,
        ]
        self.is_traj_tensor_available = torch.ones(traj_buffer_available_shape, dtype=torch.uint8)
        self.is_traj_tensor_available.share_memory_()
        self.is_traj_tensor_available = to_numpy(self.is_traj_tensor_available, 2)

        # copying small policy outputs (e.g. individual value predictions & action logits) to shared memory is a
        # bottleneck on the policy worker. As an optimization, we create additional tensors holding just the
        # concatenated policy outputs; rollout workers parse this data and add it to the trajectory buffers
        # in the proper format (see the sketch at the end of this example)
        policy_outputs_combined_size = sum(po.size for po in policy_outputs)
        policy_outputs_shape = [
            self.cfg.num_workers,
            self.cfg.worker_num_splits,
            self.envs_per_split,
            self.num_agents,
            policy_outputs_combined_size,
        ]

        self.policy_outputs = policy_outputs
        self.policy_output_tensors = torch.zeros(policy_outputs_shape, dtype=torch.float32)
        self.policy_output_tensors.share_memory_()
        self.policy_output_tensors = to_numpy(self.policy_output_tensors, 4)

        self.policy_versions = torch.zeros([self.cfg.num_policies], dtype=torch.int32)
        self.policy_versions.share_memory_()

        # boolean flags shared among components, indicating that experience collection should be
        # temporarily stopped (e.g. because too much experience has accumulated on the learner)
        self.stop_experience_collection = torch.ones([self.cfg.num_policies], dtype=torch.bool)
        self.stop_experience_collection.share_memory_()
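
Two of the comments above make claims that a short standalone sketch may help clarify: that scalar indexing is cheaper on a numpy view than on the underlying torch tensor, and that a concatenated policy-output vector can be parsed back into named outputs using the same sorted (name, size) pairs. The sizes below are made-up example values; this is not the actual worker code.

import time
import torch

# 1) indexing a numpy view of a (shared) tensor vs. indexing the torch tensor directly
t = torch.zeros(16, 2, 8, 4, 137)
a = t.numpy()  # zero-copy view over the same memory

start = time.perf_counter()
for _ in range(100000):
    _ = t[3, 1, 5, 2]
torch_time = time.perf_counter() - start

start = time.perf_counter()
for _ in range(100000):
    _ = a[3, 1, 5, 2]
numpy_time = time.perf_counter() - start
print(f'torch indexing: {torch_time:.3f}s, numpy indexing: {numpy_time:.3f}s')

# 2) splitting one concatenated policy-output vector back into named outputs,
#    sorted alphabetically by name as above (sizes are illustrative)
num_actions, num_action_logits, hidden_size = 1, 8, 512
policy_outputs = sorted([
    ('actions', num_actions),
    ('action_logits', num_action_logits),
    ('log_prob_actions', 1),
    ('values', 1),
    ('policy_version', 1),
    ('rnn_states', hidden_size),
])
combined = torch.zeros(sum(size for _, size in policy_outputs))  # one agent's slot of policy_output_tensors
chunks = torch.split(combined, [size for _, size in policy_outputs])
parsed = {name: chunk for (name, _), chunk in zip(policy_outputs, chunks)}
print({name: tuple(chunk.shape) for name, chunk in parsed.items()})
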