Example #1
 def test_int_key_sparse_to_dense(self):
     # int keys, set_missing_value_to_zero=False
     processor = PythonSparseToDenseProcessor(
         self.sorted_features, set_missing_value_to_zero=False)
     value, presence = processor.process(self.int_keyed_sparse_data)
     assert torch.allclose(value, self.expected_value_missing)
     assert torch.all(presence == self.expected_presence_missing)
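
Example #1 shows the basic contract: the processor is constructed with a sorted list of feature IDs, and process() turns a list of {feature_id: value} dicts into a dense value tensor plus a presence mask. Below is a minimal standalone sketch of that contract; the import path and the sample feature IDs are assumptions, not taken from the test above.

    # Minimal sketch, assuming this import path for ReAgent's processor.
    from reagent.preprocessing.sparse_to_dense import PythonSparseToDenseProcessor

    # Dense columns come out in this (sorted) feature-ID order.
    sorted_features = [1, 2, 5]
    processor = PythonSparseToDenseProcessor(
        sorted_features, set_missing_value_to_zero=False)

    # One {feature_id: value} dict per row; feature 5 is missing in the first row.
    sparse_rows = [{1: 0.5, 2: 1.0}, {1: 0.0, 2: 2.0, 5: 3.0}]
    value, presence = processor.process(sparse_rows)

    print(value.shape)   # torch.Size([2, 3]): one column per entry in sorted_features
    print(presence[0])   # marks feature 5 as absent in the first row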
Example #2
 def __init__(self, model) -> None:
     self.model = model
     self.state_internal_sparse_to_dense = PythonSparseToDenseProcessor(
         self.model.state_sorted_features())
     self.action_internal_sparse_to_dense = PythonSparseToDenseProcessor(
         self.model.action_sorted_features())
     self.softmax_temperature: Optional[float] = None
Example #3
    def test_create_df_from_replay_buffer(self):
        env_name = "MiniGrid-Empty-5x5-v0"
        env = Gym(env_name=env_name)
        state_dim = env.observation_space.shape[0]
        # Wrap env in TestEnv
        env = TestEnv(env)
        problem_domain = ProblemDomain.DISCRETE_ACTION
        DATASET_SIZE = 1000
        multi_steps = None
        DS = "2021-09-16"

        # Generate data
        df = create_df_from_replay_buffer(
            env=env,
            problem_domain=problem_domain,
            desired_size=DATASET_SIZE,
            multi_steps=multi_steps,
            ds=DS,
            shuffle_df=False,
        )
        self.assertEqual(len(df), DATASET_SIZE)

        # Check data
        preprocessor = PythonSparseToDenseProcessor(list(range(state_dim)))
        for idx, row in df.iterrows():
            df_mdp_id = row["mdp_id"]
            env_mdp_id = str(env.sart[idx][0])
            self.assertEqual(df_mdp_id, env_mdp_id)

            df_seq_num = row["sequence_number"]
            env_seq_num = env.sart[idx][1]
            self.assertEqual(df_seq_num, env_seq_num)

            df_state = preprocessor.process([row["state_features"]
                                             ])[0][0].numpy()
            env_state = env.sart[idx][2]
            npt.assert_array_equal(df_state, env_state)

            df_action = row["action"]
            env_action = str(env.sart[idx][3])
            self.assertEqual(df_action, env_action)

            df_terminal = row["next_action"] == ""
            env_terminal = env.sart[idx][5]
            self.assertEqual(df_terminal, env_terminal)
            if not df_terminal:
                df_reward = float(row["reward"])
                env_reward = float(env.sart[idx][4])
                npt.assert_allclose(df_reward, env_reward)

                df_next_state = preprocessor.process(
                    [row["next_state_features"]])[0][0].numpy()
                env_next_state = env.sart[idx + 1][2]
                npt.assert_array_equal(df_next_state, env_next_state)

                df_next_action = row["next_action"]
                env_next_action = str(env.sart[idx + 1][3])
                self.assertEqual(df_next_action, env_next_action)
            else:
                del env.sart[idx + 1]
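
The [0][0] indexing above first selects the dense value tensor out of the (value, presence) pair returned by process(), then takes its single row. A small sketch of the equivalent unpacked form; the import path and feature IDs are assumptions:

    from reagent.preprocessing.sparse_to_dense import PythonSparseToDenseProcessor

    state_dim = 3
    preprocessor = PythonSparseToDenseProcessor(list(range(state_dim)))

    row_features = {0: 0.1, 2: -1.0}          # one sparse state; feature 1 is missing
    value, presence = preprocessor.process([row_features])
    dense_state = value[0].numpy()            # same as process([...])[0][0].numpy()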
Example #4
    def preprocess_samples(
        self,
        samples: Samples,
        minibatch_size: int,
        use_gpu: bool = False,
        one_hot_action: bool = True,
        normalize_actions: bool = True,
    ) -> List[TrainingDataPage]:
        logger.info("Shuffling...")
        samples = shuffle_samples(samples)

        logger.info("Sparse2Dense...")

        sorted_state_features, _ = sort_features_by_normalization(self.normalization)
        sorted_action_features, _ = sort_features_by_normalization(
            self.normalization_action
        )
        state_sparse_to_dense_processor = PythonSparseToDenseProcessor(
            sorted_state_features
        )
        action_sparse_to_dense_processor = PythonSparseToDenseProcessor(
            sorted_action_features
        )
        state_matrix, state_matrix_presence = state_sparse_to_dense_processor(
            samples.states
        )
        next_state_matrix, next_state_matrix_presence = state_sparse_to_dense_processor(
            samples.next_states
        )
        action_matrix, action_matrix_presence = action_sparse_to_dense_processor(
            samples.actions
        )
        (
            next_action_matrix,
            next_action_matrix_presence,
        ) = action_sparse_to_dense_processor(samples.next_actions)
        action_probabilities = torch.tensor(
            samples.action_probabilities, dtype=torch.float32
        ).reshape(-1, 1)
        rewards = torch.tensor(samples.rewards, dtype=torch.float32).reshape(-1, 1)

        max_action_size = 4

        pnas_mask_list: List[List[int]] = []
        pnas_flat: List[Dict[str, float]] = []
        for pnas in samples.possible_next_actions:
            pnas_mask_list.append([1] * len(pnas) + [0] * (max_action_size - len(pnas)))
            pnas_flat.extend(pnas)
            for _ in range(max_action_size - len(pnas)):
                pnas_flat.append({})  # Filler
        pnas_mask = torch.Tensor(pnas_mask_list)

        (
            possible_next_actions_matrix,
            possible_next_actions_matrix_presence,
        ) = action_sparse_to_dense_processor(pnas_flat)

        logger.info("Preprocessing...")
        state_preprocessor = Preprocessor(self.normalization, False)
        action_preprocessor = Preprocessor(self.normalization_action, False)

        states_ndarray = state_preprocessor(state_matrix, state_matrix_presence)

        if normalize_actions:
            actions_ndarray = action_preprocessor(action_matrix, action_matrix_presence)
        else:
            actions_ndarray = action_matrix

        next_states_ndarray = state_preprocessor(
            next_state_matrix, next_state_matrix_presence
        )

        state_pnas_tile = next_states_ndarray.repeat(1, max_action_size).reshape(
            -1, next_states_ndarray.shape[1]
        )

        if normalize_actions:
            next_actions_ndarray = action_preprocessor(
                next_action_matrix, next_action_matrix_presence
            )
        else:
            next_actions_ndarray = next_action_matrix

        if normalize_actions:
            logged_possible_next_actions = action_preprocessor(
                possible_next_actions_matrix, possible_next_actions_matrix_presence
            )
        else:
            logged_possible_next_actions = possible_next_actions_matrix

        assert state_pnas_tile.shape[0] == logged_possible_next_actions.shape[0], (
            "Invalid shapes: "
            + str(state_pnas_tile.shape)
            + " != "
            + str(logged_possible_next_actions.shape)
        )
        logged_possible_next_state_actions = torch.cat(
            (state_pnas_tile, logged_possible_next_actions), dim=1
        )

        logger.info("Reward Timeline to Torch...")
        time_diffs = torch.ones([len(samples.states), 1])

        tdps = []
        pnas_start = 0
        logger.info("Batching...")
        for start in range(0, states_ndarray.shape[0], minibatch_size):
            end = start + minibatch_size
            if end > states_ndarray.shape[0]:
                break
            pnas_end = pnas_start + (minibatch_size * max_action_size)
            tdp = TrainingDataPage(
                states=states_ndarray[start:end],
                actions=actions_ndarray[start:end],
                propensities=action_probabilities[start:end],
                rewards=rewards[start:end],
                next_states=next_states_ndarray[start:end],
                next_actions=next_actions_ndarray[start:end],
                not_terminal=(pnas_mask[start:end, :].sum(dim=1, keepdim=True) > 0),
                time_diffs=time_diffs[start:end],
                possible_next_actions_mask=pnas_mask[start:end, :],
                possible_next_actions_state_concat=logged_possible_next_state_actions[
                    pnas_start:pnas_end, :
                ],
            )
            pnas_start = pnas_end
            tdp.set_type(torch.cuda.FloatTensor if use_gpu else torch.FloatTensor)
            tdps.append(tdp)
        return tdps
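
Note that this example invokes the processor instances directly, e.g. state_sparse_to_dense_processor(samples.states), while Examples #1 and #3 call .process(); both forms appear in the codebase and are used interchangeably here. A small sketch under the assumption that calling the instance is equivalent to calling process() (the import path and feature IDs are likewise assumptions):

    from reagent.preprocessing.sparse_to_dense import PythonSparseToDenseProcessor

    processor = PythonSparseToDenseProcessor([10, 11])     # hypothetical feature IDs
    rows = [{10: 1.0, 11: 2.0}, {10: 3.0}]

    via_call = processor(rows)             # the form used in Examples #4 and #6
    via_process = processor.process(rows)  # the form used in Examples #1 and #3
    # Both are expected to return the same (value, presence) pair.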
Example #5
 def __init__(self, model, action_feature_ids: List[int]) -> None:
     self.model = model
     self.internal_sparse_to_dense = PythonSparseToDenseProcessor(
         self.model.state_sorted_features())
     self.action_feature_ids = action_feature_ids
Example #6
    def preprocess_samples_discrete(
        self,
        samples: Samples,
        minibatch_size: int,
        one_hot_action: bool = True,
        use_gpu: bool = False,
        do_shuffle: bool = True,
    ) -> List[TrainingDataPage]:

        if do_shuffle:
            logger.info("Shuffling...")
            samples = shuffle_samples(samples)

        logger.info("Preprocessing...")
        sorted_features, _ = sort_features_by_normalization(self.normalization)
        sparse_to_dense_processor = PythonSparseToDenseProcessor(
            sorted_features)

        state_matrix, state_matrix_presence = sparse_to_dense_processor(
            samples.states)
        next_state_matrix, next_state_matrix_presence = sparse_to_dense_processor(
            samples.next_states)

        logger.info("Converting to Torch...")
        actions_one_hot = torch.tensor((np.array(samples.actions).reshape(
            -1, 1) == np.array(self.ACTIONS)).astype(np.int64))
        actions = actions_one_hot.argmax(dim=1, keepdim=True)
        rewards = torch.tensor(samples.rewards,
                               dtype=torch.float32).reshape(-1, 1)
        action_probabilities = torch.tensor(samples.action_probabilities,
                                            dtype=torch.float32).reshape(
                                                -1, 1)
        next_actions_one_hot = torch.tensor(
            (np.array(samples.next_actions).reshape(-1, 1) == np.array(
                self.ACTIONS)).astype(np.int64))
        logger.info("Converting PA to Torch...")
        possible_action_strings = np.array(
            list(itertools.zip_longest(*samples.possible_actions,
                                       fillvalue=""))).T
        possible_actions_mask = torch.zeros(
            [len(samples.actions), len(self.ACTIONS)])
        for i, action in enumerate(self.ACTIONS):
            possible_actions_mask[:, i] = torch.tensor(
                np.max(possible_action_strings == action,
                       axis=1).astype(np.int64))
        logger.info("Converting PNA to Torch...")
        possible_next_action_strings = np.array(
            list(
                itertools.zip_longest(*samples.possible_next_actions,
                                      fillvalue=""))).T
        possible_next_actions_mask = torch.zeros(
            [len(samples.next_actions),
             len(self.ACTIONS)])
        for i, action in enumerate(self.ACTIONS):
            possible_next_actions_mask[:, i] = torch.tensor(
                np.max(possible_next_action_strings == action,
                       axis=1).astype(np.int64))
        terminals = torch.tensor(samples.terminals,
                                 dtype=torch.int32).reshape(-1, 1)
        not_terminal = 1 - terminals
        logger.info("Converting RT to Torch...")

        time_diffs = torch.ones([len(samples.states), 1])

        logger.info("Preprocessing...")
        preprocessor = Preprocessor(self.normalization, False)

        states_ndarray = preprocessor(state_matrix, state_matrix_presence)

        next_states_ndarray = preprocessor(next_state_matrix,
                                           next_state_matrix_presence)

        logger.info("Batching...")
        tdps = []
        for start in range(0, states_ndarray.shape[0], minibatch_size):
            end = start + minibatch_size
            if end > states_ndarray.shape[0]:
                break
            tdp = TrainingDataPage(
                states=states_ndarray[start:end],
                actions=actions_one_hot[start:end]
                if one_hot_action else actions[start:end],
                propensities=action_probabilities[start:end],
                rewards=rewards[start:end],
                next_states=next_states_ndarray[start:end],
                # pyre-fixme[16]: `int` has no attribute `__getitem__`.
                not_terminal=not_terminal[start:end],
                next_actions=next_actions_one_hot[start:end],
                possible_actions_mask=possible_actions_mask[start:end],
                possible_next_actions_mask=possible_next_actions_mask[
                    start:end],
                time_diffs=time_diffs[start:end],
            )
            tdp.set_type(
                torch.cuda.FloatTensor if use_gpu else torch.FloatTensor)
            tdps.append(tdp)
        return tdps
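
Examples #4 and #6 end with the same batching pattern: step through the aligned dense tensors in minibatch_size chunks and drop the final partial chunk. A reduced, self-contained sketch of that slicing follows (shapes and names are hypothetical; TrainingDataPage construction is omitted):

    import torch

    minibatch_size = 128
    states = torch.randn(1000, 7)
    rewards = torch.randn(1000, 1)

    batches = []
    for start in range(0, states.shape[0], minibatch_size):
        end = start + minibatch_size
        if end > states.shape[0]:
            break                      # the partial trailing batch is discarded
        batches.append((states[start:end], rewards[start:end]))

    # 1000 // 128 == 7 full minibatches; the remaining 104 rows are dropped.
    assert len(batches) == states.shape[0] // minibatch_size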