Example #1
def create_embed_rl_dataset(
    gym_env: OpenAIGymEnvironment,
    trainer: MDNRNNTrainer,
    dataset: RLDataset,
    use_gpu: bool,
    run_details: OpenAiRunDetails,
):
    assert run_details.max_steps is not None
    old_mdnrnn_mode = trainer.mdnrnn.mdnrnn.training
    trainer.mdnrnn.mdnrnn.eval()
    num_transitions = run_details.num_state_embed_episodes * run_details.max_steps
    device = torch.device("cuda") if use_gpu else torch.device("cpu")

    (
        state_batch,
        action_batch,
        reward_batch,
        next_state_batch,
        next_action_batch,
        not_terminal_batch,
        step_batch,
        next_step_batch,
    ) = map(
        list,
        zip(*multi_step_sample_generator(
            gym_env=gym_env,
            num_transitions=num_transitions,
            max_steps=run_details.max_steps,
            # +1 because MDNRNN embeds the first seq_len steps and then
            # the embedded state will be concatenated with the last step
            multi_steps=run_details.seq_len + 1,
            include_shorter_samples_at_start=True,
            include_shorter_samples_at_end=False,
        )),
    )

    def concat_batch(batch):
        return torch.cat(
            [
                torch.tensor(np.expand_dims(x, axis=1),
                             dtype=torch.float,
                             device=device) for x in batch
            ],
            dim=1,
        )

    # shape: seq_len x batch_size x feature_dim
    mdnrnn_state = concat_batch(state_batch)
    next_mdnrnn_state = concat_batch(next_state_batch)
    mdnrnn_action = concat_batch(action_batch)
    next_mdnrnn_action = concat_batch(next_action_batch)

    mdnrnn_input = rlt.PreprocessedStateAction.from_tensors(
        state=mdnrnn_state, action=mdnrnn_action)
    next_mdnrnn_input = rlt.PreprocessedStateAction.from_tensors(
        state=next_mdnrnn_state, action=next_mdnrnn_action)
    # batch-compute state embedding
    mdnrnn_output = trainer.mdnrnn(mdnrnn_input)
    next_mdnrnn_output = trainer.mdnrnn(next_mdnrnn_input)

    for i in range(len(state_batch)):
        # Embed the state as the hidden layer's output
        # until the previous step + current state
        hidden_idx = 0 if step_batch[i] == 1 else step_batch[i] - 2  # type: ignore
        next_hidden_idx = next_step_batch[i] - 2  # type: ignore
        hidden_embed = (
            mdnrnn_output.all_steps_lstm_hidden[hidden_idx, i, :]
            .squeeze()
            .detach()
            .cpu()
        )
        state_embed = torch.cat(
            (hidden_embed, torch.tensor(state_batch[i][hidden_idx + 1]))  # type: ignore
        )
        next_hidden_embed = (
            next_mdnrnn_output.all_steps_lstm_hidden[next_hidden_idx, i, :]
            .squeeze()
            .detach()
            .cpu()
        )
        next_state_embed = torch.cat(
            (next_hidden_embed, torch.tensor(next_state_batch[i][next_hidden_idx + 1]))  # type: ignore
        )

        logger.debug(
            "create_embed_rl_dataset:\nstate batch\n{}\naction batch\n{}\nlast "
            "action: {}, reward: {}\nstate embed {}\nnext state embed {}\n".format(
                state_batch[i][:hidden_idx + 1],  # type: ignore
                action_batch[i][:hidden_idx + 1],  # type: ignore
                action_batch[i][hidden_idx + 1],  # type: ignore
                reward_batch[i][hidden_idx + 1],  # type: ignore
                state_embed,
                next_state_embed,
            )
        )

        terminal = 1 - not_terminal_batch[i][hidden_idx + 1]  # type: ignore
        possible_actions, possible_actions_mask = get_possible_actions(
            gym_env, ModelType.PYTORCH_PARAMETRIC_DQN.value, False)
        possible_next_actions, possible_next_actions_mask = get_possible_actions(
            gym_env, ModelType.PYTORCH_PARAMETRIC_DQN.value, terminal)
        dataset.insert(
            state=state_embed,
            action=torch.tensor(action_batch[i][hidden_idx + 1]),  # type: ignore
            reward=reward_batch[i][hidden_idx + 1],  # type: ignore
            next_state=next_state_embed,
            next_action=torch.tensor(next_action_batch[i][next_hidden_idx + 1]),  # type: ignore
            terminal=torch.tensor(terminal),
            possible_next_actions=possible_next_actions,
            possible_next_actions_mask=possible_next_actions_mask,
            time_diff=torch.tensor(1),
            possible_actions=possible_actions,
            possible_actions_mask=possible_actions_mask,
            policy_id=0,
        )
    logger.info("Insert {} transitions into a state embed dataset".format(
        len(state_batch)))
    trainer.mdnrnn.mdnrnn.train(old_mdnrnn_mode)
    return dataset
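
The example packs each sampled multi-step transition into a seq_len x batch_size x feature_dim tensor before running the MDN-RNN once over the whole batch. Below is a minimal standalone sketch of that batching convention and of the hidden-index arithmetic, using plain NumPy/PyTorch with toy shapes assumed for illustration (no ReAgent/Horizon imports):

import numpy as np
import torch

seq_len, batch_size, feature_dim = 5, 3, 4  # toy sizes, assumed for illustration
state_batch = [
    np.random.randn(seq_len, feature_dim).astype(np.float32)
    for _ in range(batch_size)
]

def concat_batch(batch, device=torch.device("cpu")):
    # Insert a batch axis at dim=1 for every (seq_len, feature_dim) sample,
    # then concatenate along that axis: result is seq_len x batch_size x feature_dim.
    return torch.cat(
        [
            torch.tensor(np.expand_dims(x, axis=1), dtype=torch.float, device=device)
            for x in batch
        ],
        dim=1,
    )

mdnrnn_state = concat_batch(state_batch)
print(mdnrnn_state.shape)  # torch.Size([5, 3, 4])

# Hidden-index arithmetic from the loop above: a sample embedded over `step`
# steps uses the LSTM hidden state at index step - 2, except when step == 1.
for step in (1, 2, seq_len):
    hidden_idx = 0 if step == 1 else step - 2
    print(step, "->", hidden_idx)
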
Example #2
def custom_train_gym_online_rl(
        c2_device, gym_env, replay_buffer, model_type, trainer, predictor,
        test_run_name, score_bar, num_episodes, max_steps, train_every_ts,
        train_after_ts, test_every_ts, test_after_ts, num_train_batches,
        avg_over_num_episodes, render, save_timesteps_to_dataset,
        start_saving_from_score, solved_reward_threshold,
        max_episodes_to_run_after_solved, stop_training_after_solved,
        timesteps_total, checkpoint_after_ts, avg_over_num_steps):
    """Train off of dynamic set of transitions generated on-policy."""
    ep_i = 0
    ts = 0
    policy_id = 0
    # logging
    average_reward_train, num_episodes_train = [], []
    average_reward_eval, num_episodes_eval = [], []
    timesteps_history = []
    reward_hist = list()
    while ep_i < num_episodes and ts < timesteps_total:
        terminal = False
        next_state = gym_env.transform_state(gym_env.env.reset())
        next_action, next_action_probability = gym_env.policy(
            predictor, next_state, False)
        reward_sum = 0
        ep_timesteps = 0

        if model_type == ModelType.CONTINUOUS_ACTION.value:
            trainer.noise.clear()

        while not terminal:
            state = next_state
            action = next_action
            action_probability = next_action_probability

            # Get possible actions
            possible_actions, _ = horizon_runner.get_possible_actions(
                gym_env, model_type, terminal)

            if render:
                gym_env.env.render()

            timeline_format_action, gym_action = horizon_runner._format_action_for_log_and_gym(
                action, gym_env.action_type, model_type)
            next_state, reward, terminal, _ = gym_env.env.step(gym_action)
            next_state = gym_env.transform_state(next_state)

            ep_timesteps += 1
            ts += 1
            next_action, next_action_probability = gym_env.policy(
                predictor, next_state, False)
            reward_sum += reward

            (possible_actions,
             possible_actions_mask) = horizon_runner.get_possible_actions(
                 gym_env, model_type, False)

            # Get possible next actions
            (possible_next_actions,
             possible_next_actions_mask) = horizon_runner.get_possible_actions(
                 gym_env, model_type, terminal)

            replay_buffer.insert_into_memory(
                np.float32(state),
                action,
                np.float32(reward),
                np.float32(next_state),
                next_action,
                terminal,
                possible_next_actions,
                possible_next_actions_mask,
                1,
                possible_actions,
                possible_actions_mask,
                policy_id,
            )

            if save_timesteps_to_dataset and (ts % checkpoint_after_ts == 0
                                              or ts == timesteps_total):
                save_timesteps_to_dataset.insert(
                    mdp_id=ep_i,
                    sequence_number=ep_timesteps - 1,
                    state=state,
                    action=action,
                    timeline_format_action=timeline_format_action,
                    action_probability=action_probability,
                    reward=reward,
                    next_state=next_state,
                    next_action=next_action,
                    terminal=terminal,
                    possible_next_actions=possible_next_actions,
                    possible_next_actions_mask=possible_next_actions_mask,
                    time_diff=1,
                    possible_actions=possible_actions,
                    possible_actions_mask=possible_actions_mask,
                    policy_id=policy_id,
                )

            # Training loop
            if (ts % train_every_ts == 0 and ts > train_after_ts
                    and len(replay_buffer.replay_memory) >= trainer.minibatch_size):
                for _ in range(num_train_batches):
                    samples = replay_buffer.sample_memories(
                        trainer.minibatch_size, model_type)
                    samples.set_type(trainer.dtype)
                    trainer.train(samples)
                    # Every time we train, the policy changes
                    policy_id += 1

            # Evaluation loop
            if ts % test_every_ts == 0 and ts > test_after_ts:
                avg_ep_count, avg_rewards = gym_env.run_n_steps(
                    avg_over_num_steps, predictor, test=True)

                # save Tensorboard statistics
                timesteps_history.append(ts)
                # Guard against an empty history when no episode has finished
                # since the last evaluation.
                avg_train_reward = (sum(reward_hist) / len(reward_hist)
                                    if reward_hist else 0.0)
                average_reward_train.append(avg_train_reward)
                num_episodes_train.append(len(reward_hist))
                average_reward_eval.append(avg_rewards)
                num_episodes_eval.append(avg_ep_count)

                logger.info(
                    "Achieved an average reward score of {} over {} evaluations."
                    " Total episodes: {}, total timesteps: {}.".format(
                        avg_rewards, avg_ep_count, ep_i + 1, ts))
                logger.info(
                    "Achieved an average reward score of {} during {} training episodes."
                    " Total episodes: {}, total timesteps: {}.".format(
                        avg_train_reward, len(reward_hist), ep_i + 1, ts))
                reward_hist.clear()
                if score_bar is not None and avg_rewards > score_bar:
                    logger.info(
                        "Avg. reward history during evaluation for {}: {}".format(
                            test_run_name, average_reward_eval))
                    logger.info(
                        "Avg. reward history during training for {}: {}".format(
                            test_run_name, average_reward_train))
                    return average_reward_train, num_episodes_train, average_reward_eval, num_episodes_eval, timesteps_history, trainer, predictor, gym_env

            if max_steps and ep_timesteps >= max_steps:
                break
        reward_hist.append(reward_sum)

        # Always eval on last episode if previous eval loop didn't return.
        if ep_i == num_episodes - 1:
            avg_ep_count, avg_rewards = gym_env.run_n_steps(avg_over_num_steps,
                                                            predictor,
                                                            test=True)

            # save Tensorboard statistics
            timesteps_history.append(ts)
            avg_train_reward = sum(reward_hist) / len(reward_hist)
            average_reward_train.append(avg_train_reward)
            num_episodes_train.append(len(reward_hist))
            average_reward_eval.append(avg_rewards)
            num_episodes_eval.append(avg_ep_count)

            logger.info(
                "Achieved an average reward score of {} over {} evaluations."
                " Total episodes: {}, total timesteps: {}.".format(
                    avg_rewards, avg_ep_count, ep_i + 1, ts))

            logger.info(
                "Achieved an average reward score of {} during {} training episodes."
                " Total episodes: {}, total timesteps: {}.".format(
                    avg_train_reward, len(reward_hist), ep_i + 1, ts))
            reward_hist.clear()

        gym_env.decay_epsilon()
        ep_i += 1

    logger.info("Avg. reward history during evaluation for {}: {}".format(
        test_run_name, average_reward_eval))
    logger.info("Avg. reward history during training for {}: {}".format(
        test_run_name, average_reward_train))
    return average_reward_train, num_episodes_train, average_reward_eval, num_episodes_eval, timesteps_history, trainer, predictor, gym_env
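
Example #2 interleaves acting, training, and evaluation on a single timestep counter: training fires every train_every_ts steps once the warm-up period has passed and the replay buffer holds at least a minibatch, and evaluation fires every test_every_ts steps. The dependency-free sketch below illustrates only that cadence; the toy loop body, buffer, and printed messages are illustrative stand-ins, not Horizon/ReAgent APIs.

import random

def run_loop(num_episodes=10, max_steps=50, train_every_ts=20,
             train_after_ts=40, test_every_ts=100, minibatch_size=16):
    replay_buffer = []  # stand-in for the real replay memory
    ts = 0
    for ep_i in range(num_episodes):
        for _ in range(max_steps):
            ts += 1
            replay_buffer.append(random.random())  # stand-in for a transition

            # Training gate: same condition shape as in the example above.
            if (ts % train_every_ts == 0 and ts > train_after_ts
                    and len(replay_buffer) >= minibatch_size):
                print("train at ts={} (buffer size {})".format(ts, len(replay_buffer)))

            # Evaluation gate.
            if ts % test_every_ts == 0:
                print("evaluate at ts={}, episode {}".format(ts, ep_i))

run_loop()
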