Example #1
    def __init__(self, config: Config):
        self.config = config
        self.hyperparameters = config.hyperparameters
        self.env = config.environment_make_function()

        self.memory = TorchReplayBuffer(
            size=10 ** 6,
            phi=config.phi,
            device=self.config.device,
        )
        state_shape = config.phi(self.env.reset()).shape
        self.action_size = self.env.action_space.shape[0]

        self.actor = StateAdaptiveActor(state_shape, self.action_size, self.config.device).to(self.config.device)
        self.actor_target: nn.Module = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.hyperparameters['lr'])

        self.critic = DoubleStateAdaptiveCritic(state_shape, self.action_size, self.config.device).to(self.config.device)
        self.critic_target: nn.Module = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.hyperparameters['lr'])

        self.total_it = 0

        self.stat_logger: Logger = Logger(config, log_interval=self.config.log_interval)

        self.episode_number = 0
        self.global_step_number = 0
        self._total_grad_steps = 0
        self.current_game_stats = None
        self.flush_stats()

        self.accumulated_reward_mean = None
        self.accumulated_reward_std = None

        self._exp_moving_track_progress = 0.0
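
    # Sketch only: the deepcopy'd actor_target / critic_target created above are
    # normally kept in sync with a Polyak (soft) update. The routine below is not
    # part of the original snippet, and the 'tau' default is an assumption.
    def _soft_update(self, local_model: nn.Module, target_model: nn.Module, tau: float = 0.005) -> None:
        # target <- tau * local + (1 - tau) * target
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)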
def make_single_config(agent_config_dict) -> Config:
    config = Config()
    for name, value in agent_config_dict.items():
        if hasattr(config, name):
            setattr(config, name, value)
            continue
        config.hyperparameters.update({name: value})
    return config
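
# Minimal usage sketch (assumes Config declares a 'device' attribute, as the
# constructors above read config.device). Names that Config already knows are set
# directly via setattr; everything else is collected into config.hyperparameters.
example_config = make_single_config({'device': 'cpu', 'lr': 3e-4, 'eps_clip': 0.2})
# example_config.device                  -> 'cpu' (set as an attribute)
# example_config.hyperparameters['lr']   -> 3e-4 (collected into the hyperparameters dict)
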
Example #3
    def __init__(self, config: Config):
        raise ValueError("class under rewriting")
        self.name = config.name
        self.stat_logger: Logger = Logger(
            config,
            log_interval=config.log_interval,
        )
        self.config = config
        self.hyperparameters = config.hyperparameters
        self.eps_clip = config.hyperparameters['eps_clip']

        self.test_env = config.test_environment_make_function()
        self.action_size = self.test_env.action_space.shape[0]
        self.env = None
        self.create_env()

        self.memory = Torch_Arbitrary_Replay_Buffer(
            buffer_size=10**4,
            batch_size=10**4,
            phi=config.phi,
            seed=0,
            device=self.config.device,
            sample_order=[
                'state', 'action', 'reward', 'log_prob', 'done', 'next_state'
            ],
            do_it_auto=False,
        )

        state_shape = config.phi(self.test_env.reset()).shape
        action_size = self.test_env.action_space.shape[0]

        if self.hyperparameters['use_icm']:
            self._icm: ICM = ICM(
                state_description=self.test_env.observation_space,
                action_size=action_size,
                encoded_state_size=6,
                device=self.config.device,
                batch_size=256,
                buffer_size=10**5,
                update_per_step=1,
                config=config.hyperparameters['icm_config'],
            )
        self.ac: ActorCritic = ActorCritic(
            state_shape=state_shape,
            action_size=action_size,
            hidden_size=128,
            device=self.config.device,
            action_std=0.5,
            double_action_size_on_output=False,
        )
        self.optimizer = torch.optim.Adam(
            self.ac.parameters(),
            # chain(
            #     self.ac.parameters(),
            #     self._icm.parameters()
            # ) if self.hyperparameters['use_icm'] else self.ac.parameters(),
            lr=config.hyperparameters['lr'],
            betas=config.hyperparameters['betas'],
        )

        self.ac_old: ActorCritic = ActorCritic(
            state_shape=state_shape,
            action_size=action_size,
            hidden_size=128,
            device=self.config.device,
            action_std=0.5,
            double_action_size_on_output=False,
        )
        self.update_old_policy()
        self.mse = nn.MSELoss()

        self.folder_save_path = os.path.join('model_saves', 'PPO', self.name)
        self.episode_number = 0
        self.global_step_number = 0
        self._total_grad_steps = 0
        self.current_game_stats = None
        self.flush_stats()

        self.accumulated_reward_mean = None
        self.accumulated_reward_std = None

        self._exp_moving_track_progress = 0.0
    def __init__(self, config: Config):
        self.name = config.name
        self.stat_logger: Logger = Logger(
            config,
            log_interval=config.log_interval,
        )
        self.config = config
        self.hyperparameters = config.hyperparameters
        self.eps_clip = config.hyperparameters['eps_clip']

        self.test_env = config.test_environment_make_function()
        self.env = None
        self.create_env()

        self.memory = Torch_Arbitrary_Replay_Buffer(
            buffer_size=10**4,
            batch_size=10**4,
            phi=None,
            seed=0,
            device=self.config.device,
            sample_order=[
                'state', 'action', 'reward', 'log_prob', 'done', 'next_state'
            ],
            do_it_auto=False,
            convert_to_torch=False,
        )

        state_shape = config.phi(self.test_env.reset()).shape
        print(f'state shape : {state_shape}')
        action_size = self.test_env.action_space.shape[0]

        self.ac: ActorCritic = ActorCritic(
            state_shape=state_shape,
            action_size=action_size,
            hidden_size=128,
            device=self.config.device,
            action_std=0.5,
            double_action_size_on_output=False,
        )
        self.optimizer = torch.optim.Adam(
            self.ac.parameters(),
            lr=config.hyperparameters['lr'],
            betas=config.hyperparameters['betas'],
        )

        self.ac_old: ActorCritic = ActorCritic(
            state_shape=state_shape,
            action_size=action_size,
            hidden_size=128,
            device=self.config.device,
            action_std=0.5,
            double_action_size_on_output=False,
        )
        self.update_old_policy()
        self.mse = nn.MSELoss()

        # self.image_transform = transforms.Compose([
        #     transforms.ToPILImage(),
        #     transforms.RandomCrop(
        #         (84, 84),
        #         padding=self.hyperparameters['drq_padding'],
        #         pad_if_needed=True,
        #         padding_mode='edge',
        #     ),
        #     transforms.ToTensor(),
        # ])
        self.image_transform = nn.Sequential(
            nn.ReplicationPad2d(self.hyperparameters['drq_padding']),
            kornia.augmentation.RandomCrop((84, 84)),
        )

        self.folder_save_path = os.path.join('model_saves', 'PPO', self.name)
        self.episode_number = 0
        self.global_step_number = 0
        self._total_grad_steps = 0
        self.current_game_stats = None
        self._wandb_anim_save = 0
        self.flush_stats()

        self.accumulated_reward_mean = None
        self.accumulated_reward_std = None

        self._exp_moving_track_progress = 0.0
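
    # Sketch only: how the DrQ-style self.image_transform built above would typically
    # be applied to a batch of image observations. The method name and its use in the
    # update step are assumptions, not part of the original class.
    def _augment_batch(self, states: torch.Tensor) -> torch.Tensor:
        # expects a float tensor of shape (batch, channels, 84, 84);
        # ReplicationPad2d enlarges each image by 'drq_padding' pixels per side and
        # RandomCrop((84, 84)) then picks an independent random window per sample
        return self.image_transform(states)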
    def __init__(self, config: Config):
        self.name = config.name
        self.stat_logger: Logger = Logger(config,
                                          log_interval=config.log_interval)
        self.config: Config = config
        self.hyperparameters = config.hyperparameters
        self.eps_clip = config.hyperparameters['eps_clip']

        self.test_env = config.test_environment_make_function()
        self.action_size = self.test_env.action_space.shape[0]
        self.env = None
        self.create_env()

        self.memory = Torch_Arbitrary_Replay_Buffer(
            buffer_size=10**4,
            batch_size=10**4,
            phi=config.phi,
            seed=0,
            device=self.config.device,
            sample_order=[
                'state', 'action', 'reward', 'log_prob', 'done', 'next_state'
            ],
            do_it_auto=False,
        )

        state_shape = config.phi(self.test_env.reset()).shape
        action_size = self.test_env.action_space.shape[0]

        self.ac: ActorCritic = ActorCritic(
            state_shape=state_shape,
            action_size=action_size,
            hidden_size=128,
            device=self.config.device,
            action_std=0.5,
            double_action_size_on_output=False,
        )
        self.optimizer = torch.optim.Adam(
            self.ac.parameters(),
            lr=config.hyperparameters['lr'],
            betas=config.hyperparameters['betas'],
        )

        self.ac_old: ActorCritic = ActorCritic(
            state_shape=state_shape,
            action_size=action_size,
            hidden_size=128,
            device=self.config.device,
            action_std=0.5,
            double_action_size_on_output=False,
        )
        self.update_old_policy()
        self.mse = nn.MSELoss()

        self.folder_save_path = os.path.join('model_saves', 'PPO', self.name)
        self.episode_number = 0
        self.global_step_number = 0
        self._total_grad_steps = 0
        self._wandb_anim_save = 0

        self.accumulated_reward_mean = None
        self.accumulated_reward_std = None

        self._exp_moving_track_progress = 0.0
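
    # Sketch only: the pieces created above (self.ac, self.ac_old, self.eps_clip,
    # self.mse) are normally combined into the standard PPO clipped surrogate loss.
    # The update method itself is not part of this snippet; the tensor arguments and
    # the ActorCritic.evaluate call are assumptions about the surrounding codebase.
    def _ppo_loss(self, states, actions, old_log_probs, returns, advantages):
        log_probs, values, entropy = self.ac.evaluate(states, actions)  # assumed ActorCritic API
        ratios = torch.exp(log_probs - old_log_probs.detach())
        surr1 = ratios * advantages
        surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
        # clipped policy loss + value regression - entropy bonus
        return -torch.min(surr1, surr2).mean() + 0.5 * self.mse(values, returns) - 0.01 * entropy.mean()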
Example #6
    def __init__(self, config: Config):
        self.name = config.name
        # self.tf_writer = config.tf_writer
        self.config = config
        self.environment = config.environment_make_function()
        self.action_size = self.environment.action_space.shape[0]

        self.hyperparameters = config.hyperparameters

        self.folder_save_path = os.path.join('model_saves', 'SAC', self.name)

        self.critic_local = QNet(
            state_description=self.environment.observation_space,
            action_size=self.action_size,
            hidden_size=256,
            device=self.config.device,
        )
        self.critic_local_2 = QNet(
            state_description=self.environment.observation_space,
            action_size=self.action_size,
            hidden_size=256,
            device=self.config.device,
        )

        self.critic_optimizer = torch.optim.Adam(
            self.critic_local.parameters(),
            lr=self.hyperparameters["Critic"]["learning_rate"],
            eps=1e-4,
        )
        self.critic_optimizer_2 = torch.optim.Adam(
            self.critic_local_2.parameters(),
            lr=self.hyperparameters["Critic"]["learning_rate"],
            eps=1e-4,
        )

        self.critic_target = QNet(
            state_description=self.environment.observation_space,
            action_size=self.action_size,
            hidden_size=256,
            device=self.config.device,
        )
        self.critic_target_2 = QNet(
            state_description=self.environment.observation_space,
            action_size=self.action_size,
            hidden_size=256,
            device=self.config.device,
        )
        SAC.copy_model_over(self.critic_local, self.critic_target)
        SAC.copy_model_over(self.critic_local_2, self.critic_target_2)

        self.memory = Torch_Separated_Replay_Buffer(
            self.hyperparameters["buffer_size"],
            self.hyperparameters["batch_size"],
            self.hyperparameters["seed"],
            device=self.config.device,
            state_extractor=get_state_combiner_by_settings_file(
                self.hyperparameters['env_settings_file_path']),
            state_producer=from_image_vector_to_combined_state,
        )

        self.actor_local = Policy(
            state_description=self.environment.observation_space,
            action_size=self.action_size,
            hidden_size=256,
            device=self.config.device,
        )
        self.actor_optimizer = torch.optim.Adam(
            self.actor_local.parameters(),
            lr=self.hyperparameters["Actor"]["learning_rate"],
            eps=1e-4)

        # heuristic target entropy of -|A|, as in the SAC paper
        self.target_entropy = -torch.prod(
            torch.Tensor(self.environment.action_space.shape).to(
                self.config.device)).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.config.device)
        self.alpha = self.log_alpha.exp()
        self.alpha_optim = torch.optim.Adam(
            [self.log_alpha],
            lr=self.hyperparameters["Actor"]["learning_rate"],
            eps=1e-4)

        self._game_stats = {}
        self._last_episode_save_count = 0
        self._current_run_global_steps = 0
        self.episode_number = 0
        self.global_step_number = 0
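
    # Sketch only: the temperature machinery created above (self.log_alpha,
    # self.target_entropy, self.alpha_optim) is normally used in the standard SAC
    # automatic entropy tuning step. This method is not part of the original snippet;
    # 'log_pi' (log-probabilities of freshly sampled actions) is an assumption.
    def _update_temperature(self, log_pi: torch.Tensor) -> None:
        alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()
        self.alpha_optim.zero_grad()
        alpha_loss.backward()
        self.alpha_optim.step()
        self.alpha = self.log_alpha.exp()  # refresh the cached temperature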
Example #7
    def __init__(self, config: Config):
        print('start to init rainbow')
        self.config = config
        self.name = config.name
        self.hyperparameters = config.hyperparameters

        self.stat_logger: Logger = Logger(
            config,
            log_interval=config.log_interval * (
                1 + self.hyperparameters['parallel_env_num']
                * int(self.hyperparameters['use_parallel_envs'])
            ),
        )
        if self.hyperparameters['use_parallel_envs']:
            self.env = SubprocVecEnv_tf2(
                [
                    config.environment_make_function
                    for _ in range(self.hyperparameters['parallel_env_num'])
                ],
                state_flatter=None,
            )
        else:
            self.env = config.environment_make_function()

        self.test_env = config.test_environment_make_function()

        # function to prepare raw observations to chainer format
        print(f"rainbow mode : {self.config.mode}")

        n_actions = self.test_env.action_space.n

        n_atoms = 51
        v_max = 10
        v_min = -10
        q_func = DistributionalDuelingDQN_VectorPicture(
            config.phi(self.test_env.reset()).shape,
            n_actions,
            n_atoms,
            v_min,
            v_max,
        )

        # Noisy nets
        links.to_factorized_noisy(
            q_func, sigma_scale=self.hyperparameters['noisy_net_sigma'])
        # Turn off explorer
        explorer = explorers.Greedy()

        # Draw the computational graph and save it in the output directory.
        # chainerrl.misc.draw_computational_graph(
        #     [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
        #     os.path.join(args.outdir, 'model'))

        # Use the same hyperparameters as https://arxiv.org/abs/1707.06887
        opt = chainer.optimizers.Adam(self.hyperparameters['lr'],
                                      eps=1.5 * 10**-4)
        opt.setup(q_func)

        # Prioritized Replay
        # Anneal beta from beta0 to 1 throughout training
        update_interval = 4
        betasteps = self.config.env_steps_to_run / update_interval
        rbuf = replay_buffer.PrioritizedReplayBuffer(
            10**6,
            alpha=0.5,
            beta0=0.4,
            betasteps=betasteps,
            num_steps=3,
            normalize_by_max='memory',
        )

        self.agent = agents.CategoricalDoubleDQN(
            q_func,
            opt,
            rbuf,
            gpu=self.config.rainbow_gpu,
            gamma=0.99,
            explorer=explorer,
            minibatch_size=32,
            replay_start_size=self.hyperparameters['replay_start_size'],
            target_update_interval=16000,
            update_interval=update_interval,
            batch_accumulator='mean',
            phi=config.phi,
        )

        # self.folder_save_path = os.path.join('model_saves', 'Rainbow', self.name)
        self.episode_number = 0
        self.global_step_number = 0
        self.batch_step_number = 0
        self._total_grad_steps = 0
        self.current_game_stats = None
        self.flush_stats()
        # self.tf_writer = config.tf_writer

        self.accumulated_reward_mean = None
        self.accumulated_reward_std = None

        self._exp_moving_track_progress = 0.0
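
    # Sketch only: a minimal single-environment interaction loop for the ChainerRL
    # agent built above, using the non-parallel self.env path. This method is not
    # part of the original snippet; act_and_train / stop_episode_and_train are
    # chainerrl's standard training interface for DQN-family agents.
    def _run_one_episode(self):
        obs = self.env.reset()
        reward, done, episode_reward = 0.0, False, 0.0
        while not done:
            action = self.agent.act_and_train(obs, reward)
            obs, reward, done, _ = self.env.step(action)
            episode_reward += reward
            self.global_step_number += 1
        self.agent.stop_episode_and_train(obs, reward, done)
        self.episode_number += 1
        return episode_reward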