Example #1
    def __init__(
        self,
        state_shape: Tuple[int, ...],
        action_size: int,
        model: BaseModel,
        policy: Policy,
        memory: PrioritizedMemory,
        lr_scheduler: _LRScheduler,
        optimizer: torch.optim.Optimizer,
        batch_size: int = 32,
        gamma: float = 0.95,
        tau: float = 1e-3,
        update_frequency: int = 5,
        seed: Optional[int] = None,
        action_repeats: int = 1,
        gradient_clip: float = 1,
    ):
        """Initialize an Agent object.

        Args:
            state_shape (Tuple[int, ...]): Shape of the state
            action_size (int): Number of possible integer actions
            model (BaseModel): Model producing actions from state
            policy (Policy): Policy used to select actions from the model output
            memory (PrioritizedMemory): Replay memory to sample experiences from
            lr_scheduler (_LRScheduler): Learning-rate scheduler for the optimizer
            optimizer (torch.optim.Optimizer): Optimizer for the online network
            batch_size (int): Minibatch size sampled from memory
            gamma (float): Discount factor
            tau (float): Interpolation factor for soft target-network updates
            update_frequency (int): Number of steps between learning updates
            seed (Optional[int]): Random seed for reproducibility
            action_repeats (int): Number of times each selected action is repeated
            gradient_clip (float): Threshold for gradient clipping
        """
        super().__init__(action_size=action_size, state_shape=state_shape)

        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.update_frequency = update_frequency
        self.gradient_clip = gradient_clip

        self.previous_action: Optional[Action] = None
        self.action_repeats = action_repeats

        # Double DQN
        self.online_qnetwork = model.to(device)
        self.target_qnetwork = deepcopy(model).to(device).eval()

        self.memory = memory

        self.losses = []

        self.policy: Policy = policy
        self.optimizer: torch.optim.Optimizer = optimizer
        self.lr_scheduler: _LRScheduler = lr_scheduler

        if seed is not None:
            set_seed(seed)
            self.online_qnetwork.set_seed(seed)
            self.target_qnetwork.set_seed(seed)
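
The tau stored above is the interpolation factor conventionally used for soft target-network updates in this Double DQN setup. How the agent applies it is not shown here; the following is a minimal sketch of the standard rule theta_target <- tau * theta_online + (1 - tau) * theta_target, with a hypothetical helper name soft_update:

import torch


def soft_update(online: torch.nn.Module, target: torch.nn.Module, tau: float) -> None:
    # Blend target parameters toward the online parameters in place
    with torch.no_grad():
        for online_param, target_param in zip(online.parameters(), target.parameters()):
            target_param.mul_(1.0 - tau).add_(tau * online_param)

Calling something like soft_update(self.online_qnetwork, self.target_qnetwork, self.tau) every update_frequency learning steps is the usual pattern.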
Example #2
    def __init__(self, stream_ids: List[str], capacity, seed=None):
        self.streams: Dict[str, Memory] = {}
        if seed is not None:
            set_seed(seed)

        for s in stream_ids:
            self.streams[s] = Memory(capacity, seed)
Example #3
    def __init__(self, task_name: str, env: UnityEnvironment, seed: int):
        set_seed(seed)
        self.env = env
        self.task_name = task_name

        self.env_info = None
        self.training_scores = None
        self.evaluation_scores = None
Example #4
    def __init__(self, actor_model, critic_model, action_size, continuous_actions: bool, initial_std=0.2,
                 continuous_action_range_clip: Optional[tuple] = (-1, 1), seed=None):
        super(MAPPO_Actor_Critic, self).__init__()
        if seed is not None:
            set_seed(seed)
        self.actor = actor_model
        self.critic = critic_model
        self.action_size = action_size
        self.continuous_actions = continuous_actions
        self.std = nn.Parameter(torch.ones(1, action_size) * initial_std)
        self.continuous_action_range_clip = continuous_action_range_clip
Example #5
    def __init__(self, stream_ids: List[str], capacity, state_shape, beta_scheduler, alpha_scheduler,
                 min_priority: Optional[float] = None, num_stacked_frames=1, seed=None, continuous_actions=False):
        self.streams: Dict[str, PrioritizedMemory] = {}
        if seed is not None:
            set_seed(seed)
        for s in stream_ids:
            self.streams[s] = ExtendedPrioritizedMemory(
                capacity,
                state_shape,
                beta_scheduler,
                alpha_scheduler,
                min_priority=min_priority,
                num_stacked_frames=num_stacked_frames,
                seed=seed,
                continuous_actions=continuous_actions
            )
Example #6
    def __init__(self, state_shape, action_size, seed,
                 map_agent_to_state_slice, map_agent_to_action_slice):
        """Initialize an Agent object.

        Params
        ======
            state_shape (tuple): shape of each state
            action_size (int): dimension of each action
            seed (int): random seed
            map_agent_to_state_slice: mapping from agent id to its slice of the joint state
            map_agent_to_action_slice: mapping from agent id to its slice of the joint action
        """
        super().__init__(action_size=action_size, state_shape=state_shape)
        if seed is not None:
            set_seed(seed)
        # Placeholder actors returning random valid action indices in [0, action_size)
        self.target_actor = lambda x: torch.randint(0, self.action_size,
                                                    (len(x), 1)).to(device)
        self.online_actor = lambda x: torch.randint(0, self.action_size,
                                                    (len(x), 1)).to(device)
        self.online_critic = {}
        self.map_agent_to_state_slice = map_agent_to_state_slice
        self.map_agent_to_action_slice = map_agent_to_action_slice
Example #7
    def __init__(self, capacity: int, state_shape: tuple, beta_scheduler: ParameterScheduler,
                 alpha_scheduler: ParameterScheduler, min_priority: float = 1e-3, seed: int = None,
                 continuous_actions: bool = False):
        self.capacity = capacity

        self.state_shape = state_shape
        self.curr_write_idx = 0
        self.available_samples = 0

        # Memory buffer and priority sum-tree
        self.buffer = ReplayBuffer(state_shape, capacity)
        self.sum_tree = SumTree([0 for _ in range(self.capacity)])

        self.beta_scheduler = beta_scheduler
        self.alpha_scheduler = alpha_scheduler

        self.beta = beta_scheduler.initial
        self.alpha = alpha_scheduler.initial

        self.min_priority = min_priority

        self.continuous_actions = continuous_actions

        if seed is not None:
            set_seed(seed)
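
For context, the alpha and beta scheduled above follow the standard prioritized experience replay formulation: a transition with priority p_i is sampled with probability P(i) = p_i^alpha / sum_k p_k^alpha, and its importance-sampling weight is w_i = (N * P(i))^(-beta), typically normalized by the largest weight. A minimal sketch of those two formulas, independent of the SumTree class used above:

import numpy as np


def per_probabilities(priorities: np.ndarray, alpha: float) -> np.ndarray:
    # P(i) = p_i^alpha / sum_k p_k^alpha
    scaled = priorities ** alpha
    return scaled / scaled.sum()


def per_weights(probabilities: np.ndarray, beta: float) -> np.ndarray:
    # w_i = (N * P(i))^(-beta), normalized by the max weight for stability
    weights = (len(probabilities) * probabilities) ** (-beta)
    return weights / weights.max()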
Example #8
    @staticmethod
    def set_seed(seed: int):
        """Delegate to the module-level set_seed utility."""
        set_seed(seed)
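
Every example on this page delegates to a module-level set_seed utility whose definition is not shown. A common implementation seeds Python's random module, NumPy, and PyTorch together; the sketch below is an assumption about that helper, not the repository's actual code:

import random

import numpy as np
import torch


def set_seed(seed: int) -> None:
    # Seed the Python, NumPy, and PyTorch RNGs for reproducibility
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)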
Example #9
    def __init__(self, seed):
        """Initialize an empty, seeded memory buffer."""
        set_seed(seed)
        self.memory = []
Example #10
    def set_seed(self, seed: int):
        """Set the model's seed for consistency."""
        set_seed(seed)
Example #11
    @staticmethod
    def set_seed(seed):
        if seed is not None:
            set_seed(seed)
Example #12
    def __init__(
        self,
        state_size: int,
        action_size: int,
        seed: int,
        actor_critic_factory: Callable[[], PPO_Actor_Critic],
        optimizer_factory: Callable[[torch.nn.Module.parameters],
                                    torch.optim.Optimizer],
        grad_clip: float = 1.,
        gamma: float = 0.99,
        batch_size: int = 1024,
        gae_factor: float = 0.95,
        epsilon: float = 0.2,
        beta_scheduler: ParameterScheduler = ParameterScheduler(
            initial=0.02, lambda_fn=lambda i: 0.02 * 0.995**i, final=1e-4),
        std_scale_scheduler: ParameterScheduler = ParameterScheduler(
            initial=0.5, lambda_fn=lambda i: 0.5 * 0.995**i, final=0.2),
        continuous_actions: bool = False,
        continuous_action_range_clip: tuple = (-1, 1),
        min_batches_for_training: int = 32,
        num_learning_updates: int = 4,
    ):
        """
        :param state_size: The state size of the agent
        :param action_size: The action size of the agent
        :param seed: Seed for reproducibility
        :param actor_critic_factory: Function returning the actor-critic model
        :param optimizer_factory: Function returning the optimizer for the actor-critic model
        :param grad_clip: Threshold for gradient clipping; gradients with absolute value above it are clipped
        :param gamma: Discount factor
        :param batch_size: SGD minibatch size
        :param gae_factor: The lambda parameter from the GAE paper, used to down-weight delayed rewards
        :param epsilon: Clipping parameter for the surrogate objective
        :param beta_scheduler: Scheduler for beta, the coefficient of the entropy term
        :param std_scale_scheduler: Scheduler for the std of the normal distribution from which actions
            are sampled in the policy network. Only used for continuous actions
        :param continuous_actions: Whether the action space is continuous or discrete
        :param continuous_action_range_clip: The range to clip continuous actions to. Only used for continuous actions
        :param min_batches_for_training: Minimum number of batches to accumulate before performing training
        :param num_learning_updates: Number of epochs to train over before discarding samples
        """
        super().__init__(state_size, action_size)

        if seed is not None:
            set_seed(seed)
        self.online_actor_critic = actor_critic_factory().to(device)
        self.target_actor_critic = actor_critic_factory().to(device).eval()
        self.target_actor_critic.load_state_dict(
            self.online_actor_critic.state_dict())

        self.optimizer = optimizer_factory(
            self.online_actor_critic.parameters())
        self.current_trajectory_memory = Trajectories(seed)
        self.grad_clip = grad_clip
        self.gamma = gamma
        self.batch_size = batch_size
        self.gae_factor = gae_factor

        self.beta_scheduler = beta_scheduler

        self.epsilon = epsilon
        self.beta = self.beta_scheduler.initial
        self.std_scale_scheduler = std_scale_scheduler
        self.std_scale = self.std_scale_scheduler.initial
        self.previous_std_scale = None

        self.continuous_actions = continuous_actions
        self.continuous_action_range_clip = continuous_action_range_clip

        self.min_batches_for_training = min_batches_for_training

        self.num_learning_updates = num_learning_updates

        self.warmup = False
        self.current_trajectory = []
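
gae_factor above is the lambda from generalized advantage estimation: with TD errors delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), advantages follow the backward recursion A_t = delta_t + gamma * lambda * A_{t+1}. The function below is an illustrative sketch of that recursion, not the agent's own implementation:

import numpy as np


def compute_gae(rewards, values, next_values, dones, gamma=0.99, lam=0.95):
    # Backward recursion A_t = delta_t + gamma * lam * (1 - done_t) * A_{t+1}
    advantages = np.zeros(len(rewards))
    gae = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * (1.0 - dones[t]) * next_values[t] - values[t]
        gae = delta + gamma * lam * (1.0 - dones[t]) * gae
        advantages[t] = gae
    return advantages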
Example #13
    def __init__(self,
                 agent_id,
                 policy,
                 state_shape,
                 action_size,
                 seed,
                 critic_factory: Callable,
                 actor_factory: Callable,
                 critic_optimizer_factory: Callable,
                 actor_optimizer_factory: Callable,
                 memory_factory: Callable,
                 num_learning_updates=10,
                 tau: float = 1e-2,
                 batch_size: int = 512,
                 update_frequency: int = 20,
                 critic_grad_norm_clip: int = 1,
                 policy_update_frequency: int = 2,
                 homogeneous_agents: bool = False):

        super().__init__(action_size=action_size, state_shape=state_shape)
        if seed is not None:
            set_seed(seed)
        self.n_seed = seed
        self.num_learning_updates = num_learning_updates
        self.tau = tau
        self.agent_id = agent_id

        self.batch_size = batch_size
        self.update_frequency = update_frequency

        self.critic_grad_norm_clip = critic_grad_norm_clip
        self.policy_update_frequency = policy_update_frequency

        self.policy = policy

        self.homogeneous_agents = homogeneous_agents

        # critic local and target network (Q-Learning)
        if self.homogeneous_agents:
            # All homogeneous agents share one set of networks and optimizers,
            # created lazily by the first agent and stored on the class.
            if MADDPGAgent.online_critic is None:
                MADDPGAgent.online_critic = critic_factory().to(device).float()
                MADDPGAgent.target_critic = critic_factory().to(device).float()
                MADDPGAgent.target_critic.load_state_dict(
                    MADDPGAgent.online_critic.state_dict())

                # actor local and target network (Policy gradient)
                MADDPGAgent.online_actor = actor_factory().to(device).float()
                MADDPGAgent.target_actor = actor_factory().to(device).float()
                MADDPGAgent.target_actor.load_state_dict(
                    MADDPGAgent.online_actor.state_dict())

                # optimizer for critic and actor network
                MADDPGAgent.critic_optimizer = critic_optimizer_factory(
                    MADDPGAgent.online_critic.parameters())
                MADDPGAgent.actor_optimizer = actor_optimizer_factory(
                    MADDPGAgent.online_actor.parameters())

            self.online_critic = MADDPGAgent.online_critic
            self.target_critic = MADDPGAgent.target_critic

            # actor local and target network (Policy gradient)
            self.online_actor = MADDPGAgent.online_actor
            self.target_actor = MADDPGAgent.target_actor

            # optimizer for critic and actor network
            self.critic_optimizer = MADDPGAgent.critic_optimizer
            self.actor_optimizer = MADDPGAgent.actor_optimizer
        else:
            # Each agent owns its networks and optimizers
            self.online_critic = critic_factory().to(device).float()
            self.target_critic = critic_factory().to(device).float()
            self.target_critic.load_state_dict(self.online_critic.state_dict())

            # actor local and target network (Policy gradient)
            self.online_actor = actor_factory().to(device).float()
            self.target_actor = actor_factory().to(device).float()
            self.target_actor.load_state_dict(self.online_actor.state_dict())

            # optimizer for critic and actor network
            self.critic_optimizer = critic_optimizer_factory(
                self.online_critic.parameters())
            self.actor_optimizer = actor_optimizer_factory(
                self.online_actor.parameters())
        self.memory = memory_factory()
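
The homogeneous_agents flag above shares a single set of networks across agents by storing them as class attributes. A minimal standalone sketch of that pattern (SharedNetAgent and make_net are illustrative names, not from the source):

import torch.nn as nn


class SharedNetAgent:
    shared_net = None  # class attribute, common to all instances

    def __init__(self, make_net, shared: bool = True):
        if shared:
            # The first instance creates the network; later instances reuse it
            if SharedNetAgent.shared_net is None:
                SharedNetAgent.shared_net = make_net()
            self.net = SharedNetAgent.shared_net
        else:
            self.net = make_net()  # private copy


a = SharedNetAgent(lambda: nn.Linear(4, 2))
b = SharedNetAgent(lambda: nn.Linear(4, 2))
assert a.net is b.net  # both agents share the same parameters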