Example #1
    def __init__(self, config: Dict[str, Any]):
        super().__init__()

        default_config = {
            "num_subgoals": 2,
            "emb_size": 4,
            "rel_hiddens": (16, 16, ),
            "mlp_hiddens": (16, ),
            "activation": "leaky_relu"
        }

        self.config = with_default_config(config, default_config)

        self.activation: Callable[[Tensor], Tensor] = get_activation(self.config["activation"])

        self.own_embedding = nn.Parameter(torch.randn(self.config["emb_size"])/10., requires_grad=True)
        self.agent_embedding = nn.Parameter(torch.randn(self.config["emb_size"])/10., requires_grad=True)
        self.subgoal_embedding = nn.Parameter(torch.randn(self.config["emb_size"])/10., requires_grad=True)
        self.goal_embedding = nn.Parameter(torch.randn(self.config["emb_size"])/10., requires_grad=True)

        rel_sizes = (2 * (self.config["emb_size"] + 3), ) + self.config["rel_hiddens"]
        mlp_sizes = (self.config["rel_hiddens"][-1], ) + self.config["mlp_hiddens"]

        self.relation_layers = nn.ModuleList([
            nn.Linear(in_size, out_size)
            for in_size, out_size in zip(rel_sizes, rel_sizes[1:])
        ])

        self.mlp_layers = nn.ModuleList([
            nn.Linear(in_size, out_size)
            for in_size, out_size in zip(mlp_sizes, mlp_sizes[1:])
        ])
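
Every example in this listing relies on with_default_config, and most of the network examples also use get_activation; neither helper is shown here. The sketch below is only a guess at the behaviour the calls appear to assume (a recursive merge of the user config over the defaults, and a name-to-function lookup); the project's actual implementations may differ.

from typing import Any, Callable, Dict

import torch
import torch.nn.functional as F
from torch import Tensor


def with_default_config(config: Dict[str, Any], defaults: Dict[str, Any]) -> Dict[str, Any]:
    # Hypothetical sketch: fill in missing keys from defaults, merging nested dicts recursively
    result = dict(defaults)
    for key, value in (config or {}).items():
        if isinstance(value, dict) and isinstance(result.get(key), dict):
            result[key] = with_default_config(value, result[key])
        else:
            result[key] = value
    return result


def get_activation(name: str) -> Callable[[Tensor], Tensor]:
    # Hypothetical sketch: map an activation name to its functional form
    return {"relu": F.relu, "leaky_relu": F.leaky_relu, "tanh": torch.tanh}[name]
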
Example #2
    def __init__(self, config: Dict):
        super().__init__(config)

        default_config = {
            "input_shape": (100, 100),
            "num_actions": 5,
            "activation": "relu",
        }
        self.config = with_default_config(config, default_config)
        self.activation = get_activation(self.config["activation"])

        input_shape: Tuple[int, int] = self.config["input_shape"]

        self.conv_layers = nn.ModuleList([
            nn.Conv2d(4, 32, kernel_size=8, stride=4),  # 24x24x32
            nn.Conv2d(32, 64, kernel_size=7, stride=3),  # 6x6x64
            nn.Conv2d(64, 64, kernel_size=3, stride=1)
        ])  # 4x4x64

        _coords_i = torch.linspace(-1, 1, input_shape[0]).view(-1, 1).repeat(
            1, input_shape[1])
        _coords_j = torch.linspace(-1, 1, input_shape[1]).view(1, -1).repeat(
            input_shape[0], 1)
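        # 2-channel coordinate grid over [-1, 1] (CoordConv-style positional features), shape [2, H, W]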
        self.coords = torch.stack([_coords_i, _coords_j])

        # flatten

        self.policy_head = nn.Linear(4 * 4 * 64, self.config["num_actions"])
        self.value_head = nn.Linear(4 * 4 * 64, 1)
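
The spatial sizes in the inline comments of Example #2 (24x24, 6x6, 4x4) follow from the standard Conv2d output formula, which is also where the 4 * 4 * 64 input size of the policy and value heads comes from. A quick self-contained check:

def conv_out(size: int, kernel: int, stride: int, padding: int = 0) -> int:
    # Standard Conv2d output-size formula (floor division)
    return (size + 2 * padding - kernel) // stride + 1


side = 100
for kernel, stride in [(8, 4), (7, 3), (3, 1)]:
    side = conv_out(side, kernel, stride)
    print(side)  # 24, then 6, then 4 -> flattened feature size 4 * 4 * 64 = 1024
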
Example #3
    def __init__(self, agent: Agent, env: UnityEnvironment, config: Dict[str, Any]):
        super().__init__(agent, env, config)

        default_config = {
            "steps": 2048,

            # Tensorboard settings
            "tensorboard_name": None,  # str, set explicitly

            # PPO
            "ppo_config": {
                # GD settings
                "optimizer": "adam",
                "optimizer_kwargs": {
                    "lr": 1e-4,
                    "betas": (0.9, 0.999),
                    "eps": 1e-7,
                    "weight_decay": 0,
                    "amsgrad": False
                },
                "gamma": .99,  # Discount factor

                # PPO settings
                "ppo_steps": 25,  # Max number of gradient updates in one iteration
                "eps": 0.1,  # PPO clip parameter
                "target_kl": 0.01,  # KL divergence limit
                "value_loss_coeff": 0.1,
                "entropy_coeff": 0.1,
                "max_grad_norm": 0.5,

                # Backpropagation settings
                "use_gpu": False,
            }
        }

        self.config = with_default_config(config, default_config)

        self.collector = Collector(agent=self.agent, env=self.env)
        self.ppo = PPOptimizer(agent=agent, config=self.config["ppo_config"])

        # Setup tensorboard
        self.writer: Optional[SummaryWriter]
        if self.config["tensorboard_name"]:
            dt_string = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
            self.path = Path.home() / "drlnd_logs" / f"{self.config['tensorboard_name']}_{dt_string}"
            self.writer = SummaryWriter(str(self.path))

            # Log the configs
            with open(str(self.path / "trainer_config.json"), "w") as f:
                json.dump(self.config, f)

            with open(str(self.path / "agent_config.json"), "w") as f:
                json.dump(self.agent.model.config, f)

            self.path = str(self.path)
        else:
            self.writer = None
Example #4
    def __init__(self, config: Dict):
        super().__init__(config)

        default_config = {
            "input_size": 15,
            "num_actions": 5,
            "hidden_sizes": (64, 64),
            "activation": "leaky_relu",
        }
        self.config = with_default_config(config, default_config)

        input_size: int = self.config.get("input_size")
        num_actions: int = self.config.get("num_actions")
        hidden_sizes: Tuple[int] = self.config.get("hidden_sizes")
        self.activation: Callable = get_activation(
            self.config.get("activation"))

        layer_sizes = (input_size, ) + hidden_sizes

        self.hidden_layers = nn.ModuleList([
            nn.Linear(in_size, out_size)
            for in_size, out_size in zip(layer_sizes, layer_sizes[1:])
        ])

        self.policy_head = nn.Linear(layer_sizes[-1], num_actions)
        self.value_head = nn.Linear(layer_sizes[-1], 1)
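
Example #4 only defines the layers; the corresponding forward pass is not shown in this listing. A minimal sketch of what it presumably looks like, with a shared trunk feeding separate policy and value heads (the exact return format is an assumption):

    def forward(self, x: Tensor) -> Tuple[Tensor, Tensor]:
        # Hypothetical sketch: shared hidden trunk, then separate policy logits and state value
        for layer in self.hidden_layers:
            x = self.activation(layer(x))
        logits = self.policy_head(x)  # [B, num_actions] unnormalized action logits
        value = self.value_head(x)    # [B, 1] state-value estimate
        return logits, value
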
Example #5
    def __init__(self, config: Dict):
        super().__init__(config)

        default_config = {
            "input_shape": (100, 100),
            "num_actions": 5,
            "activation": "relu",
        }

        self.config = with_default_config(config, default_config)

        input_shape: Tuple[int, int] = self.config["input_shape"]
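        # Note: "input_size" and "hidden_sizes" have no entries in default_config above, so the caller must supply them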
        input_size: int = self.config.get("input_size")
        num_actions: int = self.config.get("num_actions")
        hidden_sizes: Tuple[int] = self.config.get("hidden_sizes")

        self.activation: Callable = get_activation(
            self.config.get("activation"))

        self.conv = nn.Conv2d(3, 3, kernel_size=3, padding=1)

        layer_sizes = (input_size, ) + hidden_sizes

        self.hidden_layers = nn.ModuleList([
            nn.Linear(in_size, out_size)
            for in_size, out_size in zip(layer_sizes, layer_sizes[1:])
        ])

        self.policy_head = nn.Linear(layer_sizes[-1], num_actions)
        self.value_head = nn.Linear(layer_sizes[-1], 1)
Example #6
    def __init__(self, config: Dict):
        super().__init__(config)

        torch.manual_seed(0)

        default_config = {
            "input_size": 33,
            "num_actions": 4,
            "activation": "relu",

            "hidden_sizes": (64, 64),
        }
        self.config = with_default_config(config, default_config)

        input_size: int = self.config.get("input_size")
        num_actions: int = self.config.get("num_actions")
        hidden_sizes: Tuple[int] = self.config.get("hidden_sizes")
        self.activation: Callable = get_activation(self.config.get("activation"))

        layer_sizes = (input_size,) + hidden_sizes

        self.hidden_layers = nn.ModuleList([
            nn.Linear(in_size, out_size)
            for in_size, out_size in zip(layer_sizes, layer_sizes[1:])
        ])
        self.policy_mu_head = nn.Linear(layer_sizes[-1], num_actions)

        self.v_hidden_layers = nn.ModuleList([
            nn.Linear(in_size, out_size)
            for in_size, out_size in zip(layer_sizes, layer_sizes[1:])
        ])

        self.std = nn.Parameter(torch.ones(1, num_actions))

        self.value_head = nn.Linear(layer_sizes[-1], 1)
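
Example #6 defines separate actor and critic trunks plus a learned per-action standard deviation, but again no forward pass. A hedged sketch of the diagonal-Gaussian policy those layers suggest (the tanh squashing of the mean is an assumption):

    def forward(self, x: Tensor):
        # Hypothetical sketch: separate actor/critic trunks, diagonal-Gaussian action distribution
        pi_x, v_x = x, x
        for pi_layer, v_layer in zip(self.hidden_layers, self.v_hidden_layers):
            pi_x = self.activation(pi_layer(pi_x))
            v_x = self.activation(v_layer(v_x))
        mu = torch.tanh(self.policy_mu_head(pi_x))  # action means, assumed to lie in [-1, 1]
        value = self.value_head(v_x)
        return torch.distributions.Normal(mu, self.std), value  # std broadcasts over the batch
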
Example #7
    def __init__(self, agent: Agent, config: Dict[str, Any]):

        self.agent = agent

        default_config = {
            # GD settings
            "optimizer": "adam",
            "optimizer_kwargs": {
                "lr": 1e-3,
                "betas": (0.9, 0.999),
                "eps": 1e-7,
                "weight_decay": 0,
                "amsgrad": False
            },
            "separate_optimizers": False,
            "gamma": 0.95,  # Discount factor

            # "batch_size": 64,
            "minibatches": 32,

            # PPO settings
            "ppo_steps": 5,
            "eps": 0.1,  # PPO clip parameter
            "target_kl": 0.01,  # KL divergence limit
            "value_loss_coeff": 0.1,

            "entropy_coeff": 0.01,
            "entropy_decay_time": 100,  # How many steps to decrease entropy to 0.1 of the original value
            "min_entropy": 0.01,  # Minimum value of the entropy bonus - use this to disable decay

            "max_grad_norm": 0.5,

            # GPU
            "use_gpu": False,
        }
        self.config = with_default_config(config, default_config)

        self.optimizer = get_optimizer(self.config["optimizer"])(agent.model.parameters(),
                                                                 **self.config["optimizer_kwargs"])

        self.gamma: float = self.config["gamma"]
        self.eps: float = self.config["eps"]
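
Example #7 (and later Examples #9 and #11) resolves its optimizer through get_optimizer(name)(params, **kwargs), a helper that is also not shown. A minimal sketch of the lookup that usage implies (the set of supported names is an assumption):

from typing import Type

import torch.optim as optim


def get_optimizer(name: str) -> Type[optim.Optimizer]:
    # Hypothetical sketch: resolve an optimizer name to its torch.optim class
    return {"adam": optim.Adam, "sgd": optim.SGD, "rmsprop": optim.RMSprop}[name.lower()]
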
Example #8
    def __init__(self, config: Dict):
        super().__init__(config)

        default_config = {
            "input_shape": (100, 100),
            "num_actions": 3,
            "activation": "relu",
            "field_threshold": 6,
            "hidden_sizes": (64, 64),
        }

        self.config = with_default_config(config, default_config)
        self.activation = get_activation(self.config["activation"])
        self.field_threshold = self.config["field_threshold"]

        hidden_sizes: Tuple[int] = self.config.get("hidden_sizes")
        input_shape: Tuple[int, int] = self.config["input_shape"]

        _coords_i = torch.linspace(-1, 1, input_shape[0]).view(-1, 1).repeat(
            1, input_shape[1])
        _coords_j = torch.linspace(-1, 1, input_shape[1]).view(1, -1).repeat(
            input_shape[0], 1)
        self.coords = torch.stack([_coords_i, _coords_j])

        self.bilinear = nn.Bilinear(2, 2, 4)
        self.pool1 = nn.AvgPool2d((100, self.field_threshold))
        self.pool2 = nn.AvgPool2d((100, 100 - 2 * self.field_threshold))
        self.pool3 = nn.AvgPool2d((100, self.field_threshold))

        # concat + flatten to [B, 3*4]
        layer_sizes = (12, ) + hidden_sizes

        self.hidden_layers = nn.ModuleList([
            nn.Linear(in_size, out_size)
            for in_size, out_size in zip(layer_sizes, layer_sizes[1:])
        ])

        self.policy_head = nn.Linear(layer_sizes[-1],
                                     self.config["num_actions"])
        self.value_head = nn.Linear(layer_sizes[-1], 1)
Example #9
    def __init__(self, agents: Dict[str, Agent], config: Dict[str, Any]):

        self.agents = agents

        default_config = {
            # GD settings
            "optimizer": "adam",
            "optimizer_kwargs": {
                "lr": 1e-3,
                "betas": (0.9, 0.999),
                "eps": 1e-7,
                "weight_decay": 0,
                "amsgrad": False
            },

            # "batch_size": 64,
            "minibatches": 32,

            # PPO settings
            "ppo_steps": 5,
            "eps": 0.1,  # PPO clip parameter
            "target_kl": 0.01,  # KL divergence limit
            "value_loss_coeff": 0.1,
            "entropy_coeff": 0.01,
            "max_grad_norm": 0.5,

            # GPU
            "use_gpu": False,
        }
        self.config = with_default_config(config, default_config)

        self.optimizers = {
            agent_id: get_optimizer(self.config["optimizer"])(
                agent.model.parameters(), **self.config["optimizer_kwargs"])
            for agent_id, agent in self.agents.items()
        }

        self.eps: float = self.config["eps"]
Example #10
    def __init__(self, agents: Dict[str, Agent], env: UnityEnvironment,
                 config: Dict[str, Any]):
        super().__init__(agents, env, config)

        default_config = {
            "steps": 2000,

            # Tensorboard settings
            "tensorboard_name": None,  # str, set explicitly
            "gamma": .99,  # Discount factor
            "tau": .95,

            # PPO
            "ppo_config": {
                "optimizer": "adam",
                "optimizer_kwargs": {
                    "lr": 1e-3,
                    "betas": (0.9, 0.999),
                    "eps": 1e-7,
                    "weight_decay": 0,
                    "amsgrad": False
                },

                # "batch_size": 64,
                "minibatches": 32,

                # PPO settings
                "ppo_steps": 5,
                "eps": 0.1,  # PPO clip parameter
                "target_kl": 0.01,  # KL divergence limit
                "value_loss_coeff": 0.1,
                "entropy_coeff": 0.01,
                "max_grad_norm": 0.5,

                # GPU
                "use_gpu": False,
            }
        }

        self.config = with_default_config(config, default_config)

        self.collector = Collector(agents=self.agents, env=self.env)
        self.ppo = PPOptimizer(agents=agents, config=self.config["ppo_config"])

        # Setup tensorboard
        self.writer: Optional[SummaryWriter]
        if self.config["tensorboard_name"]:
            dt_string = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
            self.path = Path.home() / "drlnd_logs" / f"{self.config['tensorboard_name']}_{dt_string}"
            self.writer = SummaryWriter(str(self.path))

            self.agent_paths = [
                self.path / agent_id for agent_id in self.agents
            ]

            for agent_path in self.agent_paths:
                os.mkdir(str(agent_path))

            # Log the configs
            with open(str(self.path / "trainer_config.json"), "w") as f:
                json.dump(self.config, f)

            with open(str(self.path / "agent0_config.json"), "w") as f:
                json.dump(self.agents["Agent0"].model.config, f)

            with open(str(self.path / "agent1_config.json"), "w") as f:
                json.dump(self.agents["Agent1"].model.config, f)

            self.path = str(self.path)
        else:
            self.writer = None
Example #11
    def __init__(self, agents: Dict[str, Agent], env: MultiAgentEnv,
                 config: Dict[str, Any]):
        self.agents = agents
        self.agent_ids: List[str] = list(agents.keys())

        self.env = env

        default_config = {
            # Trainer settings
            "agents_to_optimize": None,  # ids of agents that should be optimized
            "batch_size": 10000,  # Number of steps to sample at each iteration, TODO: make it possible to use epochs
            # Agent settings
            "optimizer": "adam",
            "optimizer_kwargs": {
                "lr": 1e-3,
                "betas": (0.9, 0.999),
                "eps": 1e-7,
                "weight_decay": 0,
                "amsgrad": False
            },
            "gamma": 0.95,  # Discount factor
            "preserve_channels": False,

            # PPO settings
            "ppo_steps": 25,
            "eps": 0.1,  # PPO clip parameter
            "target_kl": 0.01,  # KL divergence limit
            "value_loss_coeff": 0.1,
            "entropy_coeff": 0.1,

            # Tensorboard settings
            "tensorboard_name": "test",

            # Compatibility
            "tuple_mode": False,

            # GPU
            "use_gpu": False,
        }
        self.config = with_default_config(config, default_config)

        self.agents_to_optimize: List[str] = self.agent_ids if self.config['agents_to_optimize'] is None \
            else self.config['agents_to_optimize']

        self.optimizers: Dict[str, Optimizer] = {
            agent_id: get_optimizer(self.config["optimizer"])(
                agent.model.parameters(), **self.config["optimizer_kwargs"])
            for agent_id, agent in self.agents.items()
            if agent_id in self.agents_to_optimize
        }

        self.gamma: float = self.config["gamma"]  # TODO use @property instead?
        self.eps: float = self.config["eps"]

        self.writer: Optional[SummaryWriter]
        if self.config["tensorboard_name"]:
            dt_string = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
            self.path = Path.home() / "tb_logs" / f"{self.config['tensorboard_name']}_{dt_string}"
            self.writer = SummaryWriter(str(self.path))

            # Log the configs
            with open(str(self.path / "trainer_config.pkl"), "wb") as f:
                pickle.dump(self.config, f)

            for agent_id in self.agent_ids:
                with open(str(self.path / f"{agent_id}_config.pkl"),
                          "wb") as f:
                    pickle.dump(self.agents[agent_id].model.config, f)

            with open(str(self.path / "env_config.pkl"), "wb") as f:
                try:
                    env_config = self.env.config
                    pickle.dump(env_config, f)
                except AttributeError:
                    pass
        else:
            self.writer = None

        self.collector = Collector(agents=self.agents,
                                   env=self.env,
                                   tuple_mode=self.config["tuple_mode"])
Example #12
    def train_on_data(self,
                      data_batch: DataBatch,
                      step: int = 0,
                      extra_metrics: Optional[Dict[str, Any]] = None,
                      timer: Optional[Timer] = None):
        """
        Performs a single PPO update on the given batch of data, for each agent in agents_to_optimize.

        Args:
            data_batch: DataBatch, dictionary of per-agent observations, actions, rewards, logprobs and dones
            step: current training iteration, used as the step value when logging metrics
            extra_metrics: optional precomputed metrics to merge into the metrics logged here
            timer: optional Timer used to time the update; a new one is created if None

        Returns:
            None; the collected metrics are written via self.write_dict
        """
        metrics = {}
        if timer is None:
            timer = Timer()
        for agent_id in self.agents_to_optimize:
            agent = self.agents[agent_id]
            optimizer = self.optimizers[agent_id]

            ####################################### Unpack and prepare the data #######################################
            obs_batch = data_batch['observations'][agent_id]
            action_batch = data_batch['actions'][agent_id]
            reward_batch = data_batch['rewards'][agent_id]
            old_logprobs_batch = data_batch['logprobs'][agent_id]
            done_batch = data_batch['dones'][agent_id]

            if self.config["use_gpu"]:
                obs_batch = obs_batch.cuda()
                action_batch = action_batch.cuda()
                old_logprobs_batch = old_logprobs_batch.cuda()
                agent.model.cuda()

            logprob_batch, value_batch, entropy_batch = agent.evaluate_actions(
                obs_batch, action_batch)

            discounted_batch = discount_rewards_to_go(reward_batch, done_batch,
                                                      self.gamma)

            if self.config["use_gpu"]:
                discounted_batch = discounted_batch.cuda()

            advantages_batch = (discounted_batch -
                                value_batch.view(-1)).detach()
            advantages_batch = (advantages_batch - advantages_batch.mean()) / (
                advantages_batch.std() + 1e-6)

            # Initialize metrics
            kl_divergence = 0.
            ppo_step = 0
            value_loss = torch.tensor(0)
            policy_loss = torch.tensor(0)
            loss = torch.tensor(0)
            timer.checkpoint()
            for ppo_step in range(self.config["ppo_steps"]):
                logprob_batch, value_batch, entropy_batch = agent.evaluate_actions(
                    obs_batch, action_batch)

                ######################################### Compute the loss #############################################
                prob_ratio = torch.exp(logprob_batch - old_logprobs_batch)
                surr1 = prob_ratio * advantages_batch
                surr2 = torch.clamp(prob_ratio, 1. - self.eps,
                                    1 + self.eps) * advantages_batch

                kl_divergence = torch.mean(
                    old_logprobs_batch -
                    logprob_batch).item()  # review formula?

                policy_loss = -torch.min(surr1, surr2)
                value_loss = (value_batch.view(-1) - discounted_batch)**2

                loss_batch = (
                    policy_loss.mean() +
                    self.config["value_loss_coeff"] * value_loss.mean() -
                    self.config["entropy_coeff"] * entropy_batch.mean())

                loss = loss_batch.mean()

                ########################################### Update step ###############################################

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                ### Early stopping ###
                if kl_divergence > self.config["target_kl"]:
                    break

            if self.config["use_gpu"]:
                agent.model.cpu()

            metrics[f"{agent_id}/time_update"] = timer.checkpoint()
            metrics[f"{agent_id}/kl_divergence"] = kl_divergence
            metrics[f"{agent_id}/steps_made"] = ppo_step
            metrics[f"{agent_id}/policy_loss"] = policy_loss.mean().item()
            metrics[f"{agent_id}/value_loss"] = value_loss.mean().item()
            metrics[f"{agent_id}/total_loss"] = loss.detach().item()

            ############################################# Collect metrics ############################################

            # Delay by one, so that the new episode starts after a done=True, with a 0 at the beginning
            episode_indices = done_batch.cumsum(dim=0)[:-1]
            episode_indices = torch.cat(
                [torch.tensor([0]),
                 episode_indices])  # [0, 0, 0, ..., 1, 1, ..., 2, ..., ...]
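            # e.g. dones [0, 0, 1, 0, 1] -> cumsum [0, 0, 1, 1, 2] -> shifted [0, 0, 0, 1, 1]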

            ep_ids, ep_lens_tensor = torch.unique(episode_indices,
                                                  return_counts=True)
            ep_lens = tuple(ep_lens_tensor)  # tuple of episode lengths

            # Group rewards by episode and sum them up
            ep_rewards = torch.tensor([
                torch.sum(rewards)
                for rewards in torch.split(reward_batch, ep_lens)
            ])

            ### Add new training-based metrics here ###
            metrics[f"{agent_id}/episode_len_mean"] = torch.mean(
                ep_lens_tensor.float()).item()
            metrics[f"{agent_id}/episode_reward_mean"] = torch.mean(
                ep_rewards).item()
            metrics[f"{agent_id}/episode_reward_median"] = torch.median(
                ep_rewards).item()
            metrics[f"{agent_id}/episode_reward_min"] = torch.min(
                ep_rewards).item()
            metrics[f"{agent_id}/episode_reward_max"] = torch.max(
                ep_rewards).item()
            metrics[f"{agent_id}/episode_reward_std"] = torch.std(
                ep_rewards).item()
            metrics[f"{agent_id}/episodes_this_iter"] = len(ep_ids)
            metrics[f"{agent_id}/mean_entropy"] = torch.mean(
                entropy_batch).item()

            metrics[f"{agent_id}/winrate"] = (reward_batch[torch.nonzero(
                done_batch).view(-1)].mean().item() + 1) / 2

            if extra_metrics is not None:
                metrics = with_default_config(
                    metrics,
                    extra_metrics)  # add extra_metrics if not computed here
            self.write_dict(metrics, step)
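
train_on_data relies on discount_rewards_to_go(rewards, dones, gamma), which is also not shown in this listing. A minimal sketch of the standard rewards-to-go computation the call implies, restarting the running return at every done flag (the project's actual implementation may be vectorised differently):

import torch
from torch import Tensor


def discount_rewards_to_go(rewards: Tensor, dones: Tensor, gamma: float) -> Tensor:
    # Hypothetical sketch: discounted rewards-to-go, reset at episode boundaries
    returns = torch.zeros_like(rewards)
    running = 0.0
    for i in reversed(range(len(rewards))):
        if dones[i]:
            running = 0.0  # step i ends an episode, so its return is just rewards[i]
        running = rewards[i] + gamma * running
        returns[i] = running
    return returns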