Example #1
 def save(self):
     """
     Save a model. Used by train at the interval specified by save_interval.
     """
     path = f"models/{self.model_name}.pt"
     save(self.model, path)
     debug(f"model saved in {path}")
Example #2
 def load(self):
     """
     Load a model. Used by visualize to load a trained model.
     """
     files = [
         f.rpartition(".pt")[0] for f in os.listdir("models")
         if f != ".gitignore"
     ]
     if self.model_name not in files:
         valid_model_names = ", ".join(files)
         raise AlgorithmError(
             f"Choose a valid model name: {valid_model_names}")
     path = f"models/{self.model_name}.pt"
     debug(f"model loaded from {path}")
     return load(path)
Example #3
File: drqn.py Project: ollema/purl
    def push(self, episode, reward, epsilon):
        eps = (episode - self.current_episode) / (time.time() - self.current_time)
        self.current_episode = episode
        self.current_time = time.time()

        self.total_rewards.append(reward)
        mean_reward = np.mean(self.total_rewards[-100:])

        self.writer.add_scalar("epsilon", epsilon, episode)
        self.writer.add_scalar("episodes_per_second", eps, episode)
        self.writer.add_scalar("reward_avg_100", mean_reward, episode)
        self.writer.add_scalar("reward", reward, episode)

        if episode % self.debug_every == 0:
            debug(f"episode {episode:6d} finished - avg. reward: {mean_reward:2f}")
Example #4
    def visualize(self):
        self.model = self.load()
        self.acmodel = self.model["acmodel"]
        self.acmodel.eval()

        env = gym.make(self.env.spec.id)
        env.seed(self.seed)
        self.env = SubprocVecEnv([env])

        self.obs = self.env.reset()
        done = False

        while True:
            action, _, _ = self.select_action(self.obs)
            obs, reward, done, _ = self.env.step(action.cpu().numpy())
            self.obs = obs
            self.env.render()
            time.sleep(1 / self.fps)

            if done[0]:
                debug(f"reward: {reward[0]}")
Example #5
File: main.py Project: ollema/purl
def run(action, args):
    algorithms = {}
    debug("algorithms detected:")
    for algorithm in ReinforcementLearningAlgorithm.subclasses:
        debug(algorithm.__name__)
        algorithms.update({algorithm.__name__: algorithm})
    debug("")

    try:
        if args.algorithm not in algorithms:
            valid_algorithms = ", ".join(algorithms.keys())
            error(f"choose a valid algorithm: {valid_algorithms}")
            return 1
        try:
            env = gym.make(args.environment)
        except gym.error.Error:
            valid_environments = "\n".join([
                env.id for env in gym.envs.registry.all()
                if env.id.startswith("MiniGrid")
            ])
            error(f"choose a valid gym enviroment:\n{valid_environments}")
            return 1

        algo = algorithms[args.algorithm](env=env, args=args)
        with logger.catch(reraise=True):
            if action == "train":
                algo.train()
            if action == "visualize":
                algo.visualize()

    except AlgorithmError as e:
        error(e.msg)
        return 1

    except Exception as e:
        error(e)
        return 1
    return 0
Example #6
 def __exit__(self, *args):
     debug("closing tensorboard")
     self.writer.close()
Example #7
    def train(self):
        Q = self.model["q_table"]

        eps = self.start_eps
        rewards = []

        for i in range(1, self.max_num_updates + 1):
            # reduce chance for random action
            if eps > self.end_eps:
                eps -= self.eps_decay

            if self.seed:
                self.env.seed(self.seed)
            obs = self.env.reset()
            obs = preprocess_obs(obs, self.q_table_length,
                                 self.discrete_obs_space)

            current_reward = 0
            done = False

            while True:
                # get q values
                q = Q[obs, :]

                # greedy-epsilon
                if np.random.rand(1) < eps:
                    # sample random action from action space
                    a = self.env.action_space.sample()
                else:
                    # choose action with highest Q value
                    a = np.argmax(q)

                # get next observation, reward and done from environment
                next_obs, reward, done, _ = self.env.step(a)
                next_obs = preprocess_obs(next_obs, self.q_table_length,
                                          self.discrete_obs_space)

                # construct a target
                next_q_max = np.max(Q[next_obs, :])
                target_q = next_q_max * self.y + reward

                # update q-table with new knowledge
                Q[obs, a] = (1 - self.lr) * Q[obs, a] + self.lr * target_q

                # update variables for next iteration
                current_reward += reward
                obs = next_obs

                if self.render_interval != 0 and i % self.render_interval == 0:
                    self.env.render()
                    time.sleep(1 / self.fps)

                if done:
                    break

            rewards.append(current_reward)

            if i % 100 == 0:
                debug(
                    f"episode {i:5d} finished - avg. reward: {np.average(rewards[-100:]):.2f}"
                )

            if self.save_interval != 0 and i % self.save_interval == 0:
                self.save()

        success(f"all {self.max_num_updates:5d} episodes finished!")
        info(f"reward for the final episode: {rewards[-1]:2f}")

        if self.save_interval != 0:
            self.save()

        debug("plotting reward over episodes")
        matplotlib.rcParams["figure.dpi"] = 100
        plt.plot(rewards)
        plt.plot(savgol_filter(rewards, 23, 3), "-r", linewidth=2.0)
        plt.title(self.model_name)
        plt.xlabel("episode")
        plt.ylabel("reward")
        plt.show()
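
The line Q[obs, a] = (1 - self.lr) * Q[obs, a] + self.lr * target_q above is the standard tabular Q-learning update with learning rate self.lr and discount factor self.y. A minimal standalone sketch of that same update, using illustrative names that are not part of the project:

    import numpy as np

    def q_update(Q, s, a, reward, s_next, lr, gamma):
        # target: observed reward plus the discounted value of the best next action
        target = reward + gamma * np.max(Q[s_next, :])
        # move Q(s, a) a fraction lr of the way towards the target
        Q[s, a] = (1 - lr) * Q[s, a] + lr * target
        return Q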
Example #8
File: q_network.py Project: ollema/purl
    def train(self):
        q_net = self.model["q_network"]
        q_net.train()

        # loss function, could experiment with alternatives like Huber loss (F.smooth_l1_loss) too
        criterion = F.mse_loss
        # optimizer, could experiment with alternatives like AdaBound (adabound.AdaBound) too
        optimizer = optim.SGD(q_net.parameters(), lr=self.lr)

        eps = self.start_eps
        rewards = []

        for i in range(1, self.max_num_updates + 1):
            # reduce chance for random action
            if eps > self.end_eps:
                eps -= self.eps_decay

            if self.seed:
                self.env.seed(self.seed)
            obs = self.env.reset()
            obs = preprocess_obs(obs, self.in_features,
                                 self.discrete_obs_space)

            current_reward = 0
            done = False

            while True:
                # get q values
                q = q_net(obs.unsqueeze(0))

                # greedy-epsilon
                if np.random.rand(1) < eps:
                    # sample random action from action space
                    a = self.env.action_space.sample()
                else:
                    with torch.no_grad():
                        # choose action with highest Q value
                        a = q.argmax().item()

                # get next observation, reward and done from environment
                next_obs, reward, done, _ = self.env.step(a)
                next_obs = preprocess_obs(next_obs, self.in_features,
                                          self.discrete_obs_space)

                # construct a target (compare this to a label in supervised learning) by taking
                # our current q values and replacing the q value for the action chosen with:
                # the max q value in the next observation * discount factor + the reward
                next_q = q_net(next_obs.unsqueeze(0))
                next_q_max = next_q.max().item()
                target_q = q.detach().clone()  # detach and clone to get an independent copy as the target
                target_q[0, a] = next_q_max * self.y + reward

                # compute loss
                loss = criterion(q, target_q)

                # optimize: backprop and update weights
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # update variables for next iteration
                current_reward += reward
                obs = next_obs

                if self.render_interval != 0 and i % self.render_interval == 0:
                    self.env.render()
                    time.sleep(1 / self.fps)

                if done:
                    break

            rewards.append(current_reward)

            if i % 100 == 0:
                debug(
                    f"episode {i:5d} finished - avg. reward: {np.average(rewards[-100:]):.2f}"
                )

            if self.save_interval != 0 and i % self.save_interval == 0:
                self.save()

        success(f"all {self.max_num_updates:5d} episodes finished!")
        info(f"reward for the final episode: {rewards[-1]:2f}")

        if self.save_interval != 0:
            self.save()

        debug("plotting reward over episodes")
        matplotlib.rcParams["figure.dpi"] = 200
        plt.plot(rewards)
        plt.plot(savgol_filter(rewards, 23, 3), "-r", linewidth=2.0)
        plt.title(self.model_name)
        plt.xlabel("episode")
        plt.ylabel("reward")
        plt.show()
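
The target construction above mirrors the tabular update in Example #7, but applied to the network's output: the predicted q vector is detached and cloned, only the entry for the chosen action is replaced with reward + discount * max next q, so the MSE loss only pushes on that one output. A minimal sketch of just that step, with illustrative names not taken from the project:

    import torch
    import torch.nn.functional as F

    def td_loss(q, next_q, action, reward, gamma):
        # start from the network's own prediction so the loss is zero for
        # every action except the one that was actually taken
        target = q.detach().clone()
        target[0, action] = reward + gamma * next_q.max().item()
        # gradients flow only through q, since target is detached
        return F.mse_loss(q, target)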
Example #9
File: main.py Project: ollema/purl
def main():
    p = argparse.ArgumentParser()
    p.add_argument("--log-time-stamps", action="store_true", default=False)

    subp = p.add_subparsers(dest="subcmd_name")

    p_train = subp.add_parser("train",
                              formatter_class=argparse.RawTextHelpFormatter)
    p_train.add_argument(
        "--algorithm",
        type=str,
        required=True,
        metavar="algo",
        help="str:   reinforcement learning algorithm algo to use.",
    )
    p_train.add_argument(
        "--environment",
        type=str,
        required=True,
        metavar="env",
        help="str:   minigrid environment env to use.",
    )
    p_train.add_argument(
        "--learning-rate",
        type=float,
        default=None,
        metavar="α",
        help="float: learning rate α to use.",
    )
    p_train.add_argument(
        "--discount-factor",
        type=float,
        default=None,
        metavar="γ",
        help="float: discount factor γ to use.",
    )
    p_train.add_argument(
        "--start-eps",
        type=float,
        default=None,
        metavar="se",
        help="float: anneal epsilon used in greedy-epsilon from se.",
    )
    p_train.add_argument(
        "--end-eps",
        type=float,
        default=None,
        metavar="ee",
        help="float: anneal epsilon used in greedy-epsilon to ee.",
    )
    p_train.add_argument(
        "--annealing-steps",
        type=float,
        default=None,
        metavar="as",
        help="float: decay epsilon over as steps.",
    )
    p_train.add_argument(
        "--updates",
        type=int,
        default=None,
        metavar="n",
        help="int:   train model for up to n updates",
    )
    p_train.add_argument(
        "--render-interval",
        type=int,
        default=None,
        metavar="i",
        help="int:   if i > 0, render every i:th episode",
    )
    p_train.add_argument(
        "--save-interval",
        type=int,
        default=None,
        metavar="j",
        help="int:   if j > 0, save model every j:th episode",
    )
    p_train.add_argument(
        "--model-name",
        type=str,
        default=None,
        metavar="name",
        help=
        "str:   save model as models/<name>.pt when (if) the model is saved",
    )
    p_train.add_argument("--seed",
                         type=int,
                         default=None,
                         metavar="seed",
                         help="int:   seed used for all randomness")
    p_train.add_argument(
        "--fps",
        type=int,
        default=None,
        metavar="fps",
        help="int:   rendering delay = 1/fps + time to compute next action",
    )
    p_train.add_argument("--tensorboard",
                         action="store_true",
                         help="bool:  use tensorboard")
    p_train.set_defaults(action="train")

    p_visualize = subp.add_parser("visualize")
    p_visualize.add_argument(
        "--algorithm",
        type=str,
        required=True,
        metavar="algo",
        help="str:   reinforcement learning algorithm algo to use.",
    )
    p_visualize.add_argument(
        "--environment",
        type=str,
        required=True,
        metavar="env",
        help="str:   minigrid environment env to use.",
    )
    p_visualize.add_argument(
        "--model-name",
        type=str,
        default=None,
        metavar="name",
        help="str:   load model from models/<name>.pt",
    )
    p_visualize.add_argument("--seed",
                             type=int,
                             default=None,
                             metavar="seed",
                             help="int:   seed used for all randomness")
    p_visualize.add_argument(
        "--fps",
        type=int,
        default=None,
        metavar="fps",
        help="int:   rendering delay = 1/fps + time to compute next action",
    )
    p_visualize.set_defaults(action="visualize")

    args = p.parse_args()

    fmt = get_format(args.log_time_stamps)
    config = {"handlers": [{"sink": stderr, "format": fmt}]}
    logger.configure(**config)

    if not hasattr(args, "action"):
        error("You need to select a subcommand {train, visualize}")
        info("\n" + p_train.format_usage() + p_visualize.format_usage())
        return 1
    try:
        result = run(args.action, args)

        debug(f"{args.subcmd_name} returned {result}")
    except KeyboardInterrupt:
        error("Interrupted by user")
        return 1
    return result
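
For reference, the parser above is driven through the train and visualize subcommands, so a training run would be parsed from arguments shaped roughly like the following (the algorithm and environment names are illustrative placeholders, not taken from the project):

    args = p.parse_args([
        "train",
        "--algorithm", "QNetwork",
        "--environment", "MiniGrid-Empty-8x8-v0",
        "--model-name", "my_model",
    ])
    # args.action == "train" and args.subcmd_name == "train",
    # so main() dispatches to run("train", args)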