Example #1
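# os, pickle and torch are needed by this snippet; PPO, PPOC, print_config,
# produce_transfer_config and the parsed `args` are assumed to come from the
# surrounding project module.
import os
import pickle

import torch
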
def main():
    # Restore the experiment configuration saved alongside the checkpoint
    with open(args.model_dir + "/config.p", "rb") as f:
        checkpoint_config = pickle.load(f)

    print("{}\n{}\n{}".format("#" * 80, "CHECKPOINT CONFIG", "#" * 80))
    print_config(checkpoint_config)

    # Derive a config for transfer learning from the checkpointed one
    transfer_config = produce_transfer_config(checkpoint_config)

    print("{}\n{}\n{}".format("*" * 80, "FOR TRANSFER", "*" * 80))
    print_config(transfer_config)
    print("{}\n{}\n{}".format("#" * 80, "CHECKPOINT CONFIG", "#" * 80))

    # A non-negative device index selects a GPU; otherwise fall back to CPU
    if args.device >= 0:
        transfer_config.training.device = args.device
    else:
        transfer_config.training.device = "cpu"

    # Instantiate the agent class recorded in the config
    if transfer_config.algorithm.name == "PPO":
        agent = PPO(transfer_config)
    elif transfer_config.algorithm.name == "PPOC":
        agent = PPOC(transfer_config)
    else:
        raise ValueError("Unknown model type")

    # Load the saved policy weights for the requested episode
    checkpoint = torch.load(
        os.path.join(
            args.model_dir, "checkpoints", "episode_{}".format(args.episode)
        )
    )
    agent.policy.load_state_dict(checkpoint["policy"])

    agent.train()
Example #2
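# Assumes the project imports used throughout these examples: numpy as np,
# torch, and PPO, MultiEnv, MLP, NoCuriosity and the reward/advantage
# estimators from the surrounding package (see Example #5's import block).
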
def agent_discrete():
    agent = PPO(MultiEnv('CartPole-v1', 4),
                normalize_state=False,
                normalize_reward=False,
                model_factory=MLP.factory(),
                curiosity_factory=NoCuriosity.factory(),
                reward=GeneralizedRewardEstimation(gamma=0.99, lam=0.95),
                advantage=GeneralizedAdvantageEstimation(gamma=0.99, lam=0.95),
                learning_rate=5e-3,
                clip_range=0.2,
                v_clip_range=0.2,
                c_entropy=1e-2,
                c_value=0.5,
                n_mini_batches=4,
                n_optimization_epochs=5,
                clip_grad_norm=0.5)
    agent.to(torch.device('cpu'), torch.float32, np.float32)
    return agent
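A minimal usage sketch for this fixture-style constructor, assuming the `learn`/`eval` API shown in Examples #5 and #6 (the epoch and step counts here are illustrative, not from the original):

agent = agent_discrete()
agent.learn(epochs=200, n_steps=500)   # train on the 4 vectorized CartPole-v1 envs
agent.eval(n_steps=500, render=False)  # roll out the trained policy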
Example #3
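# Same assumed imports as Example #2, plus ICM and MlpICMModel, which
# presumably live in the project's curiosity and model modules.
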
def agent_continuous():
    agent = PPO(
        MultiEnv('Pendulum-v0', 10),
        normalize_state=True,
        normalize_reward=True,
        model_factory=MLP.factory(),
        curiosity_factory=ICM.factory(MlpICMModel.factory(),
                                      policy_weight=1,
                                      reward_scale=0.01,
                                      weight=0.2,
                                      intrinsic_reward_integration=0.01),
        # curiosity_factory=NoCuriosity.factory(),
        reward=GeneralizedRewardEstimation(gamma=0.95, lam=0.1),
        advantage=GeneralizedAdvantageEstimation(gamma=0.95, lam=0.1),
        learning_rate=4e-4,
        clip_range=0.3,
        v_clip_range=0.3,
        c_entropy=1e-2,
        c_value=0.5,
        n_mini_batches=32,
        n_optimization_epochs=10,
        clip_grad_norm=0.5)
    agent.to(torch.device('cpu'), torch.float32, np.float32)
    return agent
Example #4
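    # Constructor of an evaluation wrapper (its class definition is elided in
    # the original snippet); os, pickle, torch, PPO and PPOC are assumed to be
    # imported at module level.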
    def __init__(self, config):
        self.config = config
        self.n_eval_steps = config.n_eval_steps

        with open(config.model_dir + "/config.p", "rb") as f:
            self.checkpoint_config = pickle.load(f)

        # Override the training-time settings with evaluation-time ones
        self.checkpoint_config.eval = config
        self.checkpoint_config.training.max_episodes = config.max_episodes
        self.checkpoint_config.training.max_episode_length = config.max_episode_length
        self.checkpoint_config.experiment.render = config.render
        self.checkpoint_config.experiment.save_episode_data = config.save_episode_data
        self.checkpoint_config.experiment.log_interval = 1
        self.checkpoint_config.experiment.num_steps_between_plot = 1
        self.checkpoint_config.experiment.every_n_episodes = 1

        self.checkpoint_config.training.update_every = config.n_eval_steps

        print(self.checkpoint_config.algorithm.name)

        # Instantiate the agent class recorded in the checkpoint config
        if self.checkpoint_config.algorithm.name == "PPO":
            self.model = PPO(self.checkpoint_config)
        elif self.checkpoint_config.algorithm.name == "PPOC":
            self.model = PPOC(self.checkpoint_config)
        else:
            raise ValueError("Unknown model type")

        # A non-negative device index selects a GPU; otherwise fall back to CPU
        if config.device >= 0:
            self.model.device = config.device
        else:
            self.model.device = "cpu"

        # Load the saved policy weights for the requested episode
        checkpoint = torch.load(
            os.path.join(
                config.model_dir, "checkpoints", "episode_{}".format(config.episode)
            )
        )

        self.model.policy.load_state_dict(checkpoint["policy"])

        self.model.logger.logdir += "evaluate/"
        self.model.logger.episodedir = self.model.logger.logdir + "episodes/"

        os.makedirs(self.model.logger.logdir)
        if self.config.save_episode_data:
            os.makedirs(self.model.logger.episodedir)
Example #5
import numpy as np
import torch

from curiosity import NoCuriosity
from envs import MultiEnv
from models import MLP
from reporters import TensorBoardReporter
from rewards import GeneralizedAdvantageEstimation, GeneralizedRewardEstimation
# The PPO agent class itself is imported from the project's agents module
# (its import line was elided in the original snippet).

if __name__ == '__main__':
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    reporter = TensorBoardReporter()

    agent = PPO(MultiEnv('CartPole-v1', 4, reporter),
                reporter=reporter,
                normalize_state=False,
                normalize_reward=False,
                model_factory=MLP.factory(),
                curiosity_factory=NoCuriosity.factory(),
                reward=GeneralizedRewardEstimation(gamma=0.99, lam=0.95),
                advantage=GeneralizedAdvantageEstimation(gamma=0.99, lam=0.95),
                learning_rate=5e-3,
                clip_range=0.2,
                v_clip_range=0.3,
                c_entropy=1e-2,
                c_value=0.5,
                n_mini_batches=4,
                n_optimization_epochs=5,
                clip_grad_norm=0.5)
    agent.to(device, torch.float32, np.float32)

    agent.learn(epochs=200, n_steps=500)
    agent.eval(n_steps=500, render=True)
Example #6
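# The import block is elided in this snippet; it needs numpy as np, torch,
# and the project's PPO, MultiEnv, MLP, ICM, MlpICMModel, TensorBoardReporter
# and reward/advantage estimators, mirroring Example #5.
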
if __name__ == '__main__':
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    reporter = TensorBoardReporter()

    agent = PPO(MultiEnv('Pendulum-v0', 10, reporter),
                reporter=reporter,
                normalize_state=True,
                normalize_reward=True,
                model_factory=MLP.factory(),
                curiosity_factory=ICM.factory(
                    MlpICMModel.factory(),
                    policy_weight=1,
                    reward_scale=0.01,
                    weight=0.2,
                    intrinsic_reward_integration=0.01,
                    reporter=reporter),
                reward=GeneralizedRewardEstimation(gamma=0.95, lam=0.15),
                advantage=GeneralizedAdvantageEstimation(gamma=0.95, lam=0.15),
                learning_rate=4e-4,
                clip_range=0.3,
                v_clip_range=0.5,
                c_entropy=1e-2,
                c_value=0.5,
                n_mini_batches=32,
                n_optimization_epochs=10,
                clip_grad_norm=0.5)
    agent.to(device, torch.float32, np.float32)

    agent.learn(epochs=30, n_steps=200)
    agent.eval(n_steps=600, render=True)
Example #7
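# Tail of a Creature class plus two sample instances; Creature, the Move*
# actions, vampire_bite, arrow_shot, RandomStrategy, PPO, dungeon_master and
# hayden are defined elsewhere in the surrounding module.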
    def get_action(self, name):  # method signature assumed; the original snippet begins mid-method
        matching_actions = [action for action in self.actions if action.name == name]
        assert len(matching_actions) == 1, "Exactly 1 action must match the given action name"
        return matching_actions[0]

    def initialize(self, combat_handler):
        self.strategy.initialize(creature=self, combat_handler=combat_handler)


# Todo: Move into DB
vampire = Creature(
    player=dungeon_master,
    name="Strahd",
    hit_points=200,
    armor_class=17,
    actions=[MoveLeft(), MoveRight(), MoveUp(), MoveDown(), vampire_bite],
    location=np.array([5, 5]),
    symbol="@",
    strategy=RandomStrategy()
)

leotris = Creature(
    player=hayden,
    name="Leotris",
    hit_points=25,
    armor_class=16,
    actions=[MoveLeft(), MoveRight(), MoveUp(), MoveDown(), arrow_shot],
    location=np.array([5, 10]),
    symbol="x",
    strategy=PPO()
)