def main():
    # Load the config that was pickled alongside the checkpoint.
    with open(args.model_dir + "/config.p", "rb") as f:
        checkpoint_config = pickle.load(f)

    print("{}\n{}\n{}".format("#" * 80, "CHECKPOINT CONFIG", "#" * 80))
    print_config(checkpoint_config)

    # Derive the transfer config from the checkpoint config and show both.
    transfer_config = produce_transfer_config(checkpoint_config)
    print("{}\n{}\n{}".format("*" * 80, "FOR TRANSFER", "*" * 80))
    print_config(transfer_config)
    print("{}\n{}\n{}".format("#" * 80, "CHECKPOINT CONFIG", "#" * 80))

    # Select the training device: a non-negative index means a GPU id.
    if args.device >= 0:
        transfer_config.training.device = args.device
    else:
        transfer_config.training.device = "cpu"

    # Build the agent matching the algorithm recorded in the config.
    if transfer_config.algorithm.name == "PPO":
        agent = PPO(transfer_config)
    elif transfer_config.algorithm.name == "PPOC":
        agent = PPOC(transfer_config)
    else:
        raise ValueError("Unknown model type")

    # Restore the policy weights from the requested checkpoint episode.
    checkpoint = torch.load(
        os.path.join(
            args.model_dir, "checkpoints", "episode_{}".format(args.episode)
        )
    )
    agent.policy.load_state_dict(checkpoint["policy"])

    agent.train()
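# main() reads args.model_dir, args.episode, and args.device from a module-level
# `args` object, so an argument parser along these lines is assumed. The flag
# names mirror those attributes, but the help texts and defaults are illustrative,
# not taken from the source.
import argparse

parser = argparse.ArgumentParser(description="Continue training a saved agent")
parser.add_argument("--model_dir", type=str, required=True,
                    help="Directory containing config.p and checkpoints/")
parser.add_argument("--episode", type=int, required=True,
                    help="Checkpoint episode number to load")
parser.add_argument("--device", type=int, default=-1,
                    help="GPU index; a negative value falls back to CPU")
args = parser.parse_args()

if __name__ == "__main__":
    main()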
def agent_discrete():
    agent = PPO(MultiEnv('CartPole-v1', 4),
                normalize_state=False,
                normalize_reward=False,
                model_factory=MLP.factory(),
                curiosity_factory=NoCuriosity.factory(),
                reward=GeneralizedRewardEstimation(gamma=0.99, lam=0.95),
                advantage=GeneralizedAdvantageEstimation(gamma=0.99, lam=0.95),
                learning_rate=5e-3,
                clip_range=0.2,
                v_clip_range=0.2,
                c_entropy=1e-2,
                c_value=0.5,
                n_mini_batches=4,
                n_optimization_epochs=5,
                clip_grad_norm=0.5)
    agent.to(torch.device('cpu'), torch.float32, np.float32)
    return agent
def agent_continuous():
    agent = PPO(MultiEnv('Pendulum-v0', 10),
                normalize_state=True,
                normalize_reward=True,
                model_factory=MLP.factory(),
                curiosity_factory=ICM.factory(MlpICMModel.factory(),
                                              policy_weight=1,
                                              reward_scale=0.01,
                                              weight=0.2,
                                              intrinsic_reward_integration=0.01),
                # curiosity_factory=NoCuriosity.factory(),
                reward=GeneralizedRewardEstimation(gamma=0.95, lam=0.1),
                advantage=GeneralizedAdvantageEstimation(gamma=0.95, lam=0.1),
                learning_rate=4e-4,
                clip_range=0.3,
                v_clip_range=0.3,
                c_entropy=1e-2,
                c_value=0.5,
                n_mini_batches=32,
                n_optimization_epochs=10,
                clip_grad_norm=0.5)
    agent.to(torch.device('cpu'), torch.float32, np.float32)
    return agent
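# Minimal usage sketch for the two factories above. The learn()/eval() calls use
# the same signatures as the standalone scripts later in this section; the
# epochs/n_steps values here are placeholders, not tuned settings from the source.
agent = agent_discrete()           # or agent_continuous()
agent.learn(epochs=10, n_steps=200)
agent.eval(n_steps=200, render=False)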
def __init__(self, config):
    self.config = config
    self.n_eval_steps = config.n_eval_steps

    # Load the config pickled at training time and overlay the evaluation settings.
    with open(config.model_dir + "/config.p", "rb") as f:
        self.checkpoint_config = pickle.load(f)
    self.checkpoint_config.eval = config
    self.checkpoint_config.training.max_episodes = config.max_episodes
    self.checkpoint_config.training.max_episode_length = config.max_episode_length
    self.checkpoint_config.experiment.render = config.render
    self.checkpoint_config.experiment.save_episode_data = config.save_episode_data
    self.checkpoint_config.experiment.log_interval = 1
    self.checkpoint_config.experiment.num_steps_between_plot = 1
    self.checkpoint_config.experiment.every_n_episodes = 1
    self.checkpoint_config.training.update_every = config.n_eval_steps

    # Build the agent matching the algorithm recorded in the config.
    print(self.checkpoint_config.algorithm.name)
    if self.checkpoint_config.algorithm.name == "PPO":
        self.model = PPO(self.checkpoint_config)
    elif self.checkpoint_config.algorithm.name == "PPOC":
        self.model = PPOC(self.checkpoint_config)
    else:
        raise ValueError("Unknown model type")

    # Select the evaluation device: a non-negative index means a GPU id.
    if config.device >= 0:
        self.model.device = config.device
    else:
        self.model.device = "cpu"

    # Restore the policy weights from the requested checkpoint episode.
    checkpoint = torch.load(
        os.path.join(
            config.model_dir, "checkpoints", "episode_{}".format(config.episode)
        )
    )
    self.model.policy.load_state_dict(checkpoint["policy"])

    # Write evaluation logs (and optionally per-episode data) to a separate directory.
    self.model.logger.logdir += "evaluate/"
    self.model.logger.episodedir = self.model.logger.logdir + "episodes/"
    os.makedirs(self.model.logger.logdir)
    if self.config.save_episode_data:
        os.makedirs(self.model.logger.episodedir)
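# The constructor above only reads a handful of attributes from `config`, so a
# plain namespace like the one below is enough to drive it. Every field name is
# taken from the attribute accesses in the constructor; the concrete values and
# the directory path are illustrative placeholders, not settings from the source.
from types import SimpleNamespace

eval_config = SimpleNamespace(
    model_dir="runs/my_experiment",   # must contain config.p and checkpoints/
    episode=1000,                     # which checkpoint to load
    device=-1,                        # negative -> CPU
    n_eval_steps=500,
    max_episodes=10,
    max_episode_length=500,
    render=True,
    save_episode_data=False,
)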
import numpy as np
import torch

from curiosity import NoCuriosity
from envs import MultiEnv
from models import MLP
from reporters import TensorBoardReporter
from rewards import GeneralizedAdvantageEstimation, GeneralizedRewardEstimation
# PPO is defined elsewhere in this repo; its import is not shown in the source.

if __name__ == '__main__':
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    reporter = TensorBoardReporter()
    agent = PPO(MultiEnv('CartPole-v1', 4, reporter),
                reporter=reporter,
                normalize_state=False,
                normalize_reward=False,
                model_factory=MLP.factory(),
                curiosity_factory=NoCuriosity.factory(),
                reward=GeneralizedRewardEstimation(gamma=0.99, lam=0.95),
                advantage=GeneralizedAdvantageEstimation(gamma=0.99, lam=0.95),
                learning_rate=5e-3,
                clip_range=0.2,
                v_clip_range=0.3,
                c_entropy=1e-2,
                c_value=0.5,
                n_mini_batches=4,
                n_optimization_epochs=5,
                clip_grad_norm=0.5)
    agent.to(device, torch.float32, np.float32)
    agent.learn(epochs=200, n_steps=500)
    agent.eval(n_steps=500, render=True)
if __name__ == '__main__':
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    reporter = TensorBoardReporter()
    agent = PPO(MultiEnv('Pendulum-v0', 10, reporter),
                reporter=reporter,
                normalize_state=True,
                normalize_reward=True,
                model_factory=MLP.factory(),
                curiosity_factory=ICM.factory(MlpICMModel.factory(),
                                              policy_weight=1,
                                              reward_scale=0.01,
                                              weight=0.2,
                                              intrinsic_reward_integration=0.01,
                                              reporter=reporter),
                reward=GeneralizedRewardEstimation(gamma=0.95, lam=0.15),
                advantage=GeneralizedAdvantageEstimation(gamma=0.95, lam=0.15),
                learning_rate=4e-4,
                clip_range=0.3,
                v_clip_range=0.5,
                c_entropy=1e-2,
                c_value=0.5,
                n_mini_batches=32,
                n_optimization_epochs=10,
                clip_grad_norm=0.5)
    agent.to(device, torch.float32, np.float32)
    agent.learn(epochs=30, n_steps=200)
    agent.eval(n_steps=600, render=True)
# Methods of the Creature class (the class header and the start of the first
# method are not shown in the source; the name `get_action` is assumed).
def get_action(self, name):
    # Look up an action on this creature by its name; exactly one must match.
    matching_actions = [action for action in self.actions if action.name == name]
    assert len(matching_actions) == 1, "Exactly 1 action must match the given action name"
    matching_action = matching_actions[0]
    return matching_action

def initialize(self, combat_handler):
    # Hand this creature and the shared combat handler to its strategy.
    self.strategy.initialize(creature=self, combat_handler=combat_handler)


# Todo: Move into DB
vampire = Creature(
    player=dungeon_master,
    name="Strahd",
    hit_points=200,
    armor_class=17,
    actions=[MoveLeft(), MoveRight(), MoveUp(), MoveDown(), vampire_bite],
    location=np.array([5, 5]),
    symbol="@",
    strategy=RandomStrategy()
)

leotris = Creature(
    player=hayden,
    name="Leotris",
    hit_points=25,
    armor_class=16,
    actions=[MoveLeft(), MoveRight(), MoveUp(), MoveDown(), arrow_shot],
    location=np.array([5, 10]),
    symbol="x",
    strategy=PPO()
)
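# A minimal sketch of how these creatures plug into a fight, assuming a
# combat_handler object constructed elsewhere in this codebase; the action
# lookup uses the `get_action` helper name assumed above.
combat_handler = ...  # built by the repo's combat loop; not shown here

# Each creature seeds its strategy with itself and the shared handler.
vampire.initialize(combat_handler=combat_handler)
leotris.initialize(combat_handler=combat_handler)

# Actions can then be fetched by name.
bite = vampire.get_action(vampire_bite.name)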