def test(env_name, episodes, params, render):
    # Create a Gym environment
    env = gym.make(env_name)

    # Get dimensionalities of actions and observations
    action_space_dim = env.action_space.shape[-1]
    observation_space_dim = env.observation_space.shape[-1]

    # Instantiate agent and its policy
    policy = Policy(observation_space_dim, action_space_dim)
    policy.load_state_dict(params)
    agent = Agent(policy)

    test_reward, test_len = 0, 0
    for ep in range(episodes):
        done = False
        observation = env.reset()
        while not done:
            # Similar to the training loop above -
            # get the action, act on the environment, save total reward
            # (evaluation=True makes the agent always return what it thinks to be
            # the best action - there is no exploration at this point)
            action, _ = agent.get_action(observation, evaluation=True)
            observation, reward, done, info = env.step(
                action.detach().cpu().numpy())
            if render:
                env.render()
            test_reward += reward
            test_len += 1
    print("Average test reward:", test_reward / episodes,
          "episode length:", test_len / episodes)
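# A minimal usage sketch for the test() routine above, assuming a saved state_dict
# on disk; the file name, environment name and episode count are hypothetical.
import torch

if __name__ == "__main__":
    saved_params = torch.load("CartPole-v0_params.mdl")  # hypothetical file name
    test("CartPole-v0", episodes=100, params=saved_params, render=False)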
def main(args):
    # Create a Gym environment
    env = gym.make(args.env)

    # Exercise 1
    # TODO: For CartPole-v0 - maximum episode length
    env._max_episode_steps = 1000

    # Get dimensionalities of actions and observations
    action_space_dim = get_space_dim(env.action_space)
    observation_space_dim = get_space_dim(env.observation_space)

    # Instantiate agent and its policy
    policy = Policy(observation_space_dim, action_space_dim)
    agent = Agent(policy)

    # Print some stuff
    print("Environment:", args.env)
    print("Training device:", agent.train_device)
    print("Observation space dimensions:", observation_space_dim)
    print("Action space dimensions:", action_space_dim)

    # If no model was passed, train a policy from scratch.
    # Otherwise load the policy from the file and go directly to testing.
    if args.test is None:
        training_history = train(args.position, agent, env, args.train_episodes,
                                 False, args.render_training)

        # Save the model
        tt = str(datetime.datetime.now().date()) + "-" + str(
            datetime.datetime.now().hour) + "-" + str(
                datetime.datetime.now().minute)
        model_file = "%s_params.mdl" % (args.env + tt + "vel")
        torch.save(policy.state_dict(), model_file)
        print("Model saved to", model_file)

        # Plot rewards
        sns.lineplot(x="episode", y="reward", data=training_history)
        sns.lineplot(x="episode", y="mean_reward", data=training_history)
        plt.legend(["Reward", "100-episode average"])
        plt.title("Reward history (%s)" % args.env)
        # time and day of plot
        plt.savefig("train_history" + tt + "vel" + ".jpg")
        plt.show()
        print("Training finished.")
    else:
        print("Loading model from", args.test, "...")
        state_dict = torch.load(args.test)
        policy.load_state_dict(state_dict)
        print("Testing...")
        test(args.position, agent, env, args.train_episodes, args.render_test)
def main(args):
    # Create a Gym environment
    env = gym.make(args.env)

    # Exercise 1
    env._max_episode_steps = args.episode_length

    # Get dimensionalities of actions and observations
    action_space_dim = get_space_dim(env.action_space)
    observation_space_dim = get_space_dim(env.observation_space)

    # Instantiate agent and its policy
    policy = Policy(observation_space_dim, action_space_dim)
    agent = Agent(policy)

    # Print some stuff
    print("Environment:", args.env)
    print("Training device:", agent.train_device)
    print("Observation space dimensions:", observation_space_dim)
    print("Action space dimensions:", action_space_dim)

    # If no model was passed, train a policy from scratch.
    # Otherwise load the policy from the file and go directly to testing.
    if args.test is None:
        training_history = train(agent, env, args.train_episodes, False,
                                 args.render_training, x0=args.x0, args=args,
                                 policy=policy)

        # Save the model
        model_file = "%s_params.mdl" % args.env
        torch.save(policy.state_dict(), model_file)
        print("Model saved to", model_file)

        # Plot rewards
        sns.lineplot(x="episode", y="reward", data=training_history)
        sns.lineplot(x="episode", y="mean_reward", data=training_history)
        plt.legend(["Reward", "100-episode average"])
        plt.title("Reward history (%s)" % args.env)
        plt.show()
        print("Training finished.")
    else:
        print("Loading model from", args.test, "...")
        state_dict = torch.load(args.test)
        policy.load_state_dict(state_dict)
        print("Testing...")
        test(agent, env, args.train_episodes, args.render_test, x0=args.x0)
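# get_space_dim() is used above but not defined in these snippets; a plausible
# sketch, assuming it returns the flat dimensionality of Discrete and Box spaces
# (hypothetical helper, not taken from the original code).
import numpy as np
from gym import spaces

def get_space_dim(space):
    if isinstance(space, spaces.Discrete):
        return space.n
    elif isinstance(space, spaces.Box):
        return int(np.prod(space.shape))
    else:
        raise NotImplementedError("Unsupported space type: %s" % type(space))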
def trainer(fargs):
    trainer_id, args = fargs
    print("Trainer id", trainer_id, "started")

    # Create a Gym environment
    env = gym.make(args.env)

    # Set maximum episode length
    if args.episode_steps is not None:
        env._max_episode_steps = args.episode_steps

    # Get dimensionalities of actions and observations
    action_space_dim = get_space_dim(env.action_space)
    observation_space_dim = get_space_dim(env.observation_space)

    # Instantiate agent and its policy
    policy = Policy(observation_space_dim, action_space_dim)
    agent = Agent(policy)

    training_history = train(agent, env, args.train_episodes, silent=True,
                             train_run_id=trainer_id, early_stop=False)

    print("Trainer id", trainer_id, "finished")
    return training_history
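# A hedged sketch of how trainer() might be fanned out over several worker
# processes; the pool size and the args object are assumptions, not part of the
# snippet above.
from multiprocessing import Pool

def run_trainers(args, num_trainers=4):
    with Pool(num_trainers) as pool:
        # Each worker receives a (trainer_id, args) tuple, matching trainer()'s input.
        histories = pool.map(trainer, [(i, args) for i in range(num_trainers)])
    return histories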
def main(load_path, num_episode):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    n_env = 1
    env_id = 'Breakout-v0'
    envs = [make_env(env_id) for _ in range(n_env)]
    envs = DummyVecEnv(envs)
    envs = VecToTensor(envs)

    policy = Policy(84, 84, 4, envs.action_space.n).to(device)
    policy.load_state_dict(torch.load(load_path, map_location=device))
    policy.eval()

    for i in tqdm(range(num_episode)):
        obs = envs.reset()
        total_rewards = 0
        while True:
            action_logits, values = policy(obs)
            actions = choose_action(action_logits)
            next_obs, rewards, dones, info = envs.step(actions)
            total_rewards += rewards
            envs.render()
            # Advance to the next observation before acting again
            obs = next_obs
            if dones:
                break
        print('--------------------' + str(total_rewards.item()) +
              '-------------------')

    envs.close()
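# choose_action() is not defined in these snippets; a minimal sketch, assuming it
# samples from a categorical distribution over the policy's action logits.
import torch
from torch.distributions import Categorical

def choose_action(action_logits):
    dist = Categorical(logits=action_logits)
    return dist.sample()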
# handle cli
parser = argparse.ArgumentParser(description="evaluate a policy")
parser.add_argument("policy_dir", type=str)
parser.add_argument("env_name", type=str)
parser.add_argument("--atari", action="store_true")
parser.add_argument("--runs", type=int, default=10)
parser.add_argument("--save", action="store_true")
parser.add_argument("--save-to", type=str, default="example.gif",
                    help="save as gif or mp4")
parser.add_argument("--fps", type=int, default=24)
parser.add_argument("--dpi", type=int, default=72)
parser.add_argument("--repeat", type=int, default=3)
args = parser.parse_args()

# load policy
p = Policy(args.policy_dir)

# load env
env = gym.make(args.env_name)
if args.atari:
    env = AtariWrapper(env)

# evaluate
history, _ = eval(p, env, args.runs)
statistics(history)

if not args.save:
    exit(0)

suffix = args.save_to.split(".")[-1]
if suffix == "gif":
import pong_utils
device = pong_utils.device
print("using device: ", device)

import gym

env = gym.make('PongDeterministic-v4')
print("List of available actions: ", env.unwrapped.get_action_meanings())

# The actions 'RIGHTFIRE' = 4 and 'LEFTFIRE' = 5 make the game restart if done

import matplotlib.pyplot as plt
from agent import Policy

agent = Policy()
agent = agent.to(device)

pong_utils.play(env, agent, time=100)

envs = pong_utils.parallelEnv('PongDeterministic-v4', n=4, seed=12345)
prob, state, action, reward = pong_utils.collect_trajectories(envs, agent,
                                                              tmax=100)
class Trainer:
    def __init__(self):
        # Preparing envs
        self.envs = Envs()
        self.memory = ReplayBuffer()
        self.device = torch.device(settings.device)

        self.policy = Policy().to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=p.lr)

        self.critic = QNetwork().to(self.device)
        self.critic_target = QNetwork().to(self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=p.lr)
        self.parameter_update(tau=1.0)

        if settings.mode == "test":
            self.policy.load_state_dict(
                torch.load("policy_seed_{}".format(settings.seed)))

        self.logger = Logger()

    def start(self):
        self.total_numsteps = 0
        if settings.mode == "train":
            self.add_random_steps()
            names = torch.FloatTensor(
                [i for i, _ in enumerate(settings.env_names)]).to(self.device)
            while self.total_numsteps < p.max_numsteps:
                self.run_test()
                leg_starts, states = self.envs.reset()
                for step in range(p._max_episode_steps):
                    self.total_numsteps += 1
                    actions = self.select_action(leg_starts, states, names)
                    next_states, rewards, dones = self.envs.step(actions)
                    self.memory.push(names, leg_starts, states, next_states,
                                     actions, rewards, dones)
                    states = self.envs.reset_dones(next_states, dones)
                    c1_loss, c2_loss, policy_loss = self.update_nets()
                    if (self.total_numsteps % 10) == 0:
                        self.logger.show_update(self.total_numsteps)
                torch.save(self.policy.state_dict(),
                           "policy_seed_{}".format(settings.seed))
        else:
            print("Seed: {}".format(settings.seed))
            self.run_test()

    def run_test(self):
        if settings.mode == "test":
            print("\nTesting current policy")
        leg_starts, states = self.envs.reset()
        done_filter = torch.FloatTensor(
            [1.0] * len(settings.env_names)).to(self.device)
        epsd_rewards = torch.FloatTensor(
            [0.0] * len(settings.env_names)).to(self.device)
        names = torch.FloatTensor(
            [i for i, _ in enumerate(settings.env_names)]).to(self.device)
        for step in range(p._max_episode_steps):
            actions = self.select_action(leg_starts, states, names,
                                         evaluate=True)
            next_states, rewards, dones = self.envs.step(actions)
            epsd_rewards += done_filter * rewards
            done_filter *= (dones != 1).float()
            states = next_states
        self.logger.add_rewards(len(names), epsd_rewards, self.total_numsteps)
        self.logger.save()

    def add_random_steps(self):
        print("Adding random steps")
        leg_starts, states = self.envs.reset()
        names = torch.FloatTensor(
            [i for i, _ in enumerate(settings.env_names)]).to(self.device)
        while len(self.memory) <= p.batch_size * 10:
            actions = self.envs.sample_actions()
            next_states, rewards, dones = self.envs.step(actions)
            self.memory.push(names, leg_starts, states, next_states, actions,
                             rewards, dones)
            states = self.envs.reset_dones(next_states, dones)

    def select_action(self, leg_starts, states, names, evaluate=False):
        with torch.no_grad():
            if not evaluate:
                actions, _, _ = self.policy.sample(leg_starts, states, names)
            else:
                _, _, actions = self.policy.sample(leg_starts, states, names)
        return actions.cpu()

    def parameter_update(self, tau=p.tau):
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) +
                                    param.data * tau)

    def update_nets(self):
        (names_batch, leg_starts_batch, state_batch, action_batch, reward_batch,
         next_state_batch, mask_batch) = self.memory.sample()
        reward_batch = reward_batch.unsqueeze(1)
        mask_batch = mask_batch.unsqueeze(1)

        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(
                leg_starts_batch, next_state_batch, names_batch)
            qf1_next_target, qf2_next_target = self.critic_target(
                leg_starts_batch, next_state_batch, next_state_action,
                names_batch)
            min_qf_next_target = torch.min(
                qf1_next_target, qf2_next_target) - p.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * p.gamma * min_qf_next_target

        qf1, qf2 = self.critic(leg_starts_batch, state_batch, action_batch,
                               names_batch)
        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)
        qf_loss = qf1_loss + qf2_loss

        self.critic_optim.zero_grad()
        qf_loss.backward()
        self.critic_optim.step()

        pi, log_pi, _ = self.policy.sample(leg_starts_batch, state_batch,
                                           names_batch)
        qf1_pi, qf2_pi = self.critic(leg_starts_batch, state_batch, pi,
                                     names_batch)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)
        policy_loss = ((p.alpha * log_pi) - min_qf_pi).mean()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        self.parameter_update()

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item()
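# A minimal entry-point sketch for the Trainer class above, assuming the settings
# and hyperparameter modules it relies on are importable; not part of the original
# snippet.
if __name__ == "__main__":
    trainer = Trainer()
    trainer.start()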
def train(episodes, player, opponent):
    target_dqn = Policy(observation_space_dim, action_space_dim)
    target_dqn.load_state_dict(policy.state_dict())

    # Stacked preprocessed frames
    stacked_frames = deque(np.zeros((200, 210)), maxlen=4)

    # Updates
    update_counter = 0

    # Memory initialisation:
    # take random actions to fill the memory
    memory = Memory(memory_size, batch_size)
    for i in range(memory_size):
        if i == 0:
            obs = env.reset()
            state, stacked_frames = stack_frame(stacked_frames, obs[0], True)
        action1 = random.randint(0, 3)
        action2 = random.randint(0, 3)
        next_obs, rewards, done, info = env.step((action1, action2))
        next_state, stacked_frames = stack_frame(stacked_frames, next_obs[0])
        memory.store((state, action1, rewards[0], next_state, done))
        state = next_state

    player.reset_score()
    opponent.reset_score()

    '''
    Training
    '''
    for i in range(0, episodes):
        done = False
        obs = env.reset()
        state, stacked_frames = stack_frame(stacked_frames, obs[0], True)
        timesteps = 0
        reward_sum = 0
        while not done:
            action1 = player.get_action(state, epsilon)
            action2 = opponent.get_action()
            next_obs, rewards, done, info = env.step((action1, action2))
            next_state, stacked_frames = stack_frame(stacked_frames, next_obs[0])
            memory.store((state, action1, rewards[0], next_state, done))
            reward_sum += rewards[0]
            obs = next_obs
            state = next_state
            env.render()

            # Updating policy
            # Loading from memory
            samples = memory.sample()
            batch_states = np.asarray([x[0] for x in samples])
            batch_actions = np.asarray([x[1] for x in samples])
            batch_rewards = np.asarray([x[2] for x in samples])
            batch_next_states = np.asarray([x[3] for x in samples])
            batch_done = np.asarray([x[4] for x in samples])

            # Target network
            batch = torch.from_numpy(batch_next_states.squeeze()).float().to(
                player.train_device)
            batch_t_q_values = target_dqn.forward(batch)

            # Q-learning targets
            batch_t_q_max, _ = batch_t_q_values.max(dim=1)
            y = torch.empty(batch_size, 1)
            batch_rewards = torch.from_numpy(batch_rewards).float().to(
                player.train_device)
            for j in range(batch_size):
                if batch_done[j].any():  # .any() ?
                    y[j] = batch_rewards[j]
                else:
                    y[j] = batch_rewards[j] + batch_t_q_max[j].mul(gamma)
            y = y.detach()

            # Gradient descent
            batch_q_values = policy.forward(
                torch.from_numpy(batch_states.squeeze()).float().to(
                    player.train_device))
            loss = torch.mean(y.sub(batch_q_values)**2)
            loss.backward()
            player.update_policy()

            update_counter += 1
            if update_counter % update_step == 0:
                target_dqn.load_state_dict(policy.state_dict())

            timesteps += 1

        epsilon = epsilon * decay
        print("Episode {} finished. Total reward: {:.3g} ({} timesteps)".format(
            i, reward_sum, timesteps))
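# player.get_action(state, epsilon) is not shown in these snippets; a minimal
# epsilon-greedy sketch under that assumption. The q_network handle and n_actions
# parameter are hypothetical.
import random
import numpy as np
import torch

def epsilon_greedy_action(q_network, state, epsilon, n_actions, device):
    if random.random() < epsilon:
        return random.randint(0, n_actions - 1)  # explore: uniform random action
    state_t = torch.from_numpy(np.asarray(state)).float().unsqueeze(0).to(device)
    with torch.no_grad():
        q_values = q_network(state_t)
    return int(q_values.argmax(dim=1).item())  # exploit: greedy action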
def train(env_name, print_things=True, train_run_id=0, train_episodes=5000):
    # Create a Gym environment
    env = gym.make(env_name)

    # Get dimensionalities of actions and observations
    action_space_dim = env.action_space.shape[-1]
    observation_space_dim = env.observation_space.shape[-1]

    # Instantiate agent and its policy
    policy = Policy(observation_space_dim, action_space_dim)
    agent = Agent(policy)

    # Arrays to keep track of rewards
    reward_history, timestep_history = [], []
    average_reward_history = []

    # Run actual training
    for episode_number in range(train_episodes):
        reward_sum, timesteps = 0, 0
        done = False
        # Reset the environment and observe the initial state
        observation = env.reset()

        # Loop until the episode is over
        while not done:
            # Get action from the agent
            action, action_probabilities = agent.get_action(observation)
            previous_observation = observation

            # Perform the action on the environment, get new state and reward
            observation, reward, done, info = env.step(action.detach().numpy())

            # Store action's outcome (so that the agent can improve its policy)
            agent.store_outcome(previous_observation, action_probabilities,
                                action, reward)

            # Store total episode reward
            reward_sum += reward
            timesteps += 1

        if print_things:
            print("Episode {} finished. Total reward: {:.3g} ({} timesteps)".
                  format(episode_number, reward_sum, timesteps))

        # Bookkeeping (mainly for generating plots)
        reward_history.append(reward_sum)
        timestep_history.append(timesteps)
        if episode_number > 100:
            avg = np.mean(reward_history[-100:])
        else:
            avg = np.mean(reward_history)
        average_reward_history.append(avg)

        # Let the agent do its magic (update the policy)
        agent.episode_finished(episode_number)

    # Training is finished - plot rewards
    if print_things:
        plt.plot(reward_history)
        plt.plot(average_reward_history)
        plt.legend(["Reward", "100-episode average"])
        plt.title("Reward history")
        plt.savefig("plots/task-2b.png")
        plt.show()
        print("Training finished.")

    data = pd.DataFrame({
        "episode": np.arange(len(reward_history)),
        "train_run_id": [train_run_id] * len(reward_history),
        # TODO: Change algorithm name for plots, if you want
        "algorithm": ["PG"] * len(reward_history),
        "reward": reward_history
    })
    torch.save(agent.policy.state_dict(),
               "model_%s_%d.mdl" % (env_name, train_run_id))
    return data
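# agent.episode_finished() is not shown in this snippet; a minimal sketch of the
# discounted-return computation a REINFORCE-style update would typically use
# (gamma and the per-step rewards list are assumptions).
import torch

def discount_rewards(rewards, gamma=0.99):
    # Walk backwards through the episode, accumulating the discounted return G_t.
    returns = torch.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns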
lr = 0.001
alpha = 0.99
epsilon = 1e-05

env_id = 'Breakout-v0'
envs = [make_env(env_id) for _ in range(n_env)]
# envs = DummyVecEnv(envs)
# envs = SubprocVecEnv(envs)
envs = ShmemVecEnv(envs)
envs = VecToTensor(envs)

date = datetime.now().strftime('%m_%d_%H_%M')
mon_file_name = "./tmp/" + date
envs = VecMonitor(envs, mon_file_name)

train_policy = Policy(84, 84, 4, envs.action_space.n).to(device)
step_policy = Policy(84, 84, 4, envs.action_space.n).to(device)
step_policy.load_state_dict(train_policy.state_dict())
step_policy.eval()

runner = Runner(envs, step_policy, n_step, gamma)
optimizer = optim.RMSprop(train_policy.parameters(), lr=lr, alpha=alpha,
                          eps=epsilon)

for i in tqdm(range(num_updates)):
    mb_obs, mb_rewards, mb_values, mb_actions = runner.run()
    action_logits, values = train_policy(mb_obs)
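# The rest of the update loop is not shown; a hedged sketch of a typical A2C loss,
# assuming mb_rewards already contain n-step returns (passed here as mb_returns)
# and using hypothetical value/entropy coefficients.
import torch.nn.functional as F
from torch.distributions import Categorical

def a2c_loss(action_logits, values, mb_actions, mb_returns,
             value_coef=0.5, entropy_coef=0.01):
    dist = Categorical(logits=action_logits)
    advantages = mb_returns - values.squeeze(-1).detach()
    policy_loss = -(dist.log_prob(mb_actions) * advantages).mean()
    value_loss = F.mse_loss(values.squeeze(-1), mb_returns)
    entropy = dist.entropy().mean()
    return policy_loss + value_coef * value_loss - entropy_coef * entropy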
Initialisation
'''

env = Pong(headless=args.headless)

# Players
player_id = 1
opponent_id = 3 - player_id
opponent = PongAi(env, opponent_id)

# Model
action_space_dim = 3
observation_space_dim = 4

# Classes
policy = Policy(observation_space_dim, action_space_dim)
player = Agent(env, policy, player_id)

env.set_names(player.get_name(), opponent.get_name())


def train(episodes, player, opponent):
    target_dqn = Policy(observation_space_dim, action_space_dim)
    target_dqn.load_state_dict(policy.state_dict())

    # Stacked preprocessed frames
    stacked_frames = deque(np.zeros((200, 210)), maxlen=4)

    # Updates
    update_counter = 0
def train(env_name, print_things=True, train_run_id=0, train_timesteps=200000,
          update_steps=50):
    # Create a Gym environment
    # This creates 64 parallel envs running in 8 processes (8 envs each)
    env = ParallelEnvs(env_name, processes=8, envs_per_process=8)

    # Get dimensionalities of actions and observations
    action_space_dim = env.action_space.shape[-1]
    observation_space_dim = env.observation_space.shape[-1]

    # Instantiate agent and its policy
    policy = Policy(observation_space_dim, action_space_dim)
    agent = Agent(policy)

    # Arrays to keep track of rewards
    reward_history, timestep_history = [], []
    average_reward_history = []

    # Run actual training
    # Reset the environment and observe the initial state
    observation = env.reset()

    # Loop forever
    for timestep in range(train_timesteps):
        # Get action from the agent
        action, action_probabilities = agent.get_action(observation)
        previous_observation = observation

        # Perform the action on the environment, get new state and reward
        observation, reward, done, info = env.step(action.detach().numpy())

        for i in range(len(info["infos"])):
            env_done = False
            # Check if the environment is finished; if so, store cumulative reward
            for envid, envreward in info["finished"]:
                if envid == i:
                    reward_history.append(envreward)
                    average_reward_history.append(
                        np.mean(reward_history[-500:]))
                    env_done = True
                    break
            # Store action's outcome (so that the agent can improve its policy)
            agent.store_outcome(previous_observation[i], observation[i],
                                action_probabilities[i], reward[i], env_done)

        if timestep % update_steps == update_steps - 1:
            print(f"Update @ step {timestep}")
            agent.update_policy(0)

        plot_freq = 1000
        if timestep % plot_freq == plot_freq - 1:
            # Periodically plot rewards and save the model
            plt.plot(reward_history)
            plt.plot(average_reward_history)
            plt.legend(["Reward", "500-episode average"])
            plt.title("AC reward history (non-episodic, parallel)")
            plt.savefig("rewards_%s.png" % env_name)
            plt.clf()
            torch.save(agent.policy.state_dict(), "model.mdl")
            print("%d: Plot and model saved." % timestep)

    data = pd.DataFrame({
        "episode": np.arange(len(reward_history)),
        "train_run_id": [train_run_id] * len(reward_history),
        # TODO: Change algorithm name for plots, if you want
        "algorithm": ["Nonepisodic parallel AC"] * len(reward_history),
        "reward": reward_history
    })
    torch.save(agent.policy.state_dict(),
               "model_%s_%d.mdl" % (env_name, train_run_id))
    return data
parser.add_argument('--render', action='store_true')
args = parser.parse_args()

env = Pong(headless=args.headless)
player_id = 1
opponent_id = 3 - player_id
action_space = 1
UP_ACTION = 1
DOWN_ACTION = 2
episode_n = 1500

# Policy declaration
policy1 = Policy(args.hidden_layer_size, action_space)
policy2 = Policy(args.hidden_layer_size, action_space)

# Agent declaration
# opponent = Agent(env, policy1, args.learning_rate, args.discount_factor,
#                  opponent_id, 'player 2')
opponent = PongAi(env, opponent_id)
player = Agent(env, policy2, args.learning_rate, args.discount_factor,
               player_id, 'player 1')
player.load_checkpoint('checkpoint-single-2/checkpoints-player-1500.pth')

env.set_names(player.get_name(), opponent.get_name())

# action1 = player.get_action()
# action2 = opponent.get_action()
# (ob1, ob2), (rew1, rew2), done, info = env.step((action1, action2))
import torch.optim as optim
import torch
import gym
from agent import Policy
from collections import deque
import numpy as np

env = gym.make('CartPole-v1')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
policy = Policy().to(device)
optimizer = optim.Adam(policy.parameters(), lr=1e-2)

n_episodes = 2000
max_t = 1000
gamma = 1.0
print_every = 100

scores_deque = deque(maxlen=100)
scores = []
for i_episode in range(1, n_episodes + 1):
    saved_log_probs = []
    rewards = []
    state = env.reset()
    for t in range(max_t):
        action, log_prob = policy.act(state)
        saved_log_probs.append(log_prob)
        state, reward, done, _ = env.step(action)
        rewards.append(reward)
        if done:
            break
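# The snippet stops after collecting an episode; a hedged sketch of the REINFORCE
# update such a loop typically performs once the episode ends, using the
# saved_log_probs and rewards gathered above (nothing here is taken from the
# original file).
def reinforce_update(saved_log_probs, rewards, optimizer, gamma=1.0):
    # Discounted return of the whole episode (gamma == 1.0 reduces to a plain sum).
    discounts = [gamma ** i for i in range(len(rewards))]
    R = sum(d * r for d, r in zip(discounts, rewards))

    # Policy-gradient loss: negative log-probability weighted by the return.
    policy_loss = torch.stack([-log_prob * R for log_prob in saved_log_probs]).sum()

    optimizer.zero_grad()
    policy_loss.backward()
    optimizer.step()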
class SAC:
    def __init__(self,
                 gamma,
                 optimizer,
                 modelName,
                 logDir,
                 lr,
                 TAU,
                 ALPHA=0.2,
                 stepToCkpt=50000):
        self.gamma = gamma
        self.opt = optimizer
        self.modelName = modelName
        self.logDir = logDir
        self.sumDir = os.path.join(self.logDir, self.modelName + "_summary")
        self.ckptDir = os.path.join(self.logDir, self.modelName + "_ckpt")
        self.lr = lr
        self.TAU = TAU
        self.ALPHA = ALPHA

        ### implementation of polyak averaging
        # self.avg = tf.train.ExponentialMovingAverage(decay = self.TAU)
        OPTIMIZER = {
            "sgd": tf.keras.optimizers.SGD(learning_rate=self.lr),
            "Adam": tf.keras.optimizers.Adam(learning_rate=self.lr),
            "rmsProp": tf.keras.optimizers.RMSprop(learning_rate=self.lr),
            "adaGrad": tf.keras.optimizers.Adagrad(learning_rate=self.lr)
        }
        LOSS = {
            "huber": tf.keras.losses.Huber,
            "mse": tf.keras.losses.MSE,
        }

        self.polOpt = OPTIMIZER[self.opt]
        self.qOpt = OPTIMIZER[self.opt]
        self.vOpt = tfa.optimizers.MovingAverage(OPTIMIZER[self.opt],
                                                 average_decay=self.TAU)
        # self.vOpt = OPTIMIZER[optimizer]

        ####### network definition #############
        self.policy = Policy()
        self.QSample2 = QValFn()
        self.QSample1 = QValFn()  # https://spinningup.openai.com/en/latest/algorithms/sac.html
        self.ValueFn = ValFn()

        ############ summary writer #############
        self.summary_writer = tf.compat.v2.summary.create_file_writer(
            os.path.join(self.sumDir, 'logs/'), flush_millis=10000)
        # self.ckpt_writer = tf.compat.v2.summary.create_file_writer(
        #     os.path.join(self.sumDir, 'ckpt/'), flush_millis=10000)
        self.summary_writer.set_as_default()
        self.global_step = tf.compat.v1.train.get_or_create_global_step()

        ##### checkpoint writer ###########
        self.ckpt = tf.train.Checkpoint(policy=self.policy.finalModel,
                                        q1=self.QSample1.finalModel,
                                        q2=self.QSample2.finalModel,
                                        value=self.ValueFn.finalModel,
                                        policyOpt=self.polOpt,
                                        qOpt=self.qOpt,
                                        vOpt=self.vOpt)
        self.pathCkpt = os.path.join(self.sumDir, 'ckpt')
        self.ckptManager = tf.train.CheckpointManager(self.ckpt,
                                                      self.pathCkpt,
                                                      max_to_keep=3)
        self.stepToCkpt = stepToCkpt

        ###### loading the latest checkpoint for the training purpose ##########
        print(self.pathCkpt)
        self.ckpt.restore(self.ckptManager.latest_checkpoint)
        if self.ckptManager.latest_checkpoint:
            print("Restored from {}".format(self.ckptManager.latest_checkpoint))
        else:
            print("Initializing from scratch.")

    def policyLoss(self, currentState):  ## CHECKED
        ## define loss function for the policy
        ## TODO: FORMULATION DOESN'T MATCH THE PAPER
        _, _, action, mean, sqrtStd, gauss, rewardAction = self.policy.samplePolicy(
            currentState, training=True)  ## TODO: CHECK THE ORDER FROM POLICY NETWORK
        logPolicy = tf.stop_gradient(
            self.policy.lgOfPolicy(mean, sqrtStd, gauss))  ## TODO: check implementation here also
        qVal = self.QSample1.QvalForward(currentState, action, training=False)
        policyLossOp = tf.reduce_mean(tf.abs(self.ALPHA * logPolicy - qVal))
        return policyLossOp

    def qValLoss(self, Qnetwork, currentState, action, reward, nextState, DONE):  ## CHECKED
        #### NOTE: this calculation depends on two keys, state and action;
        #### make sure they are consistent (part of the TODO above)
        ## define loss function for the Q value
        # TODO: define data structure for current state, next state and reward
        # UPDATE: added a few changes in the Policy return to be consistent with
        # the value and Q functions
        vValNext = self.ValueFn.ValFnForward(nextState, training=False)
        qVal = Qnetwork.QvalForward(currentState, action,
                                    training=True)  ## stochastic sampling of state
        qTarget = reward + self.gamma * (
            1 - DONE) * vValNext  ## Question: why not use qTarget instead of qVal?
        ## loss is explicitly defined for the Q-based gradient, not for the value function
        loss = tf.reduce_mean(tf.pow((qVal - qTarget), 2))
        return loss

    def vValLoss(self, currentState):  ### CHECKED
        ## define loss function for the value function
        #### https://spinningup.openai.com/en/latest/algorithms/sac.html
        value = self.ValueFn.ValFnForward(currentState, training=True)
        _, _, action, mean, sqrtStd, gauss, rewardAction = self.policy.samplePolicy(
            currentState, training=False)  ## TODO: check the order
        qVal1 = self.QSample1.QvalForward(currentState, action, training=False)
        qVal2 = self.QSample2.QvalForward(currentState, action, training=False)
        qVal = tf.math.minimum(qVal1, qVal2)
        logPolicy = tf.stop_gradient(
            self.policy.lgOfPolicy(mean, sqrtStd, gauss))
        softValue = tf.reduce_sum(qVal - self.ALPHA * logPolicy)
        ## TODO: POLYAK averaging
        return tf.reduce_mean(tf.pow((value - softValue), 2))

    def softUpdate(self, locModel, tagModel):
        """
        Soft-update the model parameters:
        theta_target = tau * theta_local + (1 - tau) * theta_target
        """
        # TODO: check if it is working or not
        for targetParam, localParam in zip(tagModel.trainable_variables,
                                           locModel.trainable_variables):
            print("old :", targetParam)
            targetParam.assign(self.TAU * targetParam +
                               (1 - self.TAU) * localParam)
            print("new :", targetParam)
        return

    def loggingQLoss(self, loss, step):
        tf.summary.experimental.set_step(step)
        tf.compat.v2.summary.scalar('qvalue_loss', tf.math.log(loss))

    def loggingVLoss(self, loss, step):
        tf.summary.experimental.set_step(step)
        tf.compat.v2.summary.scalar('Vvalue_loss', tf.math.log(loss))

    def loggingPLoss(self, loss, step):
        tf.summary.experimental.set_step(step)
        tf.compat.v2.summary.scalar('policy_loss', tf.math.log(loss))

    def loggingReward(self, reward, step):
        tf.summary.experimental.set_step(step)
        tf.compat.v2.summary.scalar('reward', reward)

    def train(self, epState, batchState, batchReward, batchAction,
              batchNextState, DONE):
        # print(self.policy.finalModel.summary())
        # input()
        ## training the model
        ## trick to fit the model on a smaller GPU
        if ((epState + 1) % self.stepToCkpt == 0):
            self.saveModel(epState + 1)

        if (epState % 3 == 0):
            ############# ask GSR: better way of regularization ################
            # with tf.device('/device:GPU:1'):
            with tf.GradientTape() as Ptape:
                lossPolicy = self.policyLoss(batchState)
                # TODO: modify the policy model
            policyGradient = Ptape.gradient(
                lossPolicy, self.policy.finalModel.trainable_variables)
            self.polOpt.apply_gradients(
                zip(policyGradient, self.policy.finalModel.trainable_variables))
            self.loggingPLoss(lossPolicy, epState // 3)
            ### assigning one step for each of the steps in the policy
            return lossPolicy, 0.0, 0.0

        elif (epState % 3 == 1):
            # with tf.device('/device:GPU:1'):
            with tf.GradientTape() as Qtape:
                countQNet = epState // 2
                if countQNet % 2 == 0:
                    Qnetwork = self.QSample1
                    strQ = 1
                else:
                    Qnetwork = self.QSample2
                    strQ = 2
                print("qOpt : ", strQ, countQNet)
                lossQValue = self.qValLoss(Qnetwork, batchState, batchAction,
                                           batchReward, batchNextState, DONE)
            QGradient = Qtape.gradient(
                lossQValue, Qnetwork.finalModel.trainable_variables)
            self.qOpt.apply_gradients(
                zip(QGradient, Qnetwork.finalModel.trainable_variables))
            self.loggingQLoss(lossQValue, epState // 3)
            return 0.0, lossQValue, 0.0

        else:
            print("vVal")
            # with tf.device('/device:CPU:0'):
            with tf.GradientTape() as ValueTape:
                lossVvalue = self.vValLoss(batchState)
            ValGradient = ValueTape.gradient(
                lossVvalue, self.ValueFn.finalModel.trainable_variables)
            self.loggingVLoss(lossVvalue, epState // 3)
            self.vOpt.apply_gradients(
                zip(ValGradient, self.ValueFn.finalModel.trainable_variables))
            return 0.0, 0.0, lossVvalue

    def restoreCkpt(self):
        ## restore the latest checkpoint
        self.ckpt.restore(tf.train.latest_checkpoint(self.pathCkpt))

    def saveModel(self, epState):
        ## save a checkpoint
        save_path = self.ckptManager.save(checkpoint_number=epState + 1)
        print("Saved checkpoint for step {}: {}".format(epState + 1, save_path))
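# A hedged sketch of how the alternating train() schedule above might be driven;
# the replay buffer, its sample() return order and the step count are assumptions,
# not code from the original repository.
def run_training(sac, replay_buffer, total_steps=300000):
    for step in range(total_steps):
        # Hypothetical sample() returning (state, action, reward, next_state, done) batches.
        batchState, batchAction, batchReward, batchNextState, DONE = replay_buffer.sample()
        pLoss, qLoss, vLoss = sac.train(step, batchState, batchReward,
                                        batchAction, batchNextState, DONE)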
import argparse
from agent import Policy, Agent

parser = argparse.ArgumentParser()
parser.add_argument("--headless", action="store_true", help="Run in headless mode")
args = parser.parse_args()

env = Pong(headless=args.headless)

UP_ACTION = 1
DOWN_ACTION = 2

# Policy declaration
policy1 = Policy(500, 1)
policy2 = Policy(500, 1)

# Agent declaration
opponent = Agent(env, policy2, 0.0005, 0.99, 2, 'player 1')
opponent = PongAi(env, 2)
player = Agent(env, policy1, 0.0005, 0.99, 1, 'player 2')

# print(player.policy.state_dict())
player.load_checkpoint('checkpoint-single-1/checkpoints-player-18000.pth')
# player.load_checkpoint('checkpoints-3/checkpoints-player-9500.pth')
# opponent.load_checkpoint('checkpoints-3/checkpoints-opponent-9500.pth')
# player.policy.state_dict()


def plot(observation):
# Load World
# ----------
LEFT = 5
RIGHT = 4

env = gym.make('PongDeterministic-v4')
print("List of available actions: ", env.unwrapped.get_action_meanings())

# Load Agent
# ----------
from agent import Policy
import torch.optim as optim

agent = Policy().to(device)
optimizer = optim.Adam(agent.parameters(), lr=1e-4)

# Load Parallel Environment
# -------------------------
from pong_utils import parallelEnv, preprocess_batch

envs = parallelEnv('PongDeterministic-v4', n=4, seed=12345)


def collect_trajectories(envs, agent, tmax=200, nrand=5):
    '''
    Collect trajectories of multiple agents of a parallelized environment
    '''
    n = len(envs.ps)