def __init__(self, config):
    self.config = config

    # Create session
    self.session = tf.Session(config=tf.ConfigProto(
        gpu_options=tf.GPUOptions(allow_growth=True)))

    # Create networks
    self.prior_network = PolicyNetwork(
        scope=config.prior_network,
        temperature=config.prior_temperature,
        use_symmetry=config.use_symmetry)
    self.rollout_network = PolicyNetwork(
        scope=config.rollout_network,
        temperature=config.rollout_temperature,
        reuse=config.prior_network == config.rollout_network,
        use_symmetry=config.use_symmetry)
    self.value_network = ValueNetwork(
        scope=config.value_network,
        use_symmetry=config.use_symmetry)

    # Load networks from checkpoints
    run_dir = util.run_directory(config)
    util.restore_network_or_fail(self.session, run_dir, self.prior_network)
    util.restore_network_or_fail(self.session, run_dir, self.rollout_network)
    util.restore_network_or_fail(self.session, run_dir, self.value_network)

    # Create queues
    self.prior_queue = AllQueue()
    self.rollout_queue = AllQueue(maxsize=16)
    self.value_queue = AllQueue(maxsize=16)

    self.new_game()
def train():
    tf.reset_default_graph()

    # Create a global step variable
    global_step = tf.Variable(0, name="global_step", trainable=False)

    policy_estimator = PolicyNetwork(input_size=len(env.state),
                                     output_size=env.action_space.n,
                                     summaries_dir=experiment_dir)
    value_estimator = ValueNetwork(input_size=len(env.state), output_size=1)

    # Object-aware Reward Network
    reward_estimator = ObjectAwareRewardNetwork(input_size=len(env.state),
                                                output_size=1,
                                                action_num=env.action_space.n)

    # # Reward Network
    # reward_estimator = RewardNetwork(input_size=len(env.state), output_size=1,
    #                                  action_num=env.action_space.n)

    saver = tf.train.Saver()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        sess.run(tf.initialize_all_variables())
        reinforce(env, policy_estimator, value_estimator, reward_estimator,
                  max_num_episodes, sess, discount_factor=0.99,
                  uniform_sample=False, saver=saver, model_dir=model_dir,
                  figure_dir=figure_dir)
def __init__(self, config):
    self.config = config
    self.run_dir = util.run_directory(config)
    self.position_targets = PositionTargets(config, self.run_dir)

    self.session = tf.Session(config=tf.ConfigProto(
        gpu_options=tf.GPUOptions(allow_growth=True)))

    self.value_network = ValueNetwork('value')
    util.restore_or_initialize_network(self.session, self.run_dir,
                                       self.value_network)

    # Train ops
    self.create_train_op(self.value_network)
    self.writer = tf.summary.FileWriter(self.run_dir)
    util.restore_or_initialize_scope(self.session, self.run_dir,
                                     self.training_scope.name)
class Agent():
    def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], env=None,
                 gamma=0.99, tau=0.005, n_actions=2, max_size=1000000,
                 layer1_size=256, layer2_size=256, batch_size=256,
                 reward_scale=2):
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.scale = reward_scale
        self.memory = ReplayBuffer(max_size, input_dims, n_actions=n_actions)

        self.actor = ActorNetwork(alpha, input_dims, n_actions=n_actions,
                                  max_action=env.action_space.high)
        self.critic1 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                     name='critic1')
        self.critic2 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                     name='critic2')
        self.value = ValueNetwork(beta, input_dims, name='value')
        self.target_value = ValueNetwork(beta, input_dims, name='target')

        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

        self.update_network_params(tau=1)

    def choose_action(self, obs):
        state = torch.tensor([obs], dtype=torch.float32).to(self.device)
        actions, _ = self.actor.sample_normal(state, reparam=False)
        return actions.cpu().detach().numpy()[0]

    def store_trans(self, state, action, reward, new_state, done):
        self.memory.store_trans(state, action, reward, new_state, done)

    def update_network_params(self, tau=None):
        # Soft (Polyak) update of the target value network; tau=1 copies weights.
        if tau is None:
            tau = self.tau

        target_value_params = self.target_value.named_parameters()
        value_params = self.value.named_parameters()

        target_value_state_dict = dict(target_value_params)
        value_state_dict = dict(value_params)

        for name in value_state_dict.keys():
            value_state_dict[name] = tau * value_state_dict[name].clone() + \
                (1 - tau) * target_value_state_dict[name].clone()

        self.target_value.load_state_dict(value_state_dict)

    def save_models(self):
        self.actor.save_checkpoint()
        self.value.save_checkpoint()
        self.target_value.save_checkpoint()
        self.critic1.save_checkpoint()
        self.critic2.save_checkpoint()
        print('saving models')

    def load_models(self):
        self.actor.load_checkpoint()
        self.value.load_checkpoint()
        self.target_value.load_checkpoint()
        self.critic1.load_checkpoint()
        self.critic2.load_checkpoint()
        print('loading models')

    def get_critic_val_log_prob(self, state, reparam):
        # Pass the reparam flag through (the original ignored it).
        actions, log_probs = self.actor.sample_normal(state, reparam=reparam)
        log_probs = log_probs.view(-1)
        q1_new = self.critic1(state, actions)
        q2_new = self.critic2(state, actions)
        critic_value = torch.min(q1_new, q2_new)
        critic_value = critic_value.view(-1)
        return log_probs, critic_value

    def learn(self):
        if self.memory.mem_counter < self.batch_size:
            return

        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        reward = torch.tensor(reward, dtype=torch.float).to(self.actor.device)
        done = torch.tensor(done).to(self.actor.device)
        state_ = torch.tensor(new_state, dtype=torch.float).to(self.actor.device)
        state = torch.tensor(state, dtype=torch.float).to(self.actor.device)
        action = torch.tensor(action, dtype=torch.float).to(self.actor.device)

        value = self.value(state).view(-1)
        value_ = self.target_value(state_).view(-1)
        value_[done] = 0.0

        actions, log_probs = self.actor.sample_normal(state, reparam=False)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic1.forward(state, actions)
        q2_new_policy = self.critic2.forward(state, actions)
        critic_value = torch.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        self.value.optimizer.zero_grad()
        value_target = critic_value - log_probs
        value_loss = 0.5 * F.mse_loss(value, value_target)
        value_loss.backward(retain_graph=True)
        self.value.optimizer.step()

        actions, log_probs = self.actor.sample_normal(state, reparam=True)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic1.forward(state, actions)
        q2_new_policy = self.critic2.forward(state, actions)
        critic_value = torch.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        actor_loss = log_probs - critic_value
        actor_loss = torch.mean(actor_loss)
        self.actor.optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor.optimizer.step()

        self.critic1.optimizer.zero_grad()
        self.critic2.optimizer.zero_grad()
        q_hat = self.scale * reward + self.gamma * value_
        q1_old_policy = self.critic1.forward(state, action).view(-1)
        q2_old_policy = self.critic2.forward(state, action).view(-1)
        critic1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat)
        critic2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat)
        critic_loss = critic1_loss + critic2_loss
        critic_loss.backward()
        self.critic1.optimizer.step()
        self.critic2.optimizer.step()

        self.update_network_params()
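# Hedged usage sketch (not part of the original source): one way the Agent
# above could be driven in a standard gym training loop. The environment name
# and episode count are assumptions chosen to match the class defaults
# (8-dimensional observations, 2 continuous actions) and the old gym step API
# used throughout these snippets.
import gym
import numpy as np

if __name__ == '__main__':
    env = gym.make('LunarLanderContinuous-v2')
    agent = Agent(input_dims=env.observation_space.shape,
                  n_actions=env.action_space.shape[0], env=env)

    best_score = -np.inf
    for episode in range(250):
        obs = env.reset()
        done = False
        score = 0
        while not done:
            action = agent.choose_action(obs)
            obs_, reward, done, _ = env.step(action)
            agent.store_trans(obs, action, reward, obs_, done)
            agent.learn()   # no-op until the buffer holds a full batch
            score += reward
            obs = obs_
        if score > best_score:
            best_score = score
            agent.save_models()
        print(f'episode {episode} score {score:.1f} best {best_score:.1f}')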
def main(): # ENVIROMENT env_name = "CartPole-v1" # env_name = "LunarLander-v2" env = gym.make(env_name) n_actions = env.action_space.n feature_dim = env.observation_space.shape[0] # PARAMETERS learning_rate = 1e-3 state_scale = 1.0 reward_scale = 1.0 clip = 0.2 n_epoch = 4 max_episodes = 10 max_timesteps = 200 batch_size = 32 max_iterations = 200 gamma = 0.99 gae_lambda = 0.95 entropy_coefficient = 0.01 # NETWORK value_model = ValueNetwork(in_dim=feature_dim).to(device) value_optimizer = optim.Adam(value_model.parameters(), lr=learning_rate) policy_model = PolicyNetwork(in_dim=feature_dim, n=n_actions).to(device) policy_optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate) # INIT history = History() observation = env.reset() epoch_ite = 0 episode_ite = 0 train_ite = 0 running_reward = -500 # TENSORBOARD timestr = time.strftime("%d%m%Y-%H%M%S-") log_dir = "./runs/" + timestr + env_name + "-BS" + str(batch_size) + "-E" + \ str(max_episodes) + "-MT" + str(max_timesteps) + "-NE" + str(n_epoch) + \ "-LR" + str(learning_rate) + "-G" + str(gamma) + "-L" + str(gae_lambda) writer = SummaryWriter(log_dir=log_dir) # LOAD MODEL # Create folder models if not Path("./models").exists(): print("Creating Models folder") Path("./models").mkdir() model_path = Path("./models/" + env_name + ".tar") if model_path.exists(): print("Loading model!") #Load model checkpoint = torch.load(model_path) policy_model.load_state_dict(checkpoint['policy_model']) policy_optimizer.load_state_dict(checkpoint['policy_optimizer']) value_model.load_state_dict(checkpoint['value_model']) value_optimizer.load_state_dict(checkpoint['value_optimizer']) running_reward = checkpoint['running_reward'] for ite in tqdm(range(max_iterations), ascii=True): if ite % 5 == 0: torch.save({ 'policy_model': policy_model.state_dict(), 'policy_optimizer': policy_optimizer.state_dict(), 'value_model': value_model.state_dict(), 'value_optimizer': value_optimizer.state_dict(), 'running_reward': running_reward}, model_path) episode_ite, running_reward = collect(episode_ite, running_reward, env, max_episodes, max_timesteps, state_scale, reward_scale, writer, history, policy_model, value_model, gamma, gae_lambda, device) # Here we have collected N trajectories. history.build_dataset() data_loader = DataLoader(history, batch_size=batch_size, shuffle=True, drop_last=True) policy_loss, value_loss, train_ite = train_network(data_loader, policy_model, value_model, policy_optimizer, value_optimizer ,n_epoch, clip, train_ite, writer, entropy_coefficient) for p_l, v_l in zip(policy_loss, value_loss): epoch_ite += 1 writer.add_scalar("Policy Loss", p_l, epoch_ite) writer.add_scalar("Value Loss", v_l, epoch_ite) history.free_memory() # print("\n", running_reward) writer.add_scalar("Running Reward", running_reward, epoch_ite) if (running_reward > env.spec.reward_threshold): print("\nSolved!") break
def main(env_name, lr, state_scale, reward_scale, clip, train_epoch,
         max_episodes, max_timesteps, batch_size, max_iterations, gamma,
         gae_lambda, entropy_coefficient, start_running_reward, update_rate):

    # ENVIRONMENT
    env_name = env_name
    env = ChessEnv()

    # PARAMETERS
    learning_rate = lr
    state_scale = state_scale
    reward_scale = reward_scale
    clip = clip
    n_epoch = train_epoch
    max_episodes = max_episodes
    max_timesteps = max_timesteps
    batch_size = batch_size
    max_iterations = max_iterations
    gamma = gamma
    gae_lambda = gae_lambda
    entropy_coefficient = entropy_coefficient

    # NETWORK
    value_model = ValueNetwork().to(device)
    value_optimizer = optim.Adam(value_model.parameters(), lr=learning_rate)
    policy_model = PolicyNetwork().to(device)
    policy_optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate)

    # INIT
    history = History()
    epoch_ite = 0
    episode_ite = 0
    train_ite = 0
    running_reward = start_running_reward

    # TENSORBOARD
    timestr = time.strftime("%d%m%Y-%H%M%S-")
    log_dir = "./runs/" + timestr + env_name + "-BS" + str(batch_size) + "-E" + \
        str(max_episodes) + "-MT" + str(max_timesteps) + "-NE" + str(n_epoch) + \
        "-LR" + str(learning_rate) + "-G" + str(gamma) + "-L" + str(gae_lambda)
    writer = SummaryWriter(log_dir=log_dir)

    # LOAD MODEL
    # Create models folder
    if not Path("./models").exists():
        print("Creating Models folder")
        Path("./models").mkdir()

    model_path = Path("./models/" + env_name + ".tar")
    if model_path.exists():
        print("Loading model!")
        # Load model
        checkpoint = torch.load(model_path)
        policy_model.load_state_dict(checkpoint['policy_model'])
        policy_optimizer.load_state_dict(checkpoint['policy_optimizer'])
        value_model.load_state_dict(checkpoint['value_model'])
        value_optimizer.load_state_dict(checkpoint['value_optimizer'])
        running_reward = checkpoint['running_reward']

    # Create SavedEnvs queue
    SavedEnv = queue.SimpleQueue()
    for _ in range(max_episodes):
        env = ChessEnv()
        SavedEnv.put((env, env.reset(), 0))

    # START ITERATING
    for ite in tqdm(range(max_iterations), ascii=True):

        # Refresh the rival policy every update_rate iterations
        if ite % update_rate == 0:
            print("\nUpdating")
            rival_policy = PolicyNetwork().to(device)
            rival_policy.load_state_dict(policy_model.state_dict())

        if ite % 5 == 0:
            torch.save(
                {
                    'policy_model': policy_model.state_dict(),
                    'policy_optimizer': policy_optimizer.state_dict(),
                    'value_model': value_model.state_dict(),
                    'value_optimizer': value_optimizer.state_dict(),
                    'running_reward': running_reward
                }, model_path)

        print("\nSimulating")
        start_simulation = time.perf_counter()

        q = queue.SimpleQueue()

        env_list = []
        while not SavedEnv.empty():
            env_list.append(SavedEnv.get())

        threads = []
        for saved_env in env_list:
            t = threading.Thread(target=collect,
                                 args=[
                                     q, env_name, saved_env, SavedEnv,
                                     max_timesteps, state_scale, reward_scale,
                                     policy_model, value_model, gamma,
                                     gae_lambda, device, rival_policy
                                 ])
            t.start()
            threads.append(t)

        for t in threads:
            t.join()

        # for saved_env in env_list:
        #     if ite % 20 == 0:
        #         update_policy = True
        #     else:
        #         update_policy = False
        #     collect(q, env_name, saved_env,
        #             SavedEnv, max_timesteps, state_scale, reward_scale,
        #             policy_model, value_model, gamma,
        #             gae_lambda, device, update_policy)

        avg_episode_reward = []
        # Write all episodes from queue to history buffer
        while not q.empty():
            episode, done = q.get()
            history.episodes.append(episode)
            avg_episode_reward.append((episode.reward, done))

        end_simulation = time.perf_counter()
        print(f"Simulation time: {end_simulation - start_simulation:.2f}")

        for ep_reward, done in avg_episode_reward:
            if done:
                running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
                writer.add_scalar("Average Episode Reward", ep_reward,
                                  episode_ite)
                episode_ite += 1

        # avg_ep_reward = sum(avg_episode_reward) / len(avg_episode_reward)

        # Here we have collected N trajectories and prepare the dataset
        history.build_dataset()
        data_loader = DataLoader(history, batch_size=batch_size, shuffle=True,
                                 drop_last=True)

        print("Training")
        policy_loss, value_loss, train_ite = train_network(
            data_loader, policy_model, value_model, policy_optimizer,
            value_optimizer, n_epoch, clip, train_ite, writer,
            entropy_coefficient)

        end_training = time.perf_counter()
        print(f"Training time: {end_training - end_simulation:.2f}")

        for p_l, v_l in zip(policy_loss, value_loss):
            epoch_ite += 1
            writer.add_scalar("Policy Loss", p_l, epoch_ite)
            writer.add_scalar("Value Loss", v_l, epoch_ite)

        history.free_memory()

        # print("\n", running_reward)
        writer.add_scalar("Running Reward", running_reward, epoch_ite)

        if running_reward > 0:
            print("\nSolved!")
            break
def main(): # ENVIROMENT # env_name = "CartPole-v1" # env_name = "LunarLander-v2" # env_name = "Acrobot-v1" env_name = "MountainCar-v0" env = gym.make(env_name) n_actions = env.action_space.n feature_dim = env.observation_space.shape[0] # PARAMETERS learning_rate = 1e-3 state_scale = 1.0 reward_scale = 1.0 clip = 0.2 n_epoch = 4 max_episodes = 10 max_timesteps = 100 batch_size = 32 max_iterations = 1000 gamma = 0.99 gae_lambda = 0.95 entropy_coefficient = 0.01 env_threshold = env.spec.reward_threshold # NETWORK value_model = ValueNetwork(in_dim=feature_dim).to(device) value_optimizer = optim.Adam(value_model.parameters(), lr=learning_rate) policy_model = PolicyNetwork(in_dim=feature_dim, n=n_actions).to(device) policy_optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate) # INIT history = History() observation = env.reset() epoch_ite = 0 episode_ite = 0 train_ite = 0 running_reward = -500 # TENSORBOARD timestr = time.strftime("%d%m%Y-%H%M%S-") log_dir = "./runs/" + timestr + env_name + "-BS" + str(batch_size) + "-E" + \ str(max_episodes) + "-MT" + str(max_timesteps) + "-NE" + str(n_epoch) + \ "-LR" + str(learning_rate) + "-G" + str(gamma) + "-L" + str(gae_lambda) writer = SummaryWriter(log_dir=log_dir) # LOAD MODEL # Create folder models if not Path("./models").exists(): print("Creating Models folder") Path("./models").mkdir() model_path = Path("./models/" + env_name + ".tar") if model_path.exists(): print("Loading model!") #Load model checkpoint = torch.load(model_path) policy_model.load_state_dict(checkpoint['policy_model']) policy_optimizer.load_state_dict(checkpoint['policy_optimizer']) value_model.load_state_dict(checkpoint['value_model']) value_optimizer.load_state_dict(checkpoint['value_optimizer']) running_reward = checkpoint['running_reward'] EnvQueue = queue.SimpleQueue() for _ in range(max_episodes): env = gym.make(env_name) observation = env.reset() EnvQueue.put((env, observation, 0)) for ite in tqdm(range(max_iterations), ascii=True): if ite % 5 == 0: torch.save( { 'policy_model': policy_model.state_dict(), 'policy_optimizer': policy_optimizer.state_dict(), 'value_model': value_model.state_dict(), 'value_optimizer': value_optimizer.state_dict(), 'running_reward': running_reward }, model_path) q = queue.SimpleQueue() env_list = [] while not EnvQueue.empty(): env_list.append(EnvQueue.get()) threads = [] for env in env_list: t = threading.Thread(target=collect, args=[ q, env_name, env, EnvQueue, max_timesteps, state_scale, reward_scale, policy_model, value_model, gamma, gae_lambda, device ]) t.start() threads.append(t) for t in threads: t.join() avg_episode_reward = [] # Write all episodes from queue to history buffer while not q.empty(): episode, done = q.get() history.episodes.append(episode) avg_episode_reward.append((episode.reward, done)) for ep_reward, done in avg_episode_reward: if done: running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward writer.add_scalar("Running Reward", running_reward, episode_ite) writer.add_scalar("Episode Reward", ep_reward, episode_ite) episode_ite += 1 # avg_ep_reward = sum(avg_episode_reward) / len(avg_episode_reward) # Here we have collected N trajectories and prepare dataset history.build_dataset() data_loader = DataLoader(history, batch_size=batch_size, shuffle=True, drop_last=True) policy_loss, value_loss, train_ite = train_network( data_loader, policy_model, value_model, policy_optimizer, value_optimizer, n_epoch, clip, train_ite, writer, entropy_coefficient) for p_l, v_l in zip(policy_loss, value_loss): epoch_ite += 
1 writer.add_scalar("Policy Loss", p_l, epoch_ite) writer.add_scalar("Value Loss", v_l, epoch_ite) history.free_memory() # print("\n", running_reward) if (running_reward > env_threshold): print("\nSolved!") break
class SAC_Agent:
    def __init__(self, load_from=None, will_train=True):
        self.env = TorcsEnv(
            path='/usr/local/share/games/torcs/config/raceman/quickrace.xml')
        self.args = SAC_args()
        self.buffer = ReplayBuffer(self.args.buffer_size)

        action_dim = self.env.action_space.shape[0]
        state_dim = self.env.observation_space.shape[0]
        hidden_dim = 256

        self.action_size = action_dim
        self.state_size = state_dim

        self.value_net = ValueNetwork(state_dim, hidden_dim).to(self.args.device)
        self.target_value_net = ValueNetwork(state_dim, hidden_dim).to(self.args.device)
        self.soft_q_net1 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(self.args.device)
        self.soft_q_net2 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(self.args.device)
        self.policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(self.args.device)

        self.target_value_net.load_state_dict(self.value_net.state_dict())

        self.value_criterion = nn.MSELoss()
        self.soft_q_loss1 = nn.MSELoss()
        self.soft_q_loss2 = nn.MSELoss()

        self.value_opt = optim.Adam(self.value_net.parameters(), lr=self.args.lr)
        self.soft_q_opt1 = optim.Adam(self.soft_q_net1.parameters(), lr=self.args.lr)
        self.soft_q_opt2 = optim.Adam(self.soft_q_net2.parameters(), lr=self.args.lr)
        self.policy_opt = optim.Adam(self.policy_net.parameters(), lr=self.args.lr)

        if will_train:
            current_time = time.strftime('%d-%b-%y-%H.%M.%S', time.localtime())
            self.plot_folder = f'plots/{current_time}'
            self.model_save_folder = f'model/{current_time}'
            make_sure_dir_exists(self.plot_folder)
            make_sure_dir_exists(self.model_save_folder)
            self.cp = Checkpoint(self.model_save_folder)

        if load_from is not None:
            try:
                self.load_checkpoint(load_from)
            except FileNotFoundError:
                print(f'{load_from} not found. Running default.')
        else:
            print('Starting from scratch.')

    def train(self):
        remove_log_file()
        clear_action_logs()

        eps_n = 0
        rewards = []
        test_rewards = []
        best_reward = -np.inf
        info = None

        for eps_n in range(1, self.args.max_eps + 1):
            # Train loop
            self.set_mode('train')
            relaunch = (eps_n - 1) % (20 / self.args.test_rate) == 0
            state = self.env.reset(relaunch=relaunch, render=False,
                                   sampletrack=False)
            eps_r = 0

            sigma = (self.args.start_sigma - self.args.end_sigma) * (max(
                0, 1 - (eps_n - 1) / self.args.max_eps)) + self.args.end_sigma
            randomprocess = OrnsteinUhlenbeckProcess(self.args.theta, sigma,
                                                     self.action_size)

            for step in range(self.args.max_eps_time):
                # Episode
                action = self.policy_net.get_train_action(state, randomprocess)
                next_state, reward, done, info = self.env.step(action)
                self.buffer.push(state, action, reward, next_state, done)
                state = next_state
                eps_r += reward

                if len(self.buffer) > self.args.batch_size:
                    self.update()

                if done:
                    break

            rewards.append(eps_r)

            test_reward = self.test(eps_n)
            test_rewards.append(test_reward)

            if test_reward > best_reward:
                best_reward = test_reward
                self.save_checkpoint(eps_n, best_reward)

            info_str = ', '.join(
                [key for key in info.keys() if key != 'place'])
            info_str += f", {info['place']}. place"
            log(f'Episode {eps_n:<4} Reward: {eps_r:>7.2f} Test Reward: {test_reward:>7.2f} Info: {info_str}')

            if eps_n % self.args.plot_per == 0:
                self.plot(rewards, test_rewards, eps_n)

    def update(self):
        state, action, reward, next_state, done = self.buffer.sample(
            self.args.batch_size)

        state = FloatTensor(state).to(self.args.device)
        next_state = FloatTensor(next_state).to(self.args.device)
        action = FloatTensor(action).to(self.args.device)
        reward = FloatTensor(reward).unsqueeze(1).to(self.args.device)
        done = FloatTensor(np.float32(done)).unsqueeze(1).to(self.args.device)

        predicted_q_value1 = self.soft_q_net1(state, action)
        predicted_q_value2 = self.soft_q_net2(state, action)
        predicted_value = self.value_net(state)
        new_action, log_prob, epsilon, mean, log_std = self.policy_net.evaluate(
            state)

        # Training Q function
        target_value = self.target_value_net(next_state)
        target_q_value = reward + (1 - done) * self.args.gamma * target_value
        q_value_loss1 = self.soft_q_loss1(predicted_q_value1,
                                          target_q_value.detach())
        q_value_loss2 = self.soft_q_loss2(predicted_q_value2,
                                          target_q_value.detach())

        self.soft_q_opt1.zero_grad()
        q_value_loss1.backward()
        if self.args.clipgrad:
            self.clip_grad(self.soft_q_net1.parameters())
        self.soft_q_opt1.step()

        self.soft_q_opt2.zero_grad()
        q_value_loss2.backward()
        if self.args.clipgrad:
            self.clip_grad(self.soft_q_net2.parameters())
        self.soft_q_opt2.step()

        # Training Value function
        predicted_new_q_value = torch.min(self.soft_q_net1(state, new_action),
                                          self.soft_q_net2(state, new_action))
        target_value_func = predicted_new_q_value - self.args.alpha * log_prob.sum()
        value_loss = self.value_criterion(predicted_value,
                                          target_value_func.detach())

        self.value_opt.zero_grad()
        value_loss.backward()
        if self.args.clipgrad:
            self.clip_grad(self.value_net.parameters())
        self.value_opt.step()

        # Training Policy function
        policy_loss = (log_prob - predicted_new_q_value).mean()

        self.policy_opt.zero_grad()
        policy_loss.backward()
        if self.args.clipgrad:
            self.clip_grad(self.policy_net.parameters())
        self.policy_opt.step()

        # Updating target value network
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(target_param.data *
                                    (1.0 - self.args.soft_tau) +
                                    param.data * self.args.soft_tau)

    def test(self, eps_n):
        self.set_mode('eval')
        rewards = []
        for step in range(self.args.test_rate):
            render = (eps_n % 30 == 0) and (step == 0)
            relaunch = render or ((eps_n % 30 == 0) and (step == 1))
            state = self.env.reset(relaunch=relaunch, render=render,
                                   sampletrack=False)
            running_reward = 0
            for t in range(self.args.max_eps_time):
                action = self.policy_net.get_test_action(state)
                state, reward, done, info = self.env.step(action)
                store(action, eps_n, reward, info, t == 0)
                running_reward += reward
                if done:
                    break
            rewards.append(running_reward)
        avg_reward = sum(rewards) / self.args.test_rate
        return avg_reward

    def plot(self, rewards, test_rewards, eps_n):
        torch.save({
            'train_rewards': rewards,
            'test_rewards': test_rewards
        }, f'{self.plot_folder}/{eps_n}.pth')

        figure = plt.figure()
        plt.plot(rewards, label='Train Rewards')
        plt.plot(test_rewards, label='Test Rewards')
        plt.xlabel('Episode')
        plt.legend()
        plt.savefig(f'{self.plot_folder}/{eps_n}.png')

        try:
            send_mail(f'Improved Torcs SAC | Episode {eps_n}',
                      f'{self.plot_folder}/{eps_n}.png')
            log('Mail has been sent.')
        except (KeyboardInterrupt, SystemExit):
            print('KeyboardInterrupt or SystemExit')
            raise
        except Exception as e:
            print('Mail Exception occured:', e)
            emsg = e.args[-1]
            emsg = emsg[:1].lower() + emsg[1:]
            log('Couldn\'t send mail because', emsg)

    def clip_grad(self, parameters):
        for param in parameters:
            param.grad.data.clamp_(-1, 1)

    def set_mode(self, mode):
        if mode == 'train':
            self.value_net.train()
            self.target_value_net.train()
            self.soft_q_net1.train()
            self.soft_q_net2.train()
            self.policy_net.train()
        elif mode == 'eval':
            self.value_net.eval()
            self.target_value_net.eval()
            self.soft_q_net1.eval()
            self.soft_q_net2.eval()
            self.policy_net.eval()
        else:
            raise ValueError('mode should be either train or eval')

    def save_checkpoint(self, eps_n, test_reward):
        self.cp.update(self.value_net, self.soft_q_net1, self.soft_q_net2,
                       self.policy_net)
        self.cp.save(f'e{eps_n}-r{test_reward:.4f}.pth')
        log(f'Saved checkpoint at episode {eps_n}.')

    def load_checkpoint(self, load_from):
        state_dicts = torch.load(load_from)
        self.value_net.load_state_dict(state_dicts['best_value'])
        self.soft_q_net1.load_state_dict(state_dicts['best_q1'])
        self.soft_q_net2.load_state_dict(state_dicts['best_q2'])
        self.policy_net.load_state_dict(state_dicts['best_policy'])
        print(f'Loaded from {load_from}.')

    def race(self, sampletrack=True):
        with torch.no_grad():
            state = self.env.reset(relaunch=True, render=True,
                                   sampletrack=sampletrack)
            running_reward = 0
            done = False
            while not done:
                action = self.policy_net.get_test_action(state)
                state, reward, done, info = self.env.step(action)
                running_reward += reward
        print('Reward:', running_reward)
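# Hedged usage sketch (not in the original source): a minimal driver for the
# SAC_Agent above, assuming it is run as a script after TORCS is installed.
if __name__ == '__main__':
    agent = SAC_Agent(load_from=None, will_train=True)
    agent.train()    # train on TORCS, checkpointing the best test reward
    # agent.race()   # or load a checkpoint first and drive a single episode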
class Agent():
    def __init__(self, alpha, beta, input_dims, tau, env, env_id,
                 gamma=0.99, n_actions=2, max_size=1000000, layer_1_size=256,
                 layer_2_size=256, batch_size=100, reward_scale=2):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.scale = reward_scale

        self.actor = ActorNetwork(alpha, input_dims, layer_1_size,
                                  layer_2_size, n_actions=n_actions,
                                  name=env_id + '_actor',
                                  max_action=env.action_space.high)
        self.critic_1 = CriticNetwork(beta, input_dims, layer_1_size,
                                      layer_2_size, n_actions=n_actions,
                                      name=env_id + '_critic_1')
        self.critic_2 = CriticNetwork(beta, input_dims, layer_1_size,
                                      layer_2_size, n_actions=n_actions,
                                      name=env_id + '_critic_2')
        self.value = ValueNetwork(beta, input_dims, layer_1_size,
                                  layer_2_size, name=env_id + '_value')
        self.target_value = ValueNetwork(beta, input_dims, layer_1_size,
                                         layer_2_size,
                                         name=env_id + '_target_value')

        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        state = T.tensor([observation], dtype=T.float).to(self.actor.device)
        actions, _ = self.actor.sample_normal(state, reparameterize=False)
        return actions.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, state_, done):
        self.memory.store_transitions(state, action, reward, state_, done)

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        value_params = self.value.named_parameters()
        target_value_params = self.target_value.named_parameters()

        value_state_dict = dict(value_params)
        target_value_state_dict = dict(target_value_params)

        for name in value_state_dict:
            value_state_dict[name] = tau * value_state_dict[name].clone() + \
                (1 - tau) * target_value_state_dict[name].clone()

        self.target_value.load_state_dict(value_state_dict)

    def save_models(self):
        print('.... saving models ....')
        self.actor.save_checkpoint()
        self.value.save_checkpoint()
        self.target_value.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()

    def load_models(self):
        print('.... loading models ....')
        self.actor.load_checkpoint()
        self.value.load_checkpoint()
        self.target_value.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        state, action, reward, state_, done = \
            self.memory.sample_buffer(self.batch_size)

        state = T.tensor(state, dtype=T.float).to(self.critic_1.device)
        state_ = T.tensor(state_, dtype=T.float).to(self.critic_1.device)
        action = T.tensor(action, dtype=T.float).to(self.critic_1.device)
        reward = T.tensor(reward, dtype=T.float).to(self.critic_1.device)
        done = T.tensor(done).to(self.critic_1.device)

        value = self.value(state).view(-1)
        value_ = self.target_value(state_).view(-1)
        value_[done] = 0.0

        actions, log_probs = self.actor.sample_normal(state,
                                                      reparameterize=False)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1(state, actions)
        q2_new_policy = self.critic_2(state, actions)
        critic_value = T.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        self.value.optimizer.zero_grad()
        value_target = critic_value - log_probs
        value_loss = 0.5 * F.mse_loss(value, value_target)
        value_loss.backward(retain_graph=True)
        self.value.optimizer.step()

        actions, log_probs = self.actor.sample_normal(state,
                                                      reparameterize=True)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1(state, actions)
        q2_new_policy = self.critic_2(state, actions)
        critic_value = T.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        actor_loss = log_probs - critic_value
        actor_loss = T.mean(actor_loss)
        self.actor.optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor.optimizer.step()

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()
        q_hat = self.scale * reward + self.gamma * value_
        q1_old_policy = self.critic_1(state, action).view(-1)
        q2_old_policy = self.critic_2(state, action).view(-1)
        critic_1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat)
        critic_2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat)
        critic_loss = critic_1_loss + critic_2_loss
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        self.update_network_parameters()
numrun = 1

for run in range(numrun):
    env = make_env()
    in_size = env.observation_space.shape[0]
    num_actions = env.action_space.n

    network = network_factory(in_size, num_actions, env)
    network.to(device)
    pe = PolicyNetwork(network)
    # Load policy to test
    # pe.network.load_state_dict(torch.load('saved_network_50000_baseline.pkl'))
    ve = ValueNetwork(in_size)

    ep_returns = reinforce(env, pe, ve, episodes)  # , ve, loss_policy, loss_value

    # fwrite = open('runs_data/' + str(run) + '.pkl', 'wb')
    # fwrite = open('runs_data/0.pkl', 'wb')
    # pickle.dump(ep_returns, fwrite)
    # fwrite.close()

    window = 100
    plt.figure(figsize=(12, 8))
    plt.plot(sliding_window(ep_returns, window))
    plt.title("Episode Return")
    plt.xlabel("Episode")
    plt.ylabel("Average Return (Sliding Window 100)")
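# sliding_window() is called above but not defined in this snippet. A minimal
# sketch of what the plot appears to expect (a trailing-window average of the
# episode returns); this helper is an assumption, not the original definition.
import numpy as np

def sliding_window(values, window):
    """Return the mean of the last `window` entries at each index."""
    values = np.asarray(values, dtype=np.float64)
    return np.array([values[max(0, i - window + 1):i + 1].mean()
                     for i in range(len(values))])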
class Agent():
    def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], env=None,
                 gamma=0.99, n_actions=2, max_size=1000000, layer1_size=256,
                 layer2_size=256, tau=0.005, batch_size=256, reward_scale=2):
        # the reward scale depends on the action convention for the environment
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        # set up networks
        self.actor = ActorNetwork(alpha, input_dims,
                                  max_action=env.action_space.high,
                                  n_actions=n_actions, name='actor')
        self.critic1 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                     name='critic_1')
        self.critic2 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                     name='critic_2')
        self.value = ValueNetwork(beta, input_dims, name='value')
        # target value
        self.target_value = ValueNetwork(beta, input_dims, name='target_value')

        self.scale = reward_scale
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        # here we turn the observation into a tensor
        state = T.tensor([observation]).to(self.actor.device).float()
        # print(type(state))
        actions, _ = self.actor.sample_normal(state, reparameterize=False)
        return actions.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        target_value_params = self.target_value.named_parameters()
        value_params = self.value.named_parameters()

        target_value_state_dict = dict(target_value_params)
        value_state_dict = dict(value_params)

        for name in value_state_dict:
            value_state_dict[name] = tau * value_state_dict[name].clone() + (
                1 - tau) * target_value_state_dict[name].clone()

        self.target_value.load_state_dict(value_state_dict)

    def save_models(self):
        print("saving models:")
        self.actor.save_checkpoint()
        self.value.save_checkpoint()
        self.target_value.save_checkpoint()
        self.critic1.save_checkpoint()
        self.critic2.save_checkpoint()

    def load_models(self):
        print("loading models:")
        self.actor.load_checkpoint()
        self.value.load_checkpoint()
        self.target_value.load_checkpoint()
        self.critic1.load_checkpoint()
        self.critic2.load_checkpoint()

    def learn(self):
        # wait until the buffer holds at least one full batch before learning
        if self.memory.mem_cntr < self.batch_size:
            return

        state, action, reward, new_state, done = self.memory.sample_buffer(
            self.batch_size)

        reward = T.tensor(reward, dtype=T.float).to(self.actor.device)
        done = T.tensor(done).to(self.actor.device)
        state_ = T.tensor(new_state, dtype=T.float).to(self.actor.device)
        state = T.tensor(state, dtype=T.float).to(self.actor.device)
        action = T.tensor(action, dtype=T.float).to(self.actor.device)

        value = self.value(state).view(-1)
        value_ = self.target_value(state_).view(-1)
        value_[done] = 0.0

        actions, log_probs = self.actor.sample_normal(state,
                                                      reparameterize=False)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic1.forward(state, actions)
        q2_new_policy = self.critic2.forward(state, actions)
        critic_value = T.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        self.value.optimizer.zero_grad()
        value_target = critic_value - log_probs
        value_loss = 0.5 * F.mse_loss(value, value_target)
        value_loss.backward(retain_graph=True)
        self.value.optimizer.step()

        actions, log_probs = self.actor.sample_normal(state,
                                                      reparameterize=True)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic1.forward(state, actions)
        q2_new_policy = self.critic2.forward(state, actions)
        critic_value = T.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        actor_loss = log_probs - critic_value
        actor_loss = T.mean(actor_loss)
        self.actor.optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor.optimizer.step()

        self.critic1.optimizer.zero_grad()
        self.critic2.optimizer.zero_grad()
        q_hat = self.scale * reward + self.gamma * value_
        q1_old_policy = self.critic1.forward(state, action).view(-1)
        q2_old_policy = self.critic2.forward(state, action).view(-1)
        critic_1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat)
        critic_2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat)
        critic_loss = critic_1_loss + critic_2_loss
        critic_loss.backward()
        self.critic1.optimizer.step()
        self.critic2.optimizer.step()

        self.update_network_parameters()