def test_policy():
    tf.reset_default_graph()
    tf.set_random_seed(0)
    policy = Policy('global',
                    policy_spec={
                        "input size": 2,
                        "hidden layer size": 2,
                        "number of actions": 2
                    })
    print("Policy Tests: ")
    print("-------------------------------------------------")
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        results = sess.run(
            [policy.action, policy.policy_fn, policy.value_fn],
            feed_dict={
                policy.input: np.array([[1, 2]]),
                policy.exploration_rate: 1,
            })
    with shelve.open(os.path.join(os.path.dirname(__file__), 'data/network_tests')) as db:
        if 'policy' not in db:
            print(results)
            db['policy'] = results
        elif not np.array([np.any(r == t) for r, t in zip(results, db['policy'])]).all():
            print(results)
            print(db['policy'])
            if input("test_policy: Results didn't match. Update results? ") == "yes":
                db['policy'] = results
            else:
                print("test_policy: Test failed!")
                exit()
        else:
            print("test_policy: Test passed!")
    print()
class PGagent():
    def __init__(self, state_dim, action_dim, agentParam):
        self.state_dim = state_dim  # e.g. 400 == env.observation_space.shape[0]
        self.action_dim = action_dim  # e.g. 8 == env.action_space.n
        self.gamma = agentParam["gamma"]
        # buffers for the N Monte Carlo transitions collected in one game
        self.saved_log_probs = []
        self.use_cuda = torch.cuda.is_available()
        self.FloatTensor = torch.cuda.FloatTensor if self.use_cuda else torch.FloatTensor
        self.rewards = []
        self.device = agentParam["device"]
        # init network parameters
        if agentParam["ifload"]:
            self.policy = torch.load(
                agentParam["filename"] + "pg" + agentParam["id"] + ".pth",
                map_location=torch.device('cuda'))
        else:
            self.policy = Policy(state_dim=self.state_dim,
                                 action_dim=self.action_dim).to(self.device)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=agentParam["LR"])
        self.eps = np.finfo(np.float32).eps.item()
        # init some parameters
        self.time_step = 0

    def select_action(self, state):
        # sample an action from the current policy and remember its log-probability
        state = torch.from_numpy(state).float().unsqueeze(0)
        probs = self.policy(state.to(self.device))
        m = Categorical(probs)
        action = m.sample()
        self.saved_log_probs.append(m.log_prob(action).to(self.device))
        return action.item()

    def update(self):
        R = 0
        policy_loss = []
        returns = []
        # discounted Monte Carlo returns, computed backwards over the episode
        for r in self.rewards[::-1]:
            R = r + self.gamma * R
            returns.insert(0, R)
        returns = torch.tensor(returns).to(self.device).type(self.FloatTensor)
        # normalize returns for variance reduction
        returns = (returns - returns.mean()) / (returns.std() + self.eps)
        for log_prob, R in zip(self.saved_log_probs, returns):
            policy_loss.append(-log_prob * R)
        self.optimizer.zero_grad()
        policy_loss = torch.cat(policy_loss).sum()
        policy_loss.backward()
        self.optimizer.step()
        del self.rewards[:]
        del self.saved_log_probs[:]
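# Minimal usage sketch for PGagent above, assuming a Gym-style discrete-action
# environment with the old (obs-only) reset/step API. The agentParam values and
# 'CartPole-v1' are illustrative assumptions, not settings from this repo.
if __name__ == '__main__':
    import gym

    env = gym.make('CartPole-v1')
    agentParam = {"gamma": 0.99, "LR": 1e-3, "device": "cpu",
                  "ifload": False, "filename": "", "id": "0"}
    agent = PGagent(env.observation_space.shape[0], env.action_space.n, agentParam)

    for episode in range(500):
        state = env.reset()
        done = False
        while not done:
            action = agent.select_action(state)
            state, reward, done, _ = env.step(action)
            agent.rewards.append(reward)  # store reward; update() consumes self.rewards
        agent.update()  # one REINFORCE update per episode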
class Trainer:
    def __init__(self, env_name, batch_size, gamma, use_random_features):
        self.random = use_random_features
        self.batch_size = batch_size  # batch_size == number of envs
        self.queues = [Queue() for i in range(batch_size)]
        self.barrier = Queue()  # used to block Trainer until all envs finish updating
        self.channel = Queue()  # envs send their total scores after each episode

        tmp_env = make_env(env_name)
        self.c_in = tmp_env.observation_space.shape[0]
        self.num_actions = tmp_env.action_space.n
        mean, std = self.mean_std_from_random_agent(tmp_env, 10000)

        # sh_state is shared between processes
        self.sh_state = self.init_shared(tmp_env.observation_space.shape)

        self.workers = [
            Worker(i, env_name, self.queues[i], self.barrier, self.channel,
                   self.sh_state, mean, std) for i in range(batch_size)
        ]
        self.start_workers()

        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.gamma = gamma  # reward discounting factor

        self.model = Policy(self.c_in, self.num_actions).to(self.device)
        self.icm = IntrinsicCuriosityModule(self.c_in, self.num_actions,
                                            self.random).to(self.device)
        self.optim = torch.optim.Adam(list(self.model.parameters()) +
                                      list(self.icm.parameters()),
                                      lr=1e-3)
        self.cross_entropy = torch.nn.CrossEntropyLoss()

    def reset_workers(self):
        for q in self.queues:
            q.put(-1)

    def broadcast_actions(self, actions):
        for i in range(self.batch_size):
            self.queues[i].put(actions[i].item())

    def start_workers(self):
        for worker in self.workers:
            worker.start()

    def stop_workers(self):
        for q in self.queues:
            q.put(None)

    def wait_for_workers(self):
        for i in range(self.batch_size):
            self.barrier.get()

    def init_shared(self, obs_shape):
        shape = (self.batch_size, ) + obs_shape
        state = np.zeros(shape, dtype=np.float32)
        state = RawArray(c_float, state.reshape(-1))
        state = np.frombuffer(state, c_float).reshape(shape)
        return state

    @staticmethod
    def mean_std_from_random_agent(env, steps):
        obs = np.empty((steps, ) + env.observation_space.shape, dtype=np.float32)
        env.reset()
        for i in range(steps):
            state, _, done, _ = env.step(env.action_space.sample())
            obs[i] = np.array(state)
            if done:
                env.reset()
        mean = np.mean(obs, 0)
        std = np.std(obs, 0).mean()
        return mean, std

    def train(self, T_max, graph_name=None):
        step = 0
        self.num_lookahead = 5
        self.reset_workers()
        self.wait_for_workers()

        stat = {
            'ploss': [],
            'vloss': [],
            'score': [],
            'int_reward': [],
            'entropy': [],
            'fwd_kl_div': [],
            'running_loss': 0
        }

        reward_tracker = RunningMeanStd()
        reward_buffer = np.empty((self.batch_size, self.num_lookahead),
                                 dtype=np.float32)

        while step < T_max:
            # these will keep tensors, which we'll use later for backpropagation
            values = []
            log_probs = []
            rewards = []
            entropies = []
            actions = []
            actions_pred = []
            features = []
            features_pred = []

            state = torch.from_numpy(self.sh_state).to(self.device)
            for i in range(self.num_lookahead):
                step += self.batch_size

                logit, value = self.model(state)
                prob = torch.softmax(logit, dim=1)
                log_prob = torch.log_softmax(logit, dim=1)
                entropy = -(prob * log_prob).sum(1, keepdim=True)

                action = prob.multinomial(1)
                sampled_lp = log_prob.gather(1, action)

                # one-hot action
                oh_action = torch.zeros(self.batch_size,
                                        self.num_actions,
                                        device=self.device).scatter_(1, action, 1)

                self.broadcast_actions(action)
                self.wait_for_workers()

                next_state = torch.from_numpy(self.sh_state).to(self.device)
                s1, s1_pred, action_pred = self.icm(state, oh_action, next_state)
                with torch.no_grad():
                    int_reward = 0.5 * (s1 - s1_pred).pow(2).sum(dim=1, keepdim=True)
                    reward_buffer[:, i] = int_reward.cpu().numpy().ravel()

                state = next_state

                # save variables for gradient descent
                values.append(value)
                log_probs.append(sampled_lp)
                rewards.append(int_reward)
                entropies.append(entropy)
                if not self.random:
                    actions.append(action.flatten())
                    actions_pred.append(action_pred)
                features.append(s1)
                features_pred.append(s1_pred)

                stat['entropy'].append(entropy.sum(dim=1).mean().item())
                stat['fwd_kl_div'].append(torch.kl_div(s1_pred, s1).mean().item())

            # may have to update reward_buffer with gamma first
            reward_mean, reward_std, count = mpi_moments(reward_buffer.ravel())
            reward_tracker.update_from_moments(reward_mean, reward_std**2, count)
            std = np.sqrt(reward_tracker.var)
            rewards = [rwd / std for rwd in rewards]
            for rwd in rewards:
                stat['int_reward'].append(rwd.mean().item())

            state = torch.from_numpy(self.sh_state.astype(np.float32)).to(self.device)
            with torch.no_grad():
                _, R = self.model(state)  # R is the estimated return
            values.append(R)

            ploss = 0
            vloss = 0
            fwd_loss = 0
            inv_loss = 0
            delta = torch.zeros((self.batch_size, 1),
                                dtype=torch.float,
                                device=self.device)

            for i in reversed(range(self.num_lookahead)):
                R = rewards[i] + self.gamma * R
                advantage = R - values[i]
                vloss += (0.5 * advantage.pow(2)).mean()

                delta = rewards[i] + self.gamma * values[i + 1].detach() - values[i].detach()
                ploss += -(log_probs[i] * delta + 0.01 * entropies[i]).mean()  # beta = 0.01

                fwd_loss += 0.5 * (features[i] - features_pred[i]).pow(2).sum(dim=1).mean()
                if not self.random:
                    inv_loss += self.cross_entropy(actions_pred[i], actions[i])

            self.optim.zero_grad()
            # inv_loss is 0 if using random features
            # The 2018 large-scale curiosity paper simply sums them (no lambda and beta anymore)
            loss = ploss + vloss + fwd_loss + inv_loss
            loss.backward()
            torch.nn.utils.clip_grad_norm_(
                list(self.model.parameters()) + list(self.icm.parameters()), 40)
            self.optim.step()

            while not self.channel.empty():
                score = self.channel.get()
                stat['score'].append(score)

            stat['ploss'].append(ploss.item() / self.num_lookahead)
            stat['vloss'].append(vloss.item() / self.num_lookahead)
            stat['running_loss'] = 0.99 * stat['running_loss'] + 0.01 * loss.item() / self.num_lookahead

            if len(stat['score']) > 20 and step % (self.batch_size * 1000) == 0:
                now = datetime.datetime.now().strftime("%H:%M")
                print(
                    f"Step {step: <10} | Running loss: {stat['running_loss']:.4f} | Running score: {np.mean(stat['score'][-10:]):.2f} | Time: {now}"
                )

            if graph_name is not None and step % (self.batch_size * 10000) == 0:
                plot(step, stat['score'], stat['int_reward'], stat['ploss'],
                     stat['vloss'], stat['entropy'], name=graph_name)
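# Minimal entry-point sketch for the curiosity Trainer above, assuming
# make_env/Worker/Policy/IntrinsicCuriosityModule come from this repo.
# ENV_NAME, the batch size, and T_max are illustrative assumptions.
if __name__ == '__main__':
    ENV_NAME = 'PongNoFrameskip-v4'  # assumed Atari-style env id
    trainer = Trainer(env_name=ENV_NAME,
                      batch_size=8,               # number of parallel worker envs
                      gamma=0.99,                 # discount for intrinsic returns
                      use_random_features=False)  # False -> also train the inverse model
    try:
        trainer.train(T_max=1_000_000, graph_name='curiosity_run')
    finally:
        trainer.stop_workers()  # always release the worker processes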
import os
import torch
from tqdm import tqdm
from datetime import datetime
from time import sleep
from loss import PerfPolicy, PerfValue
from math import sqrt
import sys

# params
gamma = 0.998
limit = 5e3
path_to_chkpt = 'weights.tar'
cpu = torch.device('cpu')  # pylint: disable=no-member
gpu = torch.device('cuda:0')  # pylint: disable=no-member

# networks
P = Policy()
V = Value()
need_pretrained = not os.path.isfile(path_to_chkpt)
gym = EggnoggGym(need_pretrained, gpu)  # network in gym.observation

# performance measures
Perf_p = PerfPolicy()
Perf_v = PerfValue()

# info
episode = 1
episode_len = []

# init save upon new start
if need_pretrained:
    """print('Initializing weights...')
def test(model_name, goal_pos=1, EWC_flag=True):
    episode_len = 50  # Length of each game.
    obs_size = 7 * 7  # MiniGrid uses a 7x7 window of visibility.
    act_size = 7  # Seven possible actions (turn left, right, forward, pickup, drop, etc.)
    inner_size = 64  # Number of neurons in two hidden layers.
    avg_reward = 0.0  # For tracking average reward per episode.
    env_name = 'MiniGrid-Empty-8x8-v0'  # Grid environment (8x8).

    test_avg_reward = open(
        "data-{model}/test_avg_rewards.txt".format(model=model_name), 'a+')

    # Setup OpenAI Gym environment for guessing game.
    env = gym.make(env_name)
    if goal_pos == 2:
        env.set_posX(2)
        env.set_posY(5)
    elif goal_pos == 3:
        env.set_posX(5)
        env.set_posY(2)

    # Check the model directory
    last_checkpoint = utils.search_last_model('torch_models/', model_name)

    # Instantiate a policy network
    policy = Policy(obs_size=obs_size, act_size=act_size, inner_size=inner_size)
    policy.load_state_dict(
        torch.load("torch_models/{model}/{model}-{step}.pth".format(
            model=model_name, step=last_checkpoint)))
    if EWC_flag:
        try:
            with open("data-{model}/FIM.dat".format(model=model_name), 'rb') as f:
                FIM = pickle.load(f)
                policy.set_FIM(FIM)
        except FileNotFoundError:
            with open("data-{model}/nonD_FIM.dat".format(model=model_name), 'rb') as f:
                FIM = pickle.load(f)
                policy.set_FIM(FIM)
    print("Loaded previous checkpoint at step {step}.".format(step=last_checkpoint))

    # Run for a fixed number of evaluation episodes.
    episodes = 1001
    for step in range(episodes):
        # MiniGrid has a QT5 renderer which is pretty cool.
        env.render('human')
        time.sleep(0.01)

        # Run an episode.
        (states, actions, discounted_rewards) = network.run_episode(env, policy, episode_len)

        avg_reward += np.mean(discounted_rewards)
        if step % 100 == 0:
            print('Average reward @ episode {}: {}'.format(
                step + int(last_checkpoint), avg_reward / 100))
            if step != 0:
                test_avg_reward.write(str(avg_reward / 100) + "\n")
            avg_reward = 0.0
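# Minimal sketch of how the evaluation routine above might be invoked. The model
# name matches the model_name used in run() below; the goal position is an
# illustrative choice, not a value prescribed by the repo.
if __name__ == '__main__':
    test("EWC_model_diag_FIM_3_tasks",  # checkpoint directory created by run()
         goal_pos=2,                    # evaluate on the second goal position
         EWC_flag=True)                 # load the stored Fisher information matrix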
def main():
    args = parser.parse_args()
    env_name = args.env_name
    input_file = args.input_file
    checkpoint_file = args.resume
    test_only = args.test_only
    seed = args.seed
    no_gpu = args.no_gpu
    dir_name = args.dir_name
    visualize = args.visualize
    n_test_steps = args.n_test_steps
    log_perf_file = args.log_perf_file
    min_distance = args.min_distance
    max_distance = args.max_distance
    threshold = args.threshold
    y_range = args.y_range
    n_training_samples = args.n_training_samples
    start_index = args.start_index
    exp_name = args.exp_name
    batch_size = args.batch_size
    learning_rate = args.learning_rate
    n_epochs = args.n_epochs

    # Specific to Humanoid - PyBullet
    if visualize and env_name == 'HumanoidBulletEnv-v0':
        spec = gym.envs.registry.env_specs[env_name]
        class_ = gym.envs.registration.load(spec._entry_point)
        env = class_(**{**spec._kwargs}, **{'render': True})
    else:
        env = gym.make(env_name)

    set_global_seed(seed)
    env.seed(seed)

    input_shape = env.observation_space.shape[0] + 3
    output_shape = env.action_space.shape[0]
    net = Policy(input_shape, output_shape)
    if not no_gpu:
        net = net.cuda()
    optimizer = Adam(net.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()
    epochs = 0

    if checkpoint_file:
        epochs, net, optimizer = load_checkpoint(checkpoint_file, net, optimizer)

    if not checkpoint_file and test_only:
        print('ERROR: You have not entered a checkpoint file.')
        return

    if not test_only:
        if not os.path.isfile(input_file):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                    input_file)
        training_file = open(input_file, 'rb')

        old_states = []
        norms = []
        goals = []
        actions = []
        n_samples = -1

        while n_samples - start_index < n_training_samples:
            try:
                old_s, old_g, new_s, new_g, action = pickle.load(training_file)
                n_samples += 1
                if n_samples < start_index:
                    continue
                old_states.append(np.squeeze(np.array(old_s)))
                norms.append(
                    find_norm(np.squeeze(np.array(new_g) - np.array(old_g))))
                goals.append(
                    preprocess_goal(np.squeeze(np.array(new_g) - np.array(old_g))))
                actions.append(np.squeeze(np.array(action)))
            except (EOFError, ValueError):
                break

        old_states = np.array(old_states)
        norms = np.array(norms)
        goals = np.array(goals)
        actions = np.array(actions)

        normalization_factors = {
            'state': [old_states.mean(axis=0), old_states.std(axis=0)],
            'distance_per_step': [norms.mean(axis=0), norms.std(axis=0)]
        }
        n_file = open(env_name + '_normalization_factors.pkl', 'wb')
        pickle.dump(normalization_factors, n_file)
        n_file.close()

        old_states = normalize(old_states,
                               env_name + '_normalization_factors.pkl', 'state')

        # Summary writer for tensorboardX
        writer = {}
        writer['writer'] = SummaryWriter()

        # Split data into training and validation
        indices = np.arange(old_states.shape[0])
        shuffle(indices)
        val_data = np.concatenate(
            (old_states[indices[:int(old_states.shape[0] / 5)]],
             goals[indices[:int(old_states.shape[0] / 5)]]),
            axis=1)
        val_labels = actions[indices[:int(old_states.shape[0] / 5)]]
        training_data = np.concatenate(
            (old_states[indices[int(old_states.shape[0] / 5):]],
             goals[indices[int(old_states.shape[0] / 5):]]),
            axis=1)
        training_labels = actions[indices[int(old_states.shape[0] / 5):]]
        del old_states, norms, goals, actions, indices

        checkpoint_dir = os.path.join(env_name, 'naive_gcp_checkpoints')
        if dir_name:
            checkpoint_dir = os.path.join(checkpoint_dir, dir_name)
        prepare_dir(checkpoint_dir)

        for e in range(epochs, n_epochs):
            ep_loss = []

            # Train network
            for i in range(int(len(training_data) / batch_size) + 1):
                inp = training_data[batch_size * i:batch_size * (i + 1)]
                out = net(convert_to_variable(inp, grad=False, gpu=(not no_gpu)))
                target = training_labels[batch_size * i:batch_size * (i + 1)]
                target = convert_to_variable(np.array(target),
                                             grad=False,
                                             gpu=(not no_gpu))
                loss = criterion(out, target)
                optimizer.zero_grad()
                ep_loss.append(loss.item())
                loss.backward()
                optimizer.step()

            # Validation
            val_loss = []
            for i in range(int(len(val_data) / batch_size) + 1):
                inp = val_data[batch_size * i:batch_size * (i + 1)]
                out = net(convert_to_variable(inp, grad=False, gpu=(not no_gpu)))
                target = val_labels[batch_size * i:batch_size * (i + 1)]
                target = convert_to_variable(np.array(target),
                                             grad=False,
                                             gpu=(not no_gpu))
                loss = criterion(out, target)
                val_loss.append(loss.item())

            writer['iter'] = e + 1
            writer['writer'].add_scalar('data/val_loss',
                                        np.array(val_loss).mean(), e + 1)
            writer['writer'].add_scalar('data/training_loss',
                                        np.array(ep_loss).mean(), e + 1)

            save_checkpoint(
                {
                    'epochs': (e + 1),
                    'state_dict': net.state_dict(),
                    'optimizer': optimizer.state_dict()
                },
                filename=os.path.join(checkpoint_dir, str(e + 1) + '.pth.tar'))

            print('Epoch:', e + 1)
            print('Training loss:', np.array(ep_loss).mean())
            print('Val loss:', np.array(val_loss).mean())
            print('')

    # Now we use the trained net to see how the agent reaches a different
    # waypoint from the current one.
    success = 0
    failure = 0
    closest_distances = []
    time_to_closest_distances = []

    f = open(env_name + '_normalization_factors.pkl', 'rb')
    normalization_factors = pickle.load(f)
    average_distance = normalization_factors['distance_per_step'][0]

    for i in range(n_test_steps):
        state = env.reset()
        if env_name == 'Ant-v2':
            obs = env.unwrapped.get_body_com('torso')
            target_obs = [
                obs[0] + np.random.uniform(min_distance, max_distance),
                obs[1] + np.random.uniform(-y_range, y_range), obs[2]
            ]
            target_obs = rotate_point(target_obs, env.unwrapped.angle)
            env.unwrapped.sim.model.body_pos[-1] = target_obs
        elif env_name == 'MinitaurBulletEnv-v0':
            obs = env.unwrapped.get_minitaur_position()
            target_obs = [
                obs[0] + np.random.uniform(min_distance, max_distance),
                obs[1] + np.random.uniform(-y_range, y_range), obs[2]
            ]
            target_obs = rotate_point(
                target_obs, env.unwrapped.get_minitaur_rotation_angle())
            env.unwrapped.set_target_position(target_obs)
        elif env_name == 'HumanoidBulletEnv-v0':
            obs = env.unwrapped.robot.get_robot_position()
            target_obs = [
                obs[0] + np.random.uniform(min_distance, max_distance),
                obs[1] + np.random.uniform(-y_range, y_range), obs[2]
            ]
            target_obs = rotate_point(target_obs, env.unwrapped.robot.yaw)
            env.unwrapped.robot.set_target_position(target_obs[0], target_obs[1])

        steps = 0
        done = False
        closest_d = distance(obs, target_obs)
        closest_t = 0

        while distance(obs, target_obs) > threshold and not done:
            goal = preprocess_goal(target_obs - obs)
            state = normalize(np.array(state),
                              env_name + '_normalization_factors.pkl')
            inp = np.concatenate([np.squeeze(state), goal])
            inp = convert_to_variable(inp, grad=False, gpu=(not no_gpu))
            action = net(inp).cpu().detach().numpy()
            state, _, done, _ = env.step(action)
            steps += 1

            if env_name == 'MinitaurBulletEnv-v0':
                obs = env.unwrapped.get_minitaur_position()
            elif env_name == 'HumanoidBulletEnv-v0':
                obs = env.unwrapped.robot.get_robot_position()

            if distance(obs, target_obs) < closest_d:
                closest_d = distance(obs, target_obs)
                closest_t = steps

            if visualize:
                env.render()

        if distance(obs, target_obs) <= threshold:
            success += 1
        elif done:
            failure += 1

        if visualize:
            time.sleep(2)

        closest_distances.append(closest_d)
        time_to_closest_distances.append(closest_t)

    print('Successes: %d, Failures: %d, '
          'Closest distance: %f, Time to closest distance: %d' %
          (success, failure, np.mean(closest_distances),
           np.mean(time_to_closest_distances)))

    if log_perf_file:
        f = open(log_perf_file, 'a+')
        f.write(exp_name + ':Seed-' + str(seed) + ',Success-' + str(success) +
                ',Failure-' + str(failure) + ',Closest_distance-' +
                str(closest_distances) + ',Time_to_closest_distance-' +
                str(time_to_closest_distances) + '\n')
        f.close()
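# main() above reads a module-level `parser` that is not shown in this excerpt.
# Below is a minimal sketch of how it might be built with argparse: the argument
# names are taken from the attributes main() reads off `args`, but every default
# value here is an illustrative assumption, not a setting from the repo.
import argparse

parser = argparse.ArgumentParser(description='Goal-conditioned policy training/testing')
parser.add_argument('--env-name', default='Ant-v2')
parser.add_argument('--input-file', default=None)
parser.add_argument('--resume', default=None, help='checkpoint file to resume from')
parser.add_argument('--test-only', action='store_true')
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--no-gpu', action='store_true')
parser.add_argument('--dir-name', default=None)
parser.add_argument('--visualize', action='store_true')
parser.add_argument('--n-test-steps', type=int, default=100)
parser.add_argument('--log-perf-file', default=None)
parser.add_argument('--min-distance', type=float, default=1.0)
parser.add_argument('--max-distance', type=float, default=5.0)
parser.add_argument('--threshold', type=float, default=0.5)
parser.add_argument('--y-range', type=float, default=1.0)
parser.add_argument('--n-training-samples', type=int, default=100000)
parser.add_argument('--start-index', type=int, default=0)
parser.add_argument('--exp-name', default='naive_gcp')
parser.add_argument('--batch-size', type=int, default=64)
parser.add_argument('--learning-rate', type=float, default=1e-3)
parser.add_argument('--n-epochs', type=int, default=50)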
def run(episodes=1600,
        episode_len=50,
        inner_size=64,
        lr=0.001,
        env_name='MiniGrid-Empty-8x8-v0',
        training=False,
        goal_pos=1):
    obs_size = 7 * 7  # MiniGrid uses a 7x7 window of visibility.
    act_size = 7  # Seven possible actions (turn left, right, forward, pickup, drop, etc.)
    avg_reward = 0.0  # For tracking average reward per episode.
    first_write_flag = True  # Needed due to a quirk in how the first average is reported.
    need_diag_FIM = True  # Skip the FIM computation if it is not required.
    need_nondiag_FIM = False  # Same as above, but for the non-diagonal FIM.
    model_name = "EWC_model_diag_FIM_3_tasks"  # Retrieve the correct model if it exists.
    EWC_flag = True  # If True, use the EWC loss.
    if not EWC_flag:
        need_nondiag_FIM = False
        need_diag_FIM = False

    # Check whether the data directory exists and, if not, create it with all the necessary stuff.
    if not os.path.exists("data-{model}/".format(model=model_name)):
        print("Task 2 data directory created.")
        os.makedirs("data-{model}/".format(model=model_name))
    output_reward = open("data-{model}/reward.txt".format(model=model_name), 'a+')
    output_avg = open("data-{model}/avg_reward.txt".format(model=model_name), 'a+')
    output_loss = open("data-{model}/loss.txt".format(model=model_name), 'a+')

    # Setup OpenAI Gym environment for guessing game.
    env = gym.make(env_name)
    if goal_pos == 2:
        env.set_posX(2)
        env.set_posY(5)
    elif goal_pos == 3:
        env.set_posX(5)
        env.set_posY(2)

    # Check the model directory
    last_checkpoint = utils.search_last_model('torch_models/', model_name)

    # Instantiate a policy network
    policy = Policy(obs_size=obs_size, act_size=act_size, inner_size=inner_size)

    # If there's a previous checkpoint, load this instead of using a new one.
    if os.listdir('torch_models/{model}/'.format(model=model_name)):
        policy.load_state_dict(
            torch.load("torch_models/{model}/{model}-{step}.pth".format(
                model=model_name, step=last_checkpoint)))
        if need_diag_FIM and EWC_flag:
            with open("data-{model}/FIM.dat".format(model=model_name), 'rb') as f:
                FIM = pickle.load(f)
                policy.set_FIM(FIM)
        elif need_nondiag_FIM and EWC_flag:
            with open("data-{model}/nonD_FIM.dat".format(model=model_name), 'rb') as f:
                FIM = pickle.load(f)
                policy.set_FIM(FIM)
        print("Loaded previous checkpoint at step {step}.".format(
            step=last_checkpoint))
    else:
        print("Created new policy agent.")

    # Use the Adam optimizer.
    optimizer = torch.optim.Adam(params=policy.parameters(), lr=lr)

    try:
        for step in range(episodes):
            # MiniGrid has a QT5 renderer which is pretty cool.
            env.render('human')
            time.sleep(0.01)

            # Run an episode.
            (states, actions, discounted_rewards) = network.run_episode(env, policy, episode_len)

            # From list to np.array, then save every element in the array.
            discounted_rewards_np = np.asarray(discounted_rewards)
            if step % 100 == 0 and training:
                output_reward.write(str(discounted_rewards_np) + "\n")

            avg_reward += np.mean(discounted_rewards)
            if step % 100 == 0:
                print('Average reward @ episode {}: {}'.format(
                    step + int(last_checkpoint), avg_reward / 100))
                if not first_write_flag and training:
                    output_avg.write(str(avg_reward / 100) + "\n")
                else:
                    first_write_flag = False
                avg_reward = 0.0

            # Save the model every 500 episodes.
            if step % 500 == 0 and training:
                torch.save(
                    policy.state_dict(),
                    'torch_models/{model}/{model}-{step}.pth'.format(
                        model=model_name, step=step + int(last_checkpoint)))
                print("Checkpoint saved.")

            # Replay each action and backpropagate the discounted rewards.
            # This could probably be batched for efficiency with a memoryless agent.
            if training:
                optimizer.zero_grad()
            episode_loss = []
            for (t, a) in enumerate(actions):
                logits = policy(states[t])
                dist = Categorical(logits=logits)
                if EWC_flag:
                    loss = -dist.log_prob(actions[t]) * discounted_rewards[t] \
                           + ewc.ewc_loss(policy, 2)
                else:
                    loss = -dist.log_prob(actions[t]) * discounted_rewards[t]
                loss.backward()
                episode_loss.append(loss.item())
            current_loss = sum(episode_loss) / episode_len
            if training:
                optimizer.step()
                output_loss.write(str(float(current_loss)) + "\n")
    except KeyboardInterrupt:
        if training:
            print("Training ended.")
        else:
            print("Simulation ended.")

    # Now estimate the diagonal FIM.
    if need_diag_FIM:
        utils.diagonal_FIM(policy, env, episode_len, model_name)
    elif need_nondiag_FIM:
        utils.non_diagonal_FIM(policy, env, episode_len, model_name)
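# Minimal usage sketch for run() above: train on the default goal, then roll out
# the same policy on a shifted goal. The episode counts and goal positions here
# are illustrative assumptions, not the repo's experiment settings.
if __name__ == '__main__':
    run(episodes=1600, training=True, goal_pos=1)   # train on task 1
    run(episodes=200, training=False, goal_pos=2)   # evaluate (no updates) on task 2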
class PPOTrainer:
    def __init__(self, args):
        tmp_env = make_env(args.env)
        self.obs_shape = tmp_env.observation_space.shape
        self.num_actions = tmp_env.action_space.n
        self.c_in = self.obs_shape[0]
        del tmp_env

        self.horizon = args.horizon
        self.eta = args.eta
        self.epoch = args.epoch
        self.batch_size = args.batch * args.actors
        self.gamma = args.gamma
        self.lam = args.lam
        self.num_actors = args.actors
        self.eps = args.eps
        self.num_iter = (args.epoch * args.actors *
                         args.horizon) // self.batch_size  # how many times to run SGD on the buffer

        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        self.queues = [Queue() for i in range(self.num_actors)]
        self.barrier = Queue()  # used as a waiting mechanism, to wait for all the agents to env.step()
        self.score_channel = Queue()

        # these are shmem np.arrays
        self.state, self.reward, self.finished = self.init_shared()

        self.workers = [
            Worker(i, args.env, self.queues[i], self.barrier, self.state,
                   self.reward, self.finished, self.score_channel)
            for i in range(self.num_actors)
        ]
        self.start_workers()

        self.model = Policy(self.c_in, self.num_actions).to(self.device)
        self.optim = torch.optim.Adam(self.model.parameters(), lr=self.eta)

        # used for logging and graphing
        self.stat = {
            'scores': [],
            'steps': [],
            'clip_losses': [],
            'value_losses': [],
            'entropies': []
        }

    def init_shared(self):
        state_shape = (self.num_actors, *self.obs_shape)
        scalar_shape = (self.num_actors, 1)

        state = np.empty(state_shape, dtype=np.float32)
        state = RawArray(c_float, state.reshape(-1))
        state = np.frombuffer(state, c_float).reshape(state_shape)

        reward = np.empty(scalar_shape, dtype=np.float32)
        reward = RawArray(c_float, reward.reshape(-1))
        reward = np.frombuffer(reward, c_float).reshape(scalar_shape)

        finished = np.empty(scalar_shape, dtype=np.float32)
        finished = RawArray(c_float, finished.reshape(-1))
        finished = np.frombuffer(finished, c_float).reshape(scalar_shape)

        return state, reward, finished

    def start_workers(self):
        for worker in self.workers:
            worker.start()

    def initialize_state(self):
        for i in range(self.num_actors):
            self.queues[i].put(-1)
        self.wait_for_agents()

    @timing_wrapper
    def broadcast_actions(self, actions):
        actions = actions.cpu().numpy()
        for i in range(self.num_actors):
            self.queues[i].put(actions[i])
        self.wait_for_agents()

        next_state = torch.tensor(self.state).to(self.device)
        reward = torch.tensor(self.reward).to(self.device)
        done = torch.tensor(self.finished).to(self.device)
        return next_state, reward, done

    def wait_for_agents(self):
        for i in range(self.num_actors):
            self.barrier.get()

    def setup_scheduler(self, T_max):
        num_steps = T_max // (self.horizon * self.num_actors)
        self.scheduler = torch.optim.lr_scheduler.LambdaLR(
            self.optim, lambda x: max(1 - x / num_steps, 0))

    @timing_wrapper
    def train(self, T_max, graph_name=None):
        self.setup_scheduler(T_max)
        global_step = 0
        self.initialize_state()
        state = torch.tensor(self.state).to(self.device)

        while global_step < T_max:
            states = []
            actions = []
            rewards = []
            finished = []
            sampled_lps = []  # sampled log probabilities
            values = []

            time_start = time.time()
            duration_fwd = 0
            with torch.no_grad():
                for t in range(self.horizon):
                    global_step += self.num_actors

                    logit, value = self.model(state)
                    prob = torch.softmax(logit, dim=1)
                    log_prob = torch.log_softmax(logit, dim=1)

                    action = prob.multinomial(1)
                    sampled_lp = log_prob.gather(1, action)

                    (next_state, reward, done), duration_brdcst = self.broadcast_actions(action)

                    # appending to buffer
                    states.append(state)
                    actions.append(action)
                    rewards.append(reward)
                    finished.append(done)
                    sampled_lps.append(sampled_lp)
                    values.append(value)

                    state = next_state
                    duration_fwd += duration_brdcst

                _, V = self.model(next_state)
                values.append(V)

            time_forward = time.time()

            # GAE estimation
            GAEs, duration_GAE = self.compute_GAE(rewards, finished, values)
            duration_backward = self.run_gradient_descent(states, actions,
                                                          sampled_lps, values, GAEs)

            time_end = time.time()
            total_duration = time_end - time_start
            percent_broadcast = duration_fwd / total_duration * 100
            percent_forward = (time_forward - time_start) / total_duration * 100
            percent_GAE = duration_GAE / total_duration * 100
            percent_backward = duration_backward / total_duration * 100
            # print(f"<Time> Total: {total_duration:.2f} | forward: {percent_forward:.2f}% (broadcast {percent_broadcast:.2f}%) | GAE: {percent_GAE:.2f}% | backward: {percent_backward:.2f}%")

            if global_step % (self.num_actors * self.horizon * 30) == 0:
                while not self.score_channel.empty():
                    score, step = self.score_channel.get()
                    self.stat['scores'].append(score)
                    self.stat['steps'].append(step)

                now = datetime.datetime.now().strftime("%H:%M")
                print(
                    f"Step {global_step} | Mean of last 10 scores: {np.mean(self.stat['scores'][-10:]):.2f} | Time: {now}"
                )
                if graph_name is not None:
                    plot(global_step, self.stat, graph_name)

        # Finish
        plot(global_step, self.stat, graph_name)

    @timing_wrapper
    def compute_GAE(self, rewards, finished, values):
        GAEs = []
        advantage = 0
        for i in reversed(range(self.horizon)):
            td_error = rewards[i] + (1 - finished[i]) * self.gamma * values[i + 1] - values[i]
            advantage = td_error + (1 - finished[i]) * self.gamma * self.lam * advantage
            GAEs.append(advantage)
        GAEs = torch.cat(GAEs[::-1]).to(self.device)

        # NOTE: Below is currently not in use because I don't know how to take the 'finished' tensor into account
        # NOTE: This version is much, much faster than the python-looped version above
        # NOTE: But in terms of the total time taken, it doesn't make much of a difference (~2% compared to ~0.05%)
        # rewards = torch.stack(rewards)
        # finished = torch.stack(finished)
        # values = torch.stack(values)
        # td_error = rewards + (1 - finished) * self.gamma * values[1:] - values[:-1]
        # td_error = td_error.cpu()
        # GAEs = scipy.signal.lfilter([1], [1, -self.gamma * self.lam], td_error.flip(dims=(0,)), axis=0)
        # GAEs = np.flip(GAEs, axis=0)  # flip it back again
        # GAEs = GAEs.reshape(-1, GAEs.shape[-1])  # (horizon, num_actors, 1) --> (horizon * num_actors, 1)
        # GAEs = torch.tensor(GAEs).float().to(self.device)
        return GAEs

    @timing_wrapper
    def run_gradient_descent(self, states, actions, sampled_lps, values, GAEs):
        states = torch.cat(states)
        actions = torch.cat(actions)
        sampled_lps = torch.cat(sampled_lps)
        values = torch.cat(values[:-1])
        targets = GAEs + values

        self.scheduler.step()

        # Running SGD for K epochs
        for it in range(self.num_iter):
            # Batch indices
            idx = np.random.randint(0, self.horizon * self.num_actors, self.batch_size)

            state = states[idx]
            action = actions[idx]
            sampled_lp = sampled_lps[idx]
            GAE = GAEs[idx]
            value = values[idx]
            target = targets[idx]

            # Normalize advantages
            GAE = (GAE - GAE.mean()) / (GAE.std() + 1e-8)

            logit_new, value_new = self.model(state)
            # Clipped values are needed because sometimes values can unexpectedly get really big
            clipped_value_new = value + torch.clamp(value_new - value, -self.eps, self.eps)

            # Calculating policy loss
            prob_new = torch.softmax(logit_new, dim=1)
            lp_new = torch.log_softmax(logit_new, dim=1)
            entropy = -(prob_new * lp_new).sum(1).mean()

            sampled_lp_new = lp_new.gather(1, action)
            ratio = torch.exp(sampled_lp_new - sampled_lp)
            surr1 = ratio * GAE
            surr2 = torch.clamp(ratio, 1 - self.eps, 1 + self.eps) * GAE
            clip_loss = torch.min(surr1, surr2).mean()

            # Calculating value loss
            value_loss1 = (value_new - target).pow(2)
            value_loss2 = (clipped_value_new - target).pow(2)
            value_loss = 0.5 * torch.max(value_loss1, value_loss2).mean()

            final_loss = -clip_loss + value_loss - 0.01 * entropy

            self.optim.zero_grad()
            final_loss.backward()
            # total_norm = 0
            # for p in self.model.parameters():
            #     param_norm = p.grad.data.norm(2)
            #     total_norm += param_norm.item() ** 2
            # total_norm = total_norm ** (1. / 2)
            # print(total_norm)
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1)
            self.optim.step()

            # graphing
            self.stat['clip_losses'].append(clip_loss.item())
            self.stat['value_losses'].append(value_loss.item())
            self.stat['entropies'].append(entropy.item())
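# Minimal sketch of how PPOTrainer might be driven. The hyperparameter values
# below are common PPO defaults chosen for illustration, not this repo's actual
# settings; argparse.Namespace simply mimics the `args` object __init__ expects.
if __name__ == '__main__':
    from argparse import Namespace

    args = Namespace(env='PongNoFrameskip-v4',  # assumed Atari-style env id
                     horizon=128,   # steps collected per actor before each update
                     eta=2.5e-4,    # learning rate
                     epoch=3,       # SGD epochs over each rollout buffer
                     batch=32,      # per-actor minibatch size
                     gamma=0.99,    # discount factor
                     lam=0.95,      # GAE lambda
                     actors=8,      # parallel worker processes
                     eps=0.1)       # PPO clipping parameter
    trainer = PPOTrainer(args)
    trainer.train(T_max=10_000_000, graph_name='ppo_run')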