def main(
        experiment='dev',
        env_name='Particles2D-v1',
        adapt_lr=0.1,
        meta_lr=0.01,
        adapt_steps=1,
        num_iterations=200,
        meta_bsz=20,
        adapt_bsz=20,
        tau=1.00,
        gamma=0.99,
        num_workers=2,
        seed=42,
):
    random.seed(seed)
    np.random.seed(seed)
    th.manual_seed(seed)

    def make_env():
        return gym.make(env_name)

    env = l2l.gym.AsyncVectorEnv([make_env for _ in range(num_workers)])
    env.seed(seed)
    env = ch.envs.Torch(env)
    policy = DiagNormalPolicy(env.state_size, env.action_size)
    meta_learner = l2l.algorithms.MetaSGD(policy, lr=meta_lr)
    baseline = LinearValue(env.state_size, env.action_size)
    opt = optim.Adam(policy.parameters(), lr=meta_lr)
    all_rewards = []

    for iteration in range(num_iterations):
        iteration_loss = 0.0
        iteration_reward = 0.0
        for task_config in tqdm(env.sample_tasks(meta_bsz)):  # Samples a new config
            learner = meta_learner.clone()
            env.set_task(task_config)
            env.reset()
            task = ch.envs.Runner(env)

            # Fast Adapt
            for step in range(adapt_steps):
                train_episodes = task.run(learner, episodes=adapt_bsz)
                loss = maml_a2c_loss(train_episodes, learner, baseline, gamma, tau)
                learner.adapt(loss)

            # Compute Validation Loss
            valid_episodes = task.run(learner, episodes=adapt_bsz)
            loss = maml_a2c_loss(valid_episodes, learner, baseline, gamma, tau)
            iteration_loss += loss
            iteration_reward += valid_episodes.reward().sum().item() / adapt_bsz

        # Print statistics
        print('\nIteration', iteration)
        adaptation_reward = iteration_reward / meta_bsz
        print('adaptation_reward', adaptation_reward)
        all_rewards.append(adaptation_reward)

        adaptation_loss = iteration_loss / meta_bsz
        print('adaptation_loss', adaptation_loss.item())

        opt.zero_grad()
        adaptation_loss.backward()
        opt.step()
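# The script above calls a maml_a2c_loss helper that is not shown in this section,
# and the later scripts call compute_advantages. The following is a minimal sketch of
# what such helpers might look like, assuming cherry's ch.td.discount,
# ch.pg.generalized_advantage, a2c.policy_loss, and LinearValue.fit utilities, plus a
# policy exposing log_prob(states, actions); details are illustrative, not the
# verbatim originals.
import torch as th
import cherry as ch
from cherry.algorithms import a2c


def compute_advantages(baseline, tau, gamma, rewards, dones, states, next_states):
    # Fit the linear baseline on discounted returns, then compute GAE advantages.
    returns = ch.td.discount(gamma, rewards, dones)
    baseline.fit(states, returns)
    values = baseline(states)
    next_values = baseline(next_states)
    # Replace the value with the next-state value at episode boundaries.
    bootstraps = values * (1.0 - dones) + next_values * dones
    next_value = th.zeros(1, values.size(1), device=values.device)
    return ch.pg.generalized_advantage(tau=tau,
                                       gamma=gamma,
                                       rewards=rewards,
                                       dones=dones,
                                       values=bootstraps,
                                       next_value=next_value)


def maml_a2c_loss(train_episodes, learner, baseline, gamma, tau):
    # A2C policy-gradient loss on one batch of episodes,
    # differentiable w.r.t. the learner's (possibly adapted) parameters.
    states = train_episodes.state()
    actions = train_episodes.action()
    rewards = train_episodes.reward()
    dones = train_episodes.done()
    next_states = train_episodes.next_state()
    log_probs = learner.log_prob(states, actions)
    advantages = compute_advantages(baseline, tau, gamma, rewards,
                                    dones, states, next_states)
    advantages = ch.normalize(advantages).detach()
    return a2c.policy_loss(log_probs, advantages)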
def main(
        env_name='AntDirection-v1',
        adapt_lr=0.1,
        meta_lr=1.0,
        adapt_steps=1,
        num_iterations=1000,
        meta_bsz=40,
        adapt_bsz=20,
        tau=1.00,
        gamma=0.99,
        seed=42,
        num_workers=2,
        cuda=0,
):
    cuda = bool(cuda)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed(seed)

    def make_env():
        env = gym.make(env_name)
        env = ch.envs.ActionSpaceScaler(env)
        return env

    env = l2l.gym.AsyncVectorEnv([make_env for _ in range(num_workers)])
    env.seed(seed)
    env.set_task(env.sample_tasks(1)[0])
    env = ch.envs.Torch(env)
    policy = DiagNormalPolicy(env.state_size, env.action_size)
    if cuda:
        policy.to('cuda')
    baseline = LinearValue(env.state_size, env.action_size)

    for iteration in range(num_iterations):
        iteration_reward = 0.0
        iteration_replays = []
        iteration_policies = []

        for task_config in tqdm(env.sample_tasks(meta_bsz),
                                leave=False,
                                desc='Data'):  # Samples a new config
            clone = deepcopy(policy)
            env.set_task(task_config)
            env.reset()
            task = ch.envs.Runner(env)
            task_replay = []

            # Fast Adapt
            for step in range(adapt_steps):
                train_episodes = task.run(clone, episodes=adapt_bsz)
                clone = fast_adapt_a2c(clone, train_episodes, adapt_lr,
                                       baseline, gamma, tau, first_order=True)
                task_replay.append(train_episodes)

            # Compute Validation Loss
            valid_episodes = task.run(clone, episodes=adapt_bsz)
            task_replay.append(valid_episodes)
            iteration_reward += valid_episodes.reward().sum().item() / adapt_bsz
            iteration_replays.append(task_replay)
            iteration_policies.append(clone)

        # Print statistics
        print('\nIteration', iteration)
        adaptation_reward = iteration_reward / meta_bsz
        print('adaptation_reward', adaptation_reward)

        # TRPO meta-optimization
        backtrack_factor = 0.5
        ls_max_steps = 15
        max_kl = 0.01
        if cuda:
            policy.to('cuda', non_blocking=True)
            baseline.to('cuda', non_blocking=True)
            iteration_replays = [[r.to('cuda', non_blocking=True) for r in task_replays]
                                 for task_replays in iteration_replays]

        # Compute CG step direction
        old_loss, old_kl = meta_surrogate_loss(iteration_replays, iteration_policies,
                                               policy, baseline, tau, gamma, adapt_lr)
        grad = autograd.grad(old_loss, policy.parameters(), retain_graph=True)
        grad = parameters_to_vector([g.detach() for g in grad])
        Fvp = trpo.hessian_vector_product(old_kl, policy.parameters())
        step = trpo.conjugate_gradient(Fvp, grad)
        shs = 0.5 * torch.dot(step, Fvp(step))
        lagrange_multiplier = torch.sqrt(shs / max_kl)
        step = step / lagrange_multiplier
        step_ = [torch.zeros_like(p.data) for p in policy.parameters()]
        vector_to_parameters(step, step_)
        step = step_
        del old_kl, Fvp, grad
        old_loss.detach_()

        # Line-search
        for ls_step in range(ls_max_steps):
            stepsize = backtrack_factor ** ls_step * meta_lr
            clone = deepcopy(policy)
            for p, u in zip(clone.parameters(), step):
                p.data.add_(u.data, alpha=-stepsize)
            new_loss, kl = meta_surrogate_loss(iteration_replays, iteration_policies,
                                               clone, baseline, tau, gamma, adapt_lr)
            if new_loss < old_loss and kl < max_kl:
                for p, u in zip(policy.parameters(), step):
                    p.data.add_(u.data, alpha=-stepsize)
                break
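# This script and the ones below call a fast_adapt_a2c helper that performs one
# MAML-style inner-loop update. A minimal sketch, assuming
# l2l.algorithms.maml.maml_update and the maml_a2c_loss helper sketched earlier;
# the original helper may differ in details.
import torch.autograd as autograd
import learn2learn as l2l


def fast_adapt_a2c(clone, train_episodes, adapt_lr, baseline, gamma, tau,
                   first_order=False):
    # One gradient step on the A2C loss. The graph is kept only for
    # second-order MAML, so first_order=True is cheaper during data collection.
    second_order = not first_order
    loss = maml_a2c_loss(train_episodes, clone, baseline, gamma, tau)
    gradients = autograd.grad(loss,
                              clone.parameters(),
                              retain_graph=second_order,
                              create_graph=second_order)
    return l2l.algorithms.maml.maml_update(clone, adapt_lr, gradients)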
def main(
        benchmark=ML10,  # Choose between ML1, ML10, ML45
        adapt_lr=0.1,
        meta_lr=0.1,
        adapt_steps=1,
        num_iterations=1000,
        meta_bsz=20,
        adapt_bsz=10,  # Number of episodes to sample per task
        tau=1.00,
        gamma=0.99,
        seed=42,
        num_workers=10,  # Tasks are distributed evenly, so adapt_bsz should be divisible by num_workers
        cuda=0,
):
    env = make_env(benchmark, seed, num_workers)

    cuda = bool(cuda)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed(seed)

    policy = DiagNormalPolicy(env.state_size, env.action_size, activation='tanh')
    if cuda:
        policy.to('cuda')
    baseline = LinearValue(env.state_size, env.action_size)

    for iteration in range(num_iterations):
        iteration_reward = 0.0
        iteration_replays = []
        iteration_policies = []

        for task_config in tqdm(env.sample_tasks(meta_bsz),
                                leave=False,
                                desc='Data'):  # Samples a new config
            clone = deepcopy(policy)
            env.set_task(task_config)
            env.reset()
            task = ch.envs.Runner(env)
            task_replay = []

            # Fast Adapt
            for step in range(adapt_steps):
                train_episodes = task.run(clone, episodes=adapt_bsz)
                clone = fast_adapt_a2c(clone, train_episodes, adapt_lr,
                                       baseline, gamma, tau, first_order=True)
                task_replay.append(train_episodes)

            # Compute Validation Loss
            valid_episodes = task.run(clone, episodes=adapt_bsz)
            task_replay.append(valid_episodes)
            iteration_reward += valid_episodes.reward().sum().item() / adapt_bsz
            iteration_replays.append(task_replay)
            iteration_policies.append(clone)

        # Print statistics
        print('\nIteration', iteration)
        validation_reward = iteration_reward / meta_bsz
        print('validation_reward', validation_reward)

        # TRPO meta-optimization
        backtrack_factor = 0.5
        ls_max_steps = 15
        max_kl = 0.01
        if cuda:
            policy.to('cuda', non_blocking=True)
            baseline.to('cuda', non_blocking=True)
            iteration_replays = [[r.to('cuda', non_blocking=True) for r in task_replays]
                                 for task_replays in iteration_replays]

        # Compute CG step direction
        old_loss, old_kl = meta_surrogate_loss(iteration_replays, iteration_policies,
                                               policy, baseline, tau, gamma, adapt_lr)
        grad = autograd.grad(old_loss, policy.parameters(), retain_graph=True)
        grad = parameters_to_vector([g.detach() for g in grad])
        Fvp = trpo.hessian_vector_product(old_kl, policy.parameters())
        step = trpo.conjugate_gradient(Fvp, grad)
        shs = 0.5 * torch.dot(step, Fvp(step))
        lagrange_multiplier = torch.sqrt(shs / max_kl)
        step = step / lagrange_multiplier
        step_ = [torch.zeros_like(p.data) for p in policy.parameters()]
        vector_to_parameters(step, step_)
        step = step_
        del old_kl, Fvp, grad
        old_loss.detach_()

        # Line-search
        for ls_step in range(ls_max_steps):
            stepsize = backtrack_factor ** ls_step * meta_lr
            clone = deepcopy(policy)
            for p, u in zip(clone.parameters(), step):
                p.data.add_(u.data, alpha=-stepsize)
            new_loss, kl = meta_surrogate_loss(iteration_replays, iteration_policies,
                                               clone, baseline, tau, gamma, adapt_lr)
            if new_loss < old_loss and kl < max_kl:
                for p, u in zip(policy.parameters(), step):
                    p.data.add_(u.data, alpha=-stepsize)
                break

    # Evaluate on a set of unseen tasks
    evaluate(benchmark, policy, baseline, adapt_lr, gamma, tau, num_workers, seed)
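# Both TRPO scripts rely on a meta_surrogate_loss helper: it re-adapts a clone of the
# current meta-policy on the stored training episodes and evaluates a TRPO surrogate
# plus a KL term on the stored validation episodes. A sketch under the same
# assumptions as the helpers above (a policy exposing .density(states), cherry's
# trpo.policy_loss, l2l.clone_module); illustrative, not necessarily the exact original.
import cherry as ch
import learn2learn as l2l
from torch.distributions import kl_divergence
from cherry.algorithms import trpo


def meta_surrogate_loss(iteration_replays, iteration_policies, policy,
                        baseline, tau, gamma, adapt_lr):
    mean_loss = 0.0
    mean_kl = 0.0
    for task_replays, old_policy in zip(iteration_replays, iteration_policies):
        train_replays = task_replays[:-1]
        valid_episodes = task_replays[-1]

        # Re-adapt a differentiable clone of the meta-policy on the stored
        # training episodes, so the surrogate depends on the meta-parameters.
        new_policy = l2l.clone_module(policy)
        for train_episodes in train_replays:
            new_policy = fast_adapt_a2c(new_policy, train_episodes, adapt_lr,
                                        baseline, gamma, tau)

        # Surrogate loss and KL on the validation episodes.
        states = valid_episodes.state()
        actions = valid_episodes.action()
        rewards = valid_episodes.reward()
        dones = valid_episodes.done()
        next_states = valid_episodes.next_state()
        old_densities = old_policy.density(states)
        new_densities = new_policy.density(states)
        mean_kl += kl_divergence(new_densities, old_densities).mean()

        advantages = compute_advantages(baseline, tau, gamma, rewards,
                                        dones, states, next_states)
        advantages = ch.normalize(advantages).detach()
        old_log_probs = old_densities.log_prob(actions).mean(dim=1, keepdim=True).detach()
        new_log_probs = new_densities.log_prob(actions).mean(dim=1, keepdim=True)
        mean_loss += trpo.policy_loss(new_log_probs, old_log_probs, advantages)

    mean_kl /= len(iteration_replays)
    mean_loss /= len(iteration_replays)
    return mean_loss, mean_kl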
def main(
        env_name='AntDirection-v1',
        adapt_lr=0.1,
        meta_lr=3e-4,
        adapt_steps=3,
        num_iterations=1000,
        meta_bsz=40,
        adapt_bsz=20,
        ppo_clip=0.3,
        ppo_steps=5,
        tau=1.00,
        gamma=0.99,
        eta=0.0005,
        adaptive_penalty=False,
        kl_target=0.01,
        num_workers=4,
        seed=421,
):
    random.seed(seed)
    np.random.seed(seed)
    th.manual_seed(seed)

    def make_env():
        env = gym.make(env_name)
        env = ch.envs.ActionSpaceScaler(env)
        return env

    env = l2l.gym.AsyncVectorEnv([make_env for _ in range(num_workers)])
    env.seed(seed)
    env = ch.envs.ActionSpaceScaler(env)
    env = ch.envs.Torch(env)
    policy = DiagNormalPolicy(input_size=env.state_size,
                              output_size=env.action_size,
                              hiddens=[64, 64],
                              activation='tanh')
    meta_learner = l2l.algorithms.MAML(policy, lr=meta_lr)
    baseline = LinearValue(env.state_size, env.action_size)
    opt = optim.Adam(meta_learner.parameters(), lr=meta_lr)

    for iteration in range(num_iterations):
        iteration_reward = 0.0
        iteration_replays = []
        iteration_policies = []

        # Sample Trajectories
        for task_config in tqdm(env.sample_tasks(meta_bsz),
                                leave=False,
                                desc='Data'):
            clone = deepcopy(meta_learner)
            env.set_task(task_config)
            env.reset()
            task = ch.envs.Runner(env)
            task_replay = []
            task_policies = []

            # Fast Adapt
            for step in range(adapt_steps):
                for p in clone.parameters():
                    p.detach_().requires_grad_()
                task_policies.append(deepcopy(clone))
                train_episodes = task.run(clone, episodes=adapt_bsz)
                clone = fast_adapt_a2c(clone, train_episodes, adapt_lr,
                                       baseline, gamma, tau, first_order=True)
                task_replay.append(train_episodes)

            # Compute Validation Loss
            for p in clone.parameters():
                p.detach_().requires_grad_()
            task_policies.append(deepcopy(clone))
            valid_episodes = task.run(clone, episodes=adapt_bsz)
            task_replay.append(valid_episodes)
            iteration_reward += valid_episodes.reward().sum().item() / adapt_bsz
            iteration_replays.append(task_replay)
            iteration_policies.append(task_policies)

        # Print statistics
        print('\nIteration', iteration)
        adaptation_reward = iteration_reward / meta_bsz
        print('adaptation_reward', adaptation_reward)

        # ProMP meta-optimization
        for ppo_step in tqdm(range(ppo_steps), leave=False, desc='Optim'):
            promp_loss = 0.0
            kl_total = 0.0
            for task_replays, old_policies in zip(iteration_replays, iteration_policies):
                new_policy = meta_learner.clone()
                states = task_replays[0].state()
                actions = task_replays[0].action()
                rewards = task_replays[0].reward()
                dones = task_replays[0].done()
                next_states = task_replays[0].next_state()
                old_policy = old_policies[0]
                (old_density,
                 new_density,
                 old_log_probs,
                 new_log_probs) = precompute_quantities(states, actions,
                                                        old_policy, new_policy)
                advantages = compute_advantages(baseline, tau, gamma, rewards,
                                                dones, states, next_states)
                advantages = ch.normalize(advantages).detach()

                for step in range(adapt_steps):
                    # Compute KL penalty
                    kl_pen = kl_divergence(old_density, new_density).mean()
                    kl_total += kl_pen.item()

                    # Update the clone
                    surr_loss = trpo.policy_loss(new_log_probs, old_log_probs, advantages)
                    new_policy.adapt(surr_loss)

                    # Move to next adaptation step
                    states = task_replays[step + 1].state()
                    actions = task_replays[step + 1].action()
                    rewards = task_replays[step + 1].reward()
                    dones = task_replays[step + 1].done()
                    next_states = task_replays[step + 1].next_state()
                    old_policy = old_policies[step + 1]
                    (old_density,
                     new_density,
                     old_log_probs,
                     new_log_probs) = precompute_quantities(states, actions,
                                                            old_policy, new_policy)

                    # Compute clip loss
                    advantages = compute_advantages(baseline, tau, gamma, rewards,
                                                    dones, states, next_states)
                    advantages = ch.normalize(advantages).detach()
                    clip_loss = ppo.policy_loss(new_log_probs, old_log_probs,
                                                advantages, clip=ppo_clip)

                    # Combine into ProMP loss
                    promp_loss += clip_loss + eta * kl_pen

            kl_total /= meta_bsz * adapt_steps
            promp_loss /= meta_bsz * adapt_steps
            opt.zero_grad()
            promp_loss.backward(retain_graph=True)
            opt.step()

            # Adapt KL penalty based on desired target
            if adaptive_penalty:
                if kl_total < kl_target / 1.5:
                    eta /= 2.0
                elif kl_total > kl_target * 1.5:
                    eta *= 2.0
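# The ProMP loop above uses a precompute_quantities helper to obtain the old and new
# action densities plus the (detached) old log-probabilities for one adaptation step.
# A minimal sketch, assuming the policy exposes a .density(states) method as in the
# helpers sketched earlier; hypothetical, not the verbatim original.
def precompute_quantities(states, actions, old_policy, new_policy):
    old_density = old_policy.density(states)
    new_density = new_policy.density(states)
    # Old log-probs are treated as constants; new log-probs stay differentiable.
    old_log_probs = old_density.log_prob(actions).mean(dim=1, keepdim=True).detach()
    new_log_probs = new_density.log_prob(actions).mean(dim=1, keepdim=True)
    return old_density, new_density, old_log_probs, new_log_probs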
def main(
        experiment='dev',
        env_name='2DNavigation-v0',
        adapt_lr=0.1,
        meta_lr=0.01,
        adapt_steps=1,
        num_iterations=20,
        meta_bsz=10,
        adapt_bsz=10,
        tau=1.00,
        gamma=0.99,
        num_workers=1,
        seed=42,
):
    random.seed(seed)
    np.random.seed(seed)
    th.manual_seed(seed)

    def make_env():
        return gym.make(env_name)

    env = l2l.gym.AsyncVectorEnv([make_env for _ in range(num_workers)])
    env.seed(seed)
    env = ch.envs.Torch(env)
    policy = DiagNormalPolicy(env.state_size, env.action_size)
    meta_learner = l2l.algorithms.MAML(policy, lr=meta_lr)
    baseline = LinearValue(env.state_size, env.action_size)
    opt = optim.Adam(meta_learner.parameters(), lr=meta_lr)
    all_rewards = []

    for iteration in range(num_iterations):
        iteration_reward = 0.0
        iteration_replays = []
        iteration_policies = []
        policy.to('cpu')
        baseline.to('cpu')

        for task_config in tqdm(env.sample_tasks(meta_bsz),
                                leave=False,
                                desc='Data'):  # Samples a new config
            learner = meta_learner.clone()
            env.reset_task(task_config)
            env.reset()
            task = ch.envs.Runner(env)
            task_replay = []

            # Fast Adapt
            for step in range(adapt_steps):
                train_episodes = task.run(learner, episodes=adapt_bsz)
                learner = fast_adapt_a2c(learner, train_episodes, adapt_lr,
                                         baseline, gamma, tau, first_order=True)
                task_replay.append(train_episodes)

            # Compute Validation Loss
            valid_episodes = task.run(learner, episodes=adapt_bsz)
            task_replay.append(valid_episodes)
            iteration_reward += valid_episodes.reward().sum().item() / adapt_bsz
            iteration_replays.append(task_replay)
            iteration_policies.append(learner)

        # Print statistics
        print('\nIteration', iteration)
        adaptation_reward = iteration_reward / meta_bsz
        all_rewards.append(adaptation_reward)
        print('adaptation_reward', adaptation_reward)

        # PPO meta-optimization
        for ppo_step in tqdm(range(10), leave=False, desc='Optim'):
            ppo_loss = 0.0
            for task_replays, old_policy in zip(iteration_replays, iteration_policies):
                train_replays = task_replays[:-1]
                valid_replay = task_replays[-1]

                # Fast adapt new policy, starting from the current init
                new_policy = meta_learner.clone()
                for train_episodes in train_replays:
                    new_policy = fast_adapt_a2c(new_policy, train_episodes,
                                                adapt_lr, baseline, gamma, tau)

                # Compute PPO loss between old and new clones
                states = valid_replay.state()
                actions = valid_replay.action()
                rewards = valid_replay.reward()
                dones = valid_replay.done()
                next_states = valid_replay.next_state()
                old_log_probs = old_policy.log_prob(states, actions).detach()
                new_log_probs = new_policy.log_prob(states, actions)
                advantages = compute_advantages(baseline, tau, gamma, rewards,
                                                dones, states, next_states)
                advantages = ch.normalize(advantages).detach()
                ppo_loss += ppo.policy_loss(new_log_probs, old_log_probs,
                                            advantages, clip=0.1)

            ppo_loss /= meta_bsz
            opt.zero_grad()
            ppo_loss.backward()
            opt.step()
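# All of these scripts share DiagNormalPolicy and LinearValue. LinearValue appears to
# be cherry's linear state-value baseline (cherry.models.robotics.LinearValue), while
# DiagNormalPolicy is a small diagonal-Gaussian policy. A minimal sketch of such a
# policy, consistent with the .density()/.log_prob() calls used above; an illustrative
# assumption, not the verbatim original implementation.
import torch as th
from torch import nn
from torch.distributions import Normal


class DiagNormalPolicy(nn.Module):

    def __init__(self, input_size, output_size, hiddens=None, activation='relu'):
        super().__init__()
        hiddens = [100, 100] if hiddens is None else hiddens
        act = nn.Tanh if activation == 'tanh' else nn.ReLU
        layers = []
        last_size = input_size
        for size in hiddens:
            layers.append(nn.Linear(last_size, size))
            layers.append(act())
            last_size = size
        layers.append(nn.Linear(last_size, output_size))
        self.mean = nn.Sequential(*layers)
        # One state-independent log-std per action dimension.
        self.sigma = nn.Parameter(th.zeros(output_size))

    def density(self, state):
        loc = self.mean(state)
        scale = th.exp(self.sigma)
        return Normal(loc=loc, scale=scale)

    def log_prob(self, state, action):
        # Mean over action dimensions, giving the (batch, 1) shape used above.
        return self.density(state).log_prob(action).mean(dim=1, keepdim=True)

    def forward(self, state):
        # Sample an action for interaction with the environment.
        return self.density(state).sample()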