def measure_change_through_time(path, env_name, policy, rep_params):
    env = make_env(env_name, 1, rep_params['seed'], max_path_length=rep_params['max_path_length'])
    global metrics
    metrics = ['CCA']

    # Collect one sanity-check episode from a fixed task to probe representations with
    sanity_task = env.sample_tasks(1)
    with torch.no_grad():
        env.set_task(sanity_task[0])
        env.seed(rep_params['seed'])
        env.reset()
        env_task = Runner(env)
        sanity_ep = env_task.run(policy, episodes=1)

    init_change_m = defaultdict(list)
    init_change_v = defaultdict(list)
    adapt_change_m = defaultdict(list)
    adapt_change_v = defaultdict(list)

    # Gather the saved policy checkpoints, ignoring the baseline checkpoints
    checkpoints = path + '/model_checkpoints/'
    file_list = [file for file in os.listdir(checkpoints) if 'baseline' not in file]
    models_list = {}
    for file in file_list:
        n_file = int(file.split('_')[-1].split('.')[0])
        models_list[n_file] = f'model_{n_file}.pt'

    prev_policy = policy
    i = 0
    for key in sorted(models_list.keys()):
        model_chckpnt = models_list[key]
        if i > 40:  # Compare at most the first 41 checkpoints
            break
        i += 1

        print(f'Loading {model_chckpnt} ...')
        chckpnt_policy = DiagNormalPolicy(9, 4)
        chckpnt_policy.load_state_dict(torch.load(os.path.join(checkpoints, model_chckpnt)))
        chckpnt_policy = MAML(chckpnt_policy, lr=rep_params['inner_lr'])

        # CCA similarity of each checkpoint to the reference policy and to the previous checkpoint
        mean, variance = episode_mean_var(sanity_ep, policy, chckpnt_policy, layer=6)
        a_mean, a_variance = episode_mean_var(sanity_ep, prev_policy, chckpnt_policy, layer=6)
        init_change_m['CCA'] += [mean['CCA']]
        init_change_v['CCA'] += [variance['CCA']]
        adapt_change_m['CCA'] += [a_mean['CCA']]
        adapt_change_v['CCA'] += [a_variance['CCA']]

        prev_policy = chckpnt_policy

    for metric in metrics:
        plot_sim_across_steps(init_change_m[metric], init_change_v[metric], metric=metric,
                              title='Similarity between init and adapted (in %)')
    for metric in metrics:
        difference = [1 - x for x in adapt_change_m[metric]]
        plot_sim_across_steps(difference, adapt_change_v[metric], metric=metric,
                              title='Representation difference after each step (in %)')
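# Hypothetical call-site sketch for measure_change_through_time (not part of the
# original code): the rep_params keys are inferred from the accesses above; the
# experiment path, environment name, and values are illustrative placeholders.
#
#   rep_params = {'seed': 42, 'max_path_length': 150, 'inner_lr': 0.1}
#   base = DiagNormalPolicy(9, 4)
#   base.load_state_dict(torch.load('outputs/maml_ppo/model.pt'))
#   policy = MAML(base, lr=rep_params['inner_lr'])
#   measure_change_through_time('outputs/maml_ppo', 'ML10', policy, rep_params)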
def run(self, env, device):
    baseline = ch.models.robotics.LinearValue(env.state_size, env.action_size)
    policy = DiagNormalPolicy(env.state_size, env.action_size)
    self.log_model(policy, device, input_shape=(1, env.state_size))

    t = trange(self.params['num_iterations'], desc='Iteration', position=0)
    try:
        for iteration in t:
            iter_reward = 0.0

            task_list = env.sample_tasks(self.params['batch_size'])
            for task_i in trange(len(task_list), leave=False, desc='Task', position=0):
                task = task_list[task_i]

                env.set_task(task)
                env.reset()
                task = Runner(env)

                episodes = task.run(policy, episodes=self.params['n_episodes'])
                task_reward = episodes.reward().sum().item() / self.params['n_episodes']
                iter_reward += task_reward

            # Log
            average_return = iter_reward / self.params['batch_size']
            metrics = {'average_return': average_return}
            t.set_postfix(metrics)
            self.log_metrics(metrics)

            if iteration % self.params['save_every'] == 0:
                self.save_model_checkpoint(policy, str(iteration + 1))
                self.save_model_checkpoint(baseline, 'baseline_' + str(iteration + 1))

    # Support safely interrupting training manually
    except KeyboardInterrupt:
        print('\nManually stopped training! Start evaluation & saving...\n')
        self.logger['manually_stopped'] = True
        self.params['num_iterations'] = iteration

    self.save_model(policy)
    self.save_model(baseline, name='baseline')

    self.logger['elapsed_time'] = str(round(t.format_dict['elapsed'], 2)) + ' sec'

    # Evaluate on new test tasks
    policy = MAML(policy, lr=self.params['inner_lr'])
    self.logger['test_reward'] = evaluate_ppo(env_name, policy, baseline, self.params)
    self.log_metrics({'test_reward': self.logger['test_reward']})
    self.save_logs_to_file()
def run(self, env, device):
    set_device(device)
    baseline = ch.models.robotics.LinearValue(env.state_size, env.action_size)
    policy = DiagNormalPolicy(env.state_size, env.action_size)
    self.log_model(policy, device, input_shape=(1, env.state_size))

    t = trange(self.params['num_iterations'], desc='Iteration', position=0)
    try:
        for iteration in t:
            iter_loss = 0.0
            iter_reward = 0.0
            # iter_success_per_task = {}
            iter_replays = []
            iter_policies = []

            task_list = env.sample_tasks(self.params['meta_batch_size'])
            for task_i in trange(len(task_list), leave=False, desc='Task', position=0):
                task = task_list[task_i]
                # task_id = f'task_{task["task"]}'

                learner = deepcopy(policy)
                env.set_task(task)
                env.reset()
                task = Runner(env, extra_info=extra_info)

                # Adapt
                learner, eval_loss, task_replay, task_rew, task_suc = fast_adapt_trpo(
                    task, learner, baseline, self.params, first_order=True)

                # Calculate average success rate of support episodes
                # task_adapt_suc = get_ep_successes(task_replay[0]) / self.params['adapt_batch_size']
                # iter_success_per_task[task_id + '_adapt'] = task_adapt_suc
                # iter_success_per_task[task_id] = task_suc

                iter_reward += task_rew
                iter_loss += eval_loss.item()
                iter_replays.append(task_replay)
                iter_policies.append(learner)

            # Log
            average_return = iter_reward / self.params['meta_batch_size']
            average_loss = iter_loss / self.params['meta_batch_size']
            metrics = {'average_return': average_return, 'loss': average_loss}
            t.set_postfix(metrics)
            # metrics.update(iter_success_per_task)
            self.log_metrics(metrics)

            # Meta-optimize
            meta_optimize_trpo(self.params, policy, baseline, iter_replays, iter_policies)

            if iteration % self.params['save_every'] == 0:
                self.save_model_checkpoint(policy, str(iteration + 1))
                self.save_model_checkpoint(baseline, 'baseline_' + str(iteration + 1))

    # Support safely interrupting training manually
    except KeyboardInterrupt:
        print('\nManually stopped training! Start evaluation & saving...\n')
        self.logger['manually_stopped'] = True
        self.params['num_iterations'] = iteration

    self.save_model(policy)
    self.save_model(baseline, name='baseline')

    self.logger['elapsed_time'] = str(round(t.format_dict['elapsed'], 2)) + ' sec'

    # Evaluate on new test tasks
    self.logger['test_reward'] = evaluate_trpo(env_name, policy, baseline, eval_params)
    self.log_metrics({'test_reward': self.logger['test_reward']})
    self.save_logs_to_file()
def run(self, env, device):
    set_device(device)
    baseline = ch.models.robotics.LinearValue(env.state_size, env.action_size)
    policy = DiagNormalPolicy(env.state_size, env.action_size)
    policy = MAML(policy, lr=self.params['inner_lr'])
    meta_optimizer = torch.optim.Adam(policy.parameters(), lr=self.params['outer_lr'])

    self.log_model(policy, device, input_shape=(1, env.state_size))

    t = trange(self.params['num_iterations'], desc='Iteration', position=0)
    try:
        for iteration in t:
            meta_optimizer.zero_grad()
            iter_reward = 0.0
            iter_loss = 0.0

            task_list = env.sample_tasks(self.params['meta_batch_size'])
            for task_i in trange(len(task_list), leave=False, desc='Task', position=0):
                task = task_list[task_i]

                learner = policy.clone()
                env.set_task(task)
                env.reset()
                task = Runner(env, extra_info=extra_info)

                # Adapt
                eval_loss, task_rew, task_suc = fast_adapt_ppo(task, learner, baseline, self.params)

                iter_reward += task_rew
                iter_loss += eval_loss
                # print(f'\tTask {task_i} reward: {task_rew} | Loss : {eval_loss.item()}')

            # Log
            average_return = iter_reward / self.params['meta_batch_size']
            av_loss = iter_loss / self.params['meta_batch_size']
            # print(f'Iter {iteration} average reward: {average_return} | Loss : {av_loss}')
            metrics = {'average_return': average_return, 'loss': av_loss.item()}
            t.set_postfix(metrics)
            self.log_metrics(metrics)

            # Meta-optimize: back-propagate through the accumulated adaptation losses and step
            av_loss.backward()
            meta_optimizer.step()

            if iteration % self.params['save_every'] == 0:
                self.save_model_checkpoint(policy.module, str(iteration + 1))
                self.save_model_checkpoint(baseline, 'baseline_' + str(iteration + 1))

    # Support safely interrupting training manually
    except KeyboardInterrupt:
        print('\nManually stopped training! Start evaluation & saving...\n')
        self.logger['manually_stopped'] = True
        self.params['num_iterations'] = iteration

    self.save_model(policy.module)
    self.save_model(baseline, name='baseline')

    self.logger['elapsed_time'] = str(round(t.format_dict['elapsed'], 2)) + ' sec'

    # Evaluate on new test tasks
    self.logger['test_reward'] = evaluate_ppo(env_name, policy, baseline, eval_params)
    self.log_metrics({'test_reward': self.logger['test_reward']})
    self.save_logs_to_file()
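# Minimal self-contained sketch of the MAML outer-loop pattern used in the PPO
# runner above (clone -> adapt -> accumulate post-adaptation loss -> backward ->
# meta-optimizer step), shown on a toy regression module with the same MAML
# wrapper. The toy data, sizes, and learning rates are illustrative assumptions,
# not values from this project; this helper is a reference only and is never called.
def _maml_outer_loop_sketch():
    model = torch.nn.Linear(4, 1)
    maml = MAML(model, lr=0.1)                               # inner-loop (adaptation) lr
    meta_opt = torch.optim.Adam(maml.parameters(), lr=1e-3)  # outer-loop lr

    for _ in range(10):                                      # meta-iterations
        meta_opt.zero_grad()
        meta_loss = 0.0
        for _task in range(4):                               # toy meta-batch of tasks
            learner = maml.clone()                           # differentiable copy of the model
            x, y = torch.randn(8, 4), torch.randn(8, 1)      # stand-in for task data / episodes
            learner.adapt(((learner(x) - y) ** 2).mean())    # inner-loop adaptation step
            meta_loss = meta_loss + ((learner(x) - y) ** 2).mean()  # post-adaptation loss
        (meta_loss / 4).backward()                           # backprop through the clones
        meta_opt.step()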
def run():
    try:
        with open(path + '/logger.json', 'r') as f:
            params = json.load(f)['config']
    except FileNotFoundError:
        print('WARNING: CONFIG NOT FOUND. Using default parameters')
        params = dict()
        params['inner_lr'] = 0.1
        params['ppo_epochs'] = 3
        params['ppo_clip_ratio'] = 0.1
        params['tau'] = 1.0
        params['gamma'] = 0.99
        params['seed'] = 42

    eval_params['seed'] = params['seed']
    cl_params['seed'] = params['seed']
    rep_params['seed'] = params['seed']

    algo = params['algo']
    env_name = params['dataset']

    anil = 'anil' in algo
    if 'maml' in algo or 'anil' in algo:
        ml_algo = params['algo'].split('_')[0]
        rl_algo = params['algo'].split('_')[1]
    elif 'ppo' == algo or 'random' == algo:
        ml_algo = ''
        rl_algo = 'ppo'
    else:
        ml_algo = ''
        rl_algo = params['algo'].split('_')[1]

    cl_params['algo'] = rl_algo
    rep_params['algo'] = rl_algo
    cl_params['anil'] = anil
    rep_params['anil'] = anil

    if 'ML' in env_name:
        state_size = 9
        action_size = 4
        rep_params['extra_info'], cl_params['extra_info'] = True, True
    else:
        state_size = 2
        action_size = 2
        rep_params['extra_info'], cl_params['extra_info'] = False, False

    # Resolve model paths, either from the final saved models or from a specific checkpoint
    if checkpoint is None:
        baseline_path = path + '/baseline.pt'
        if ml_algo == 'anil':
            head_path = path + '/head.pt'
            body_path = path + '/body.pt'
        else:
            policy_path = path + '/model.pt'
    else:
        baseline_path = path + f'/model_checkpoints/model_baseline_{checkpoint}.pt'
        if ml_algo == 'maml':
            policy_path = path + f'/model_checkpoints/model_{checkpoint}.pt'
        else:
            head_path = path + f'/model_checkpoints/model_head_{checkpoint}.pt'
            body_path = path + f'/model_checkpoints/model_body_{checkpoint}.pt'

    device = torch.device('cpu')
    random.seed(params['seed'])
    np.random.seed(params['seed'])
    torch.manual_seed(params['seed'])

    baseline = ch.models.robotics.LinearValue(state_size, action_size)
    baseline.load_state_dict(torch.load(baseline_path))
    baseline.to(device)

    if ml_algo == 'anil':
        policy = DiagNormalPolicyANIL(state_size, action_size, params['fc_neurons'])
        policy.head.load_state_dict(torch.load(head_path))
        policy.body.load_state_dict(torch.load(body_path))
    else:
        policy = DiagNormalPolicy(state_size, action_size)
        policy.load_state_dict(torch.load(policy_path))

    policy = MAML(policy, lr=eval_params['inner_lr'])
    policy.to(device)

    print(f'Testing {ml_algo}-{rl_algo} on {env_name}')
    if EVALUATE:
        t_test = 'train' if test_on_train else 'test'
        test_rewards, av_test_rew, av_test_suc, res_per_task = evaluate(
            rl_algo, env_name, policy, baseline, eval_params,
            anil=anil, render=render, test_on_train=test_on_train, each3=each3)
        print(f'Average meta-testing reward: {av_test_rew}')
        print(f'Average meta-testing success rate: {av_test_suc * 100}%')

        if save_res:
            with open(f"{params['algo']}_{t_test}_{params['seed']}.json", 'w') as f:
                f.write(json.dumps(res_per_task))

        # with open(f"maml_trpo_test_{i}.json") as f:
        #     res_per_task = json.loads(f.read())

        for key, val in res_per_task.items():
            print(f'{key}: \n\tRewards: {val[::2]}\n\tSuccess: {val[1::2]}\n')

        bar_plot_ml10(res_per_task, f"{params['algo']}_{t_test}_{params['seed']}.png")

    if RUN_CL:
        print('Running Continual Learning experiment...')
        run_cl_rl_exp(path, env_name, policy, baseline, cl_params, workers, test_on_train=test_on_train)
    if RUN_RC:
        print('Running Rep Change experiment...')
        run_rep_rl_exp(path, env_name, policy, baseline, rep_params)
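# Hypothetical module-level configuration that this entry point expects to find
# (the names are taken from the references inside run() above; every value shown
# is a placeholder, not a setting from the original project):
#
#   path = 'outputs/maml_trpo_ML10'    # experiment folder containing logger.json
#   checkpoint = None                  # or the iteration number of a saved checkpoint
#   EVALUATE, RUN_CL, RUN_RC = True, False, False
#   test_on_train, render, each3, save_res = False, False, False, True
#   workers = 1
#   eval_params = {'inner_lr': 0.1}    # plus whatever evaluate() expects
#   cl_params, rep_params = {}, {}     # filled in further by run()
#
# if __name__ == '__main__':
#     run()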