Example #1
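These snippets come from a meta-RL training/analysis codebase built on PyTorch, the cherry RL library (ch), the learn2learn MAML wrapper, and tqdm's trange; helpers such as DiagNormalPolicy, Runner, make_env, episode_mean_var, fast_adapt_*, meta_optimize_trpo and the evaluate_* functions are defined elsewhere in that codebase and are assumed to be importable.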
def measure_change_through_time(path, env_name, policy, rep_params):
    env = make_env(env_name, 1, rep_params['seed'], max_path_length=rep_params['max_path_length'])
    global metrics
    metrics = ['CCA']
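    # A single sanity-check task/episode, collected with the reference policy, is replayed through every checkpoint below.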

    sanity_task = env.sample_tasks(1)

    with torch.no_grad():
        env.set_task(sanity_task[0])
        env.seed(rep_params['seed'])
        env.reset()
        env_task = Runner(env)
        sanity_ep = env_task.run(policy, episodes=1)

    init_change_m = defaultdict(list)
    init_change_v = defaultdict(list)
    adapt_change_m = defaultdict(list)
    adapt_change_v = defaultdict(list)
    checkpoints = path + '/model_checkpoints/'
    i = 0

    file_list = os.listdir(checkpoints)
    file_list = [file for file in file_list if 'baseline' not in file]
    models_list = {}
    for file in file_list:
        n_file = file.split('_')[-1]
        n_file = n_file.split('.')[0]
        n_file = int(n_file)
        models_list[n_file] = f'model_{n_file}.pt'

    prev_policy = policy
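    # Walk through the checkpoints in training order, comparing layer-6 representations on the fixed sanity episode.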
    for key in sorted(models_list.keys()):
        model_chckpnt = models_list[key]
        if i > 40:  # limit how many checkpoints are inspected
            break
        i += 1

        print(f'Loading {model_chckpnt} ...')
        chckpnt_policy = DiagNormalPolicy(9, 4)
        chckpnt_policy.load_state_dict(torch.load(os.path.join(checkpoints, model_chckpnt)))
        chckpnt_policy = MAML(chckpnt_policy, lr=rep_params['inner_lr'])

        # Similarity between the reference policy passed in and this checkpoint
        mean, variance = episode_mean_var(sanity_ep, policy, chckpnt_policy, layer=6)
        # Similarity between the previous checkpoint and this one (change per saved step)
        a_mean, a_variance = episode_mean_var(sanity_ep, prev_policy, chckpnt_policy, layer=6)
        init_change_m['CCA'] += [mean['CCA']]
        init_change_v['CCA'] += [variance['CCA']]
        adapt_change_m['CCA'] += [a_mean['CCA']]
        adapt_change_v['CCA'] += [a_variance['CCA']]

        prev_policy = chckpnt_policy

    for metric in metrics:
        plot_sim_across_steps(init_change_m[metric], init_change_v[metric], metric=metric,
                              title='Similarity between init and adapted (in %)')

    for metric in metrics:
        difference = [1 - x for x in adapt_change_m[metric]]
        plot_sim_across_steps(difference, adapt_change_v[metric], metric=metric,
                              title='Representation difference after each step (in %)')
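A minimal invocation sketch for the function above; the experiment directory, environment name, and the concrete values are assumptions, while the rep_params keys ('seed', 'max_path_length', 'inner_lr') are the ones the function actually reads:

# Hypothetical usage -- paths and values are placeholders
rep_params = {'seed': 42, 'max_path_length': 150, 'inner_lr': 0.1}
policy = DiagNormalPolicy(9, 4)
policy.load_state_dict(torch.load('path/to/experiment/model.pt'))
policy = MAML(policy, lr=rep_params['inner_lr'])
measure_change_through_time('path/to/experiment', 'ML10', policy, rep_params)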
Example #2
    def run(self, env, device):

        baseline = ch.models.robotics.LinearValue(env.state_size,
                                                  env.action_size)
        policy = DiagNormalPolicy(env.state_size, env.action_size)

        self.log_model(policy, device, input_shape=(1, env.state_size))
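        # Note: this loop never updates the policy; it only rolls out the initial policy on sampled tasks and logs average returns.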

        t = trange(self.params['num_iterations'], desc='Iteration', position=0)
        try:
            for iteration in t:

                iter_reward = 0.0

                task_list = env.sample_tasks(self.params['batch_size'])
                for task_i in trange(len(task_list),
                                     leave=False,
                                     desc='Task',
                                     position=0):
                    task = task_list[task_i]
                    env.set_task(task)
                    env.reset()
                    task = Runner(env)

                    episodes = task.run(policy, episodes=self.params['n_episodes'])
                    task_reward = episodes.reward().sum().item() / self.params['n_episodes']

                    iter_reward += task_reward

                # Log
                average_return = iter_reward / self.params['batch_size']
                metrics = {'average_return': average_return}

                t.set_postfix(metrics)
                self.log_metrics(metrics)

                if iteration % self.params['save_every'] == 0:
                    self.save_model_checkpoint(policy, str(iteration + 1))
                    self.save_model_checkpoint(
                        baseline, 'baseline_' + str(iteration + 1))

        # Allow training to be interrupted manually; evaluation & saving still run below
        except KeyboardInterrupt:
            print(
                '\nManually stopped training! Start evaluation & saving...\n')
            self.logger['manually_stopped'] = True
            self.params['num_iterations'] = iteration

        self.save_model(policy)
        self.save_model(baseline, name='baseline')

        self.logger['elapsed_time'] = str(round(t.format_dict['elapsed'],
                                                2)) + ' sec'
        # Evaluate on new test tasks
        policy = MAML(policy, lr=self.params['inner_lr'])
        self.logger['test_reward'] = evaluate_ppo(env_name, policy, baseline,
                                                  self.params)
        self.log_metrics({'test_reward': self.logger['test_reward']})
        self.save_logs_to_file()
Example #3
    def run(self, env, device):

        set_device(device)
        baseline = ch.models.robotics.LinearValue(env.state_size,
                                                  env.action_size)
        policy = DiagNormalPolicy(env.state_size, env.action_size)

        self.log_model(policy, device, input_shape=(1, env.state_size))

        t = trange(self.params['num_iterations'], desc='Iteration', position=0)
        try:
            for iteration in t:

                iter_loss = 0.0
                iter_reward = 0.0
                # iter_success_per_task = {}
                iter_replays = []
                iter_policies = []
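                # One adapted learner and its replay are collected per task; the TRPO meta-step below consumes them jointly.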

                task_list = env.sample_tasks(self.params['meta_batch_size'])

                for task_i in trange(len(task_list),
                                     leave=False,
                                     desc='Task',
                                     position=0):
                    task = task_list[task_i]
                    # task_id = f'task_{task["task"]}'

                    learner = deepcopy(policy)
                    env.set_task(task)
                    env.reset()
                    task = Runner(env, extra_info=extra_info)

                    # Adapt
                    learner, eval_loss, task_replay, task_rew, task_suc = fast_adapt_trpo(
                        task, learner, baseline, self.params, first_order=True)

                    # Calculate average success rate of support episodes
                    # task_adapt_suc = get_ep_successes(task_replay[0]) / self.params['adapt_batch_size']
                    # iter_success_per_task[task_id + '_adapt'] = task_adapt_suc
                    # iter_success_per_task[task_id] = task_suc
                    iter_reward += task_rew
                    iter_loss += eval_loss.item()
                    iter_replays.append(task_replay)
                    iter_policies.append(learner)

                # Log
                average_return = iter_reward / self.params['meta_batch_size']
                average_loss = iter_loss / self.params['meta_batch_size']
                metrics = {
                    'average_return': average_return,
                    'loss': average_loss
                }
                t.set_postfix(metrics)
                # metrics.update(iter_success_per_task)
                self.log_metrics(metrics)

                # Meta-optimize
                meta_optimize_trpo(self.params, policy, baseline, iter_replays,
                                   iter_policies)

                if iteration % self.params['save_every'] == 0:
                    self.save_model_checkpoint(policy, str(iteration + 1))
                    self.save_model_checkpoint(
                        baseline, 'baseline_' + str(iteration + 1))

        # Allow training to be interrupted manually; evaluation & saving still run below
        except KeyboardInterrupt:
            print(
                '\nManually stopped training! Start evaluation & saving...\n')
            self.logger['manually_stopped'] = True
            self.params['num_iterations'] = iteration

        self.save_model(policy)
        self.save_model(baseline, name='baseline')

        self.logger['elapsed_time'] = str(round(t.format_dict['elapsed'],
                                                2)) + ' sec'
        # Evaluate on new test tasks
        self.logger['test_reward'] = evaluate_trpo(env_name, policy, baseline,
                                                   eval_params)
        self.log_metrics({'test_reward': self.logger['test_reward']})
        self.save_logs_to_file()
Example #4
    def run(self, env, device):

        set_device(device)
        baseline = ch.models.robotics.LinearValue(env.state_size,
                                                  env.action_size)
        policy = DiagNormalPolicy(env.state_size, env.action_size)
        policy = MAML(policy, lr=self.params['inner_lr'])
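        # The MAML wrapper lets each task adapt a clone of the policy with differentiable inner-loop updates.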

        meta_optimizer = torch.optim.Adam(policy.parameters(),
                                          lr=self.params['outer_lr'])

        self.log_model(policy, device, input_shape=(1, env.state_size))

        t = trange(self.params['num_iterations'], desc='Iteration', position=0)
        try:
            for iteration in t:
                meta_optimizer.zero_grad()

                iter_reward = 0.0
                iter_loss = 0.0

                task_list = env.sample_tasks(self.params['meta_batch_size'])

                for task_i in trange(len(task_list),
                                     leave=False,
                                     desc='Task',
                                     position=0):
                    task = task_list[task_i]

                    learner = policy.clone()
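                    # clone() preserves the computation graph, so the task's outer loss can back-propagate to the meta-parameters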
                    env.set_task(task)
                    env.reset()
                    task = Runner(env, extra_info=extra_info)

                    # Adapt
                    eval_loss, task_rew, task_suc = fast_adapt_ppo(
                        task, learner, baseline, self.params)

                    iter_reward += task_rew
                    iter_loss += eval_loss
                    # print(f'\tTask {task_i} reward: {task_rew} | Loss : {eval_loss.item()}')

                # Log
                average_return = iter_reward / self.params['meta_batch_size']
                av_loss = iter_loss / self.params['meta_batch_size']

                # print(f'Iter {iteration} average reward: {average_return} | Loss : {av_loss}')
                metrics = {
                    'average_return': average_return,
                    'loss': av_loss.item()
                }

                t.set_postfix(metrics)
                self.log_metrics(metrics)

                # Meta-optimize: back-propagate the averaged meta-loss through the inner-loop updates and step the outer optimizer
                av_loss.backward()
                meta_optimizer.step()

                if iteration % self.params['save_every'] == 0:
                    self.save_model_checkpoint(policy.module,
                                               str(iteration + 1))
                    self.save_model_checkpoint(
                        baseline, 'baseline_' + str(iteration + 1))

        # Allow training to be interrupted manually; evaluation & saving still run below
        except KeyboardInterrupt:
            print(
                '\nManually stopped training! Start evaluation & saving...\n')
            self.logger['manually_stopped'] = True
            self.params['num_iterations'] = iteration

        self.save_model(policy.module)
        self.save_model(baseline, name='baseline')

        self.logger['elapsed_time'] = str(round(t.format_dict['elapsed'],
                                                2)) + ' sec'
        # Evaluate on new test tasks
        self.logger['test_reward'] = evaluate_ppo(env_name, policy, baseline,
                                                  eval_params)
        self.log_metrics({'test_reward': self.logger['test_reward']})
        self.save_logs_to_file()
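A minimal driver sketch for a trainer exposing this run(env, device) method. The MetaPPOTrainer class name and all hyperparameter values are assumptions (only the keys read by the loop above are known, and fast_adapt_ppo presumably reads more); make_env follows the positional signature used in Example #1, and module-level names such as extra_info, env_name and eval_params are assumed to be set by the surrounding script.

# Hypothetical driver -- trainer class name and values are placeholders
params = {'inner_lr': 0.1, 'outer_lr': 3e-4, 'meta_batch_size': 20,
          'num_iterations': 500, 'save_every': 25}
env = make_env('ML10', 1, 42)            # (env_name, n_workers, seed), as in Example #1
trainer = MetaPPOTrainer(params)         # hypothetical trainer class wrapping the method above
trainer.run(env, device=torch.device('cpu'))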
Example #5
def run():
    try:
        with open(path + '/logger.json', 'r') as f:
            params = json.load(f)['config']
    except FileNotFoundError:
        print('WARNING CONFIG NOT FOUND. Using default parameters')
        params = dict()
        params['inner_lr'] = 0.1
        params['ppo_epochs'] = 3
        params['ppo_clip_ratio'] = 0.1
        params['tau'] = 1.0
        params['gamma'] = 0.99
        params['seed'] = 42

    eval_params['seed'] = params['seed']
    cl_params['seed'] = params['seed']
    rep_params['seed'] = params['seed']
    algo = params['algo']
    env_name = params['dataset']

    anil = 'anil' in algo

    if 'maml' in algo or 'anil' in algo:
        ml_algo, rl_algo = algo.split('_')[:2]
    elif algo in ('ppo', 'random'):
        ml_algo = ''
        rl_algo = 'ppo'
    else:
        ml_algo = ''
        rl_algo = algo.split('_')[1]

    cl_params['algo'] = rl_algo
    rep_params['algo'] = rl_algo
    cl_params['anil'] = anil
    rep_params['anil'] = anil
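    # Observation/action sizes used by this codebase: 9/4 for Meta-World ('ML') envs, 2/2 otherwise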
    if 'ML' in env_name:
        state_size = 9
        action_size = 4
        rep_params['extra_info'], cl_params['extra_info'] = True, True
    else:
        state_size = 2
        action_size = 2
        rep_params['extra_info'], cl_params['extra_info'] = False, False

    if checkpoint is None:
        baseline_path = path + '/baseline.pt'
        if ml_algo == 'anil':
            head_path = path + '/head.pt'
            body_path = path + '/body.pt'
        else:
            policy_path = path + '/model.pt'
    else:
        baseline_path = path + f'/model_checkpoints/model_baseline_{checkpoint}.pt'
        if ml_algo == 'maml':
            policy_path = path + f'/model_checkpoints/model_{checkpoint}.pt'
        else:
            head_path = path + f'/model_checkpoints/model_head_{checkpoint}.pt'
            body_path = path + f'/model_checkpoints/model_body_{checkpoint}.pt'

    device = torch.device('cpu')
    random.seed(params['seed'])
    np.random.seed(params['seed'])
    torch.manual_seed(params['seed'])

    baseline = ch.models.robotics.LinearValue(state_size, action_size)
    baseline.load_state_dict(torch.load(baseline_path))
    baseline.to(device)

    if ml_algo == 'anil':
        policy = DiagNormalPolicyANIL(state_size, action_size,
                                      params['fc_neurons'])
        policy.head.load_state_dict(torch.load(head_path))
        policy.body.load_state_dict(torch.load(body_path))
    else:
        policy = DiagNormalPolicy(state_size, action_size)
        policy.load_state_dict(torch.load(policy_path))

    policy = MAML(policy, lr=eval_params['inner_lr'])  # enable inner-loop adaptation at meta-test time
    policy.to(device)

    print(f'Testing {ml_algo}-{rl_algo} on {env_name}')
    if EVALUATE:
        t_test = 'train' if test_on_train else 'test'
        test_rewards, av_test_rew, av_test_suc, res_per_task = evaluate(
            rl_algo,
            env_name,
            policy,
            baseline,
            eval_params,
            anil=anil,
            render=render,
            test_on_train=test_on_train,
            each3=each3)
        print(f'Average meta-testing reward: {av_test_rew}')
        print(f'Average meta-testing success rate: {av_test_suc * 100}%')

        if save_res:
            with open(f"{params['algo']}_{t_test}_{params['seed']}.json",
                      'w') as f:
                f.write(json.dumps(res_per_task))
        # with open(f"maml_trpo_test_{i}.json") as f:
        #     res_per_task = json.loads(f.read())

        for key, val in res_per_task.items():
            print(f'{key}: \n\tRewards: {val[::2]}\n\tSuccess: {val[1::2]}\n')

        bar_plot_ml10(res_per_task,
                      f"{params['algo']}_{t_test}_{params['seed']}.png")

    if RUN_CL:
        print('Running Continual Learning experiment...')
        run_cl_rl_exp(path,
                      env_name,
                      policy,
                      baseline,
                      cl_params,
                      workers,
                      test_on_train=test_on_train)
    if RUN_RC:
        print('Running Rep Change experiment...')
        run_rep_rl_exp(path, env_name, policy, baseline, rep_params)