Example #1
def main(args):
    with open(args.config, 'r') as f:
        config = json.load(f)

    if args.seed is not None:
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    env = gym.make(config['env-name'], **config['env-kwargs'])
    env.close()

    # Policy
    policy = get_policy_for_env(env,
                                hidden_sizes=config['hidden-sizes'],
                                nonlinearity=config['nonlinearity'])
    with open(args.policy, 'rb') as f:
        state_dict = torch.load(f, map_location=torch.device(args.device))
        # Load the trained model weights
        policy.load_state_dict(state_dict)
    policy.share_memory()

    # Baseline
    baseline = LinearFeatureBaseline(get_input_size(env))

    # Sampler
    sampler = MultiTaskSampler(config['env-name'],
                               env_kwargs=config['env-kwargs'],
                               batch_size=config['fast-batch-size'],
                               policy=policy,
                               baseline=baseline,
                               env=env,
                               seed=args.seed,
                               num_workers=args.num_workers)

    logs = {'tasks': []}
    train_returns, valid_returns = [], []
    # Test phase: sample tasks and adapt the policy on each one (inner-loop updates only)
    for batch in trange(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        train_episodes, valid_episodes = sampler.sample(
            tasks,
            num_steps=config['num-steps'],
            fast_lr=config['fast-lr'],
            gamma=config['gamma'],
            gae_lambda=config['gae-lambda'],
            device=args.device)

        logs['tasks'].extend(tasks)
        train_returns.append(get_returns(train_episodes[0]))
        valid_returns.append(get_returns(valid_episodes))
        # definition of get_returns
        # def get_returns(episodes):
        #     return to_numpy([episode.rewards.sum(dim=0) for episode in episodes])

    logs['train_returns'] = np.concatenate(train_returns, axis=0)
    logs['valid_returns'] = np.concatenate(valid_returns, axis=0)

    with open(args.output, 'wb') as f:
        np.savez(f, **logs)
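These snippets omit their imports and command-line parsing. Below is a minimal sketch of what Example #1 assumes, inferred from the attributes it reads off args and the helpers it calls; the default values and the commented-out import paths are assumptions, not part of the original script.

import argparse
import json

import gym
import numpy as np
import torch
from tqdm import trange

# Project-specific helpers used throughout these examples (paths are assumed):
# from maml_rl.baseline import LinearFeatureBaseline
# from maml_rl.samplers import MultiTaskSampler
# from maml_rl.utils.helpers import get_policy_for_env, get_input_size
# from maml_rl.utils.reinforcement_learning import get_returns

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Evaluate a trained MAML policy')
    parser.add_argument('--config', type=str, required=True)     # JSON config used for training
    parser.add_argument('--policy', type=str, required=True)     # saved policy state_dict (.th)
    parser.add_argument('--output', type=str, required=True)     # where to np.savez the returns
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--num-batches', type=int, default=10)
    parser.add_argument('--meta-batch-size', type=int, default=20)
    parser.add_argument('--num-workers', type=int, default=1)
    parser.add_argument('--device', type=str, default='cpu')
    main(parser.parse_args())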
Example #2
def test_init(env_name, num_workers):
    batch_size = 10
    # Environment
    env = gym.make(env_name)
    env.close()
    # Policy and Baseline
    policy = get_policy_for_env(env)
    baseline = LinearFeatureBaseline(get_input_size(env))

    sampler = MultiTaskSampler(env_name,
                               {}, # env_kwargs
                               batch_size,
                               policy,
                               baseline,
                               num_workers=num_workers)
    sampler.close()
Example #3
File: main.py  Project: imhgchoi/MAML-RL
def main(args, config):

    set_random_seed(args)

    # Environment
    env = get_environment(args, config)

    # Policy & Baseline
    policy = get_policy_for_env(args, env, hidden_sizes=config['hidden-sizes'], 
                                           nonlinearity=config['nonlinearity'])
    policy.share_memory()  # this is done to share memory across processes for multiprocessing
    
    baseline = LinearFeatureBaseline(reduce(mul, env.observation_space.shape, 1))

    # Sampler
    sampler = MultiTaskSampler(config['env-name'],
                               env_kwargs=config.get('env-kwargs', {}),
                               batch_size=config['fast-batch-size'],
                               policy=policy,
                               baseline=baseline,
                               env=env,
                               seed=args.seed,
                               num_workers=args.num_workers)

    # Meta Model
    metalearner = MAMLTRPO(policy,
                           fast_lr=config['fast-lr'],
                           first_order=config['first-order'],
                           device=args.device)

    # Solver 
    solver = Solver(args, config, policy, sampler, metalearner)
    solver.train(args, config)
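Example #3 flattens the observation space by hand instead of calling get_input_size; this needs functools.reduce and operator.mul, which the excerpt does not show. For reference, the expression computes the product of all observation dimensions, which is presumably what the get_input_size helper used by the other examples returns as well:

from functools import reduce
from operator import mul

# e.g. an observation space with shape (3, 4) gives an input size of 12
input_size = reduce(mul, env.observation_space.shape, 1)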
Example #4
def test_sample(env_name, batch_size, num_tasks, num_steps, num_workers):
    # Environment
    env = gym.make(env_name)
    env.close()
    # Policy and Baseline
    policy = get_policy_for_env(env)
    baseline = LinearFeatureBaseline(get_input_size(env))

    sampler = MultiTaskSampler(env_name,
                               {}, # env_kwargs
                               batch_size,
                               policy,
                               baseline,
                               num_workers=num_workers)
    tasks = sampler.sample_tasks(num_tasks=num_tasks)
    train_episodes, valid_episodes = sampler.sample(tasks,
                                                    num_steps=num_steps)
    sampler.close()

    assert len(train_episodes) == num_steps
    assert len(train_episodes[0]) == num_tasks
    assert isinstance(train_episodes[0][0], BatchEpisodes)

    assert len(valid_episodes) == num_tasks
    assert isinstance(valid_episodes[0], BatchEpisodes)
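The assertions above spell out the sampler's output structure: train_episodes[step][task] holds one BatchEpisodes per inner-gradient step and task, while valid_episodes[task] holds the post-adaptation rollouts. A small sketch of reducing that structure to scalar returns, reusing the get_returns helper quoted in Example #1 (summarize_returns itself is not part of the original code):

import numpy as np

def summarize_returns(train_episodes, valid_episodes):
    # Mean return after each inner-gradient step, and after adaptation.
    per_step = [float(np.mean(get_returns(step_episodes)))
                for step_episodes in train_episodes]
    post_adaptation = float(np.mean(get_returns(valid_episodes)))
    return per_step, post_adaptation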
Example #5
def main(args):
    with open(args.config, 'r') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    if args.output_folder is not None:
        if not os.path.exists(args.output_folder):
            os.makedirs(args.output_folder)
        config_filename = os.path.join(args.output_folder,
                                       str(args.seed) + '_config.json')
        policy_filename = os.path.join(args.output_folder,
                                       str(args.seed) + '_policy.th')
        result_filename_txt = os.path.join(args.output_folder,
                                           str(args.seed) + '_results.txt')
        result_filename_pickle = os.path.join(
            args.output_folder,
            str(args.seed) + '_results.pickle')

        with open(config_filename, 'w') as f:
            config.update(vars(args))
            json.dump(config, f, indent=2)

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    results = {
        'train_costs': [],
        'test_costs': [],
        'train_costs_sum': [],  # the cost
        'test_costs_sum': [],
        'train_costs_mean': [],
        'test_costs_mean': [],  # the evaluation for grid-world, key-door, and mountain-car problems
        'train_returns': [],
        'test_returns': [],
        'train_returns_mean': [],
        'test_returns_mean': [],  # the evaluation for the treasure problem
        'train_returns_std': [],
        'test_returns_std': [],
    }

    # env = gym.make(config['env-name'], **config['env-kwargs'])
    env = gym.make(config['env-name'])
    env.close()

    # Policy
    policy = get_policy_for_env(env,
                                hidden_sizes=config['hidden-sizes'],
                                nonlinearity=config['nonlinearity'])
    policy.share_memory()

    # Baseline
    baseline = LinearFeatureBaseline(get_input_size(env))

    # Sampler
    sampler = MultiTaskSampler(config['env-name'],
                               env_kwargs=config['env-kwargs'],
                               batch_size=config['fast-batch-size'],
                               policy=policy,
                               baseline=baseline,
                               env=env,
                               seed=args.seed,
                               num_workers=args.num_workers)

    metalearner = MAMLTRPO(policy,
                           fast_lr=config['fast-lr'],
                           first_order=config['first-order'],
                           device=args.device)

    num_iterations = 0
    for batch in trange(config['num-batches']):
        tasks = sampler.sample_tasks(num_tasks=config['meta-batch-size'])
        # print(tasks)

        futures = sampler.sample_async(tasks,
                                       num_steps=config['num-steps'],
                                       fast_lr=config['fast-lr'],
                                       gamma=config['gamma'],
                                       gae_lambda=config['gae-lambda'],
                                       device=args.device)
        # print(futures)

        logs = metalearner.step(
            *futures,
            max_kl=config['max-kl'],
            cg_iters=config['cg-iters'],
            cg_damping=config['cg-damping'],
            ls_max_steps=config['ls-max-steps'],
            ls_backtrack_ratio=config['ls-backtrack-ratio'])
        # print('logs')

        train_episodes, valid_episodes = sampler.sample_wait(futures)
        # print('train_episodes')

        num_iterations += sum(
            sum(episode.lengths) for episode in train_episodes[0])
        num_iterations += sum(
            sum(episode.lengths) for episode in valid_episodes)
        logs.update(tasks=tasks,
                    num_iterations=num_iterations,
                    train_returns=get_returns(train_episodes[0]),
                    valid_returns=get_returns(valid_episodes))
        train_returns = get_discounted_returns(train_episodes[0])
        test_returns = get_discounted_returns(valid_episodes)
        train_costs = get_costs(train_episodes[0])
        test_costs = get_costs(valid_episodes)

        # Save results
        results['train_returns'].append(train_returns)
        results['test_returns'].append(test_returns)
        results['train_returns_mean'].append(np.mean(train_returns))
        results['test_returns_mean'].append(np.mean(test_returns))
        results['train_returns_std'].append(np.std(train_returns))
        results['test_returns_std'].append(np.std(test_returns))

        results['train_costs'].append(train_costs)
        results['test_costs'].append(test_costs)
        results['train_costs_sum'].append(np.sum(train_costs))
        results['test_costs_sum'].append(np.sum(test_costs))
        results['train_costs_mean'].append(np.mean(train_costs))
        results['test_costs_mean'].append(np.mean(test_costs))

        # Save results and policy (the filenames are only defined when an
        # output folder was given)
        if args.output_folder is not None:
            with open(result_filename_txt, "w") as file:
                file.write(str(results))
            with open(result_filename_pickle, "wb") as file:
                dump(results, file, protocol=2)  # pickle.dump

            with open(policy_filename, 'wb') as f:
                torch.save(policy.state_dict(), f)

    print(np.sum(results['train_costs_sum']))
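get_discounted_returns and get_costs are project-specific helpers that this excerpt does not define. Purely as an illustration modeled on the get_returns helper quoted in Example #1 (and not the original implementation), a discounted variant might look like the sketch below; get_costs presumably aggregates a cost signal the same way and is not guessed at here.

import numpy as np
import torch

def get_discounted_returns(episodes, gamma=0.99):
    # Assumed helper: weight each step's reward by gamma**t before summing over time.
    returns = []
    for episode in episodes:
        horizon = episode.rewards.shape[0]
        discounts = gamma ** torch.arange(horizon, dtype=episode.rewards.dtype,
                                          device=episode.rewards.device)
        returns.append((discounts.unsqueeze(-1) * episode.rewards).sum(dim=0))
    return np.stack([r.detach().cpu().numpy() for r in returns])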
Example #6
def main(args):
    with open(args.config, 'r') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    if args.output_folder is not None:
        if not os.path.exists(args.output_folder):
            os.makedirs(args.output_folder)
        policy_filename = os.path.join(args.output_folder, 'policy.th')
        config_filename = os.path.join(args.output_folder, 'config.json')

        with open(config_filename, 'w') as f:
            config.update(vars(args))
            json.dump(config, f, indent=2)

    if args.seed is not None:
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    print(config)

    writer = SummaryWriter(logdir='./log')

    env = gym.make(config['env-name'], **config.get('env-kwargs', {}))
    env.close()

    # Policy
    policy = get_policy_for_env(env,
                                hidden_sizes=config['hidden-sizes'],
                                nonlinearity=config['nonlinearity'])
    policy.share_memory()

    # Baseline
    baseline = LinearFeatureBaseline(get_input_size(env))

    # Sampler
    sampler = MultiTaskSampler(config['env-name'],
                               env_kwargs=config.get('env-kwargs', {}),
                               batch_size=config['fast-batch-size'],
                               policy=policy,
                               baseline=baseline,
                               env=env,
                               seed=args.seed,
                               num_workers=args.num_workers)

    metalearner = MAMLTRPO(policy,
                           fast_lr=config['fast-lr'],
                           first_order=config['first-order'],
                           device=args.device)

    num_iterations = 0
    for batch in trange(config['num-batches']):
        tasks = sampler.sample_tasks(
            num_tasks=config['meta-batch-size'])  # (meta-batch-size, K-arm)
        futures = sampler.sample_async(tasks,
                                       num_steps=config['num-steps'],
                                       fast_lr=config['fast-lr'],
                                       gamma=config['gamma'],
                                       gae_lambda=config['gae-lambda'],
                                       device=args.device)
        logs = metalearner.step(
            *futures,
            max_kl=config['max-kl'],
            cg_iters=config['cg-iters'],
            cg_damping=config['cg-damping'],
            ls_max_steps=config['ls-max-steps'],
            ls_backtrack_ratio=config['ls-backtrack-ratio'])

        train_episodes, valid_episodes = sampler.sample_wait(futures)
        num_iterations += sum(
            sum(episode.lengths) for episode in train_episodes[0])
        num_iterations += sum(
            sum(episode.lengths) for episode in valid_episodes)
        logs.update(tasks=tasks,
                    num_iterations=num_iterations,
                    train_returns=get_returns(train_episodes[0]),
                    valid_returns=get_returns(valid_episodes))

        # print(logs)

        writer.add_scalar('MAML/Loss Before', logs['loss_before'].mean(),
                          num_iterations)
        writer.add_scalar('MAML/KL Before', logs['kl_before'].mean(),
                          num_iterations)
        if 'loss_after' in logs:
            writer.add_scalar('MAML/Loss After', logs['loss_after'].mean(),
                              num_iterations)
        if 'kl_after' in logs:
            writer.add_scalar('MAML/KL After', logs['kl_after'].mean(),
                              num_iterations)
        writer.add_scalar('MAML/Train Returns', logs['train_returns'].sum(),
                          num_iterations)
        writer.add_scalar('MAML/Valid Returns', logs['valid_returns'].sum(),
                          num_iterations)
        writer.add_scalar(
            'MAML/Train Cumulative Regret',
            sum([task['mean'].max()
                 for task in logs['tasks']]) * config['fast-batch-size'] -
            logs['train_returns'].sum(), num_iterations)
        writer.add_scalar(
            'MAML/Valid Cumulative Regret',
            sum([task['mean'].max()
                 for task in logs['tasks']]) * config['fast-batch-size'] -
            logs['valid_returns'].sum(), num_iterations)

        # Save policy
        if args.output_folder is not None:
            with open(policy_filename, 'wb') as f:
                torch.save(policy.state_dict(), f)

    writer.close()
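The two 'Cumulative Regret' scalars above compare the return an oracle would collect on the sampled K-armed bandit tasks (always pulling the arm with the highest mean, once per episode in the fast batch) against the return actually obtained. The same computation as a standalone function, restated here for clarity (not part of the original script):

def cumulative_regret(tasks, returns, fast_batch_size):
    # Oracle return: each task's best arm mean, collected fast_batch_size times.
    oracle = sum(task['mean'].max() for task in tasks) * fast_batch_size
    return oracle - returns.sum()

# e.g. cumulative_regret(logs['tasks'], logs['train_returns'], config['fast-batch-size'])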
Example #7
File: train.py  Project: yatindandi/BOIL
def main(args):
    with open(args.config, 'r') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    if args.output_folder is not None:
        if not os.path.exists(args.output_folder):
            os.makedirs(args.output_folder)
        policy_filename = os.path.join(args.output_folder, 'policy.th')
        config_filename = os.path.join(args.output_folder, 'config.json')

        with open(config_filename, 'w') as f:
            config.update(vars(args))
            json.dump(config, f, indent=2)

    if args.seed is not None:
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    env = gym.make(config['env-name'], **config['env-kwargs'])
    env.close()

    # Policy
    policy = get_policy_for_env(env,
                                hidden_sizes=config['hidden-sizes'],
                                nonlinearity=config['nonlinearity'])
    policy.share_memory()
    print(policy)

    # Baseline
    baseline = LinearFeatureBaseline(get_input_size(env))

    # Sampler
    sampler = MultiTaskSampler(config['env-name'],
                               env_kwargs=config['env-kwargs'],
                               batch_size=config['fast-batch-size'],
                               policy=policy,
                               baseline=baseline,
                               env=env,
                               seed=args.seed,
                               num_workers=args.num_workers,
                               args=args)

    metalearner = MAMLTRPO(policy,
                           fast_lr=config['fast-lr'],
                           first_order=config['first-order'],
                           device=args.device)

    num_iterations = 0

    for batch in trange(config['num-batches']):
        tasks = sampler.sample_tasks(num_tasks=config['meta-batch-size'])
        futures = sampler.sample_async(tasks,
                                       num_steps=config['num-steps'],
                                       fast_lr=config['fast-lr'],
                                       gamma=config['gamma'],
                                       gae_lambda=config['gae-lambda'],
                                       device=args.device)

        logs = metalearner.step(
            *futures,
            max_kl=config['max-kl'],
            cg_iters=config['cg-iters'],
            cg_damping=config['cg-damping'],
            ls_max_steps=config['ls-max-steps'],
            ls_backtrack_ratio=config['ls-backtrack-ratio'],
            args=args)

        train_episodes, valid_episodes = sampler.sample_wait(futures)
        num_iterations += sum(
            sum(episode.lengths) for episode in train_episodes[0])
        num_iterations += sum(
            sum(episode.lengths) for episode in valid_episodes)
        logs.update(tasks=tasks,
                    num_iterations=num_iterations,
                    train_returns=get_returns(train_episodes[0]),
                    valid_returns=get_returns(valid_episodes))

        # Save policy
        if args.output_folder is not None:
            with open(policy_filename, 'wb') as f:
                torch.save(policy.state_dict(), f)
Example #8
def main(args):
    with open(args.config, 'r') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    if args.env_name in all_envs:
        config.update({
            "env-name": args.env_name + "-v0",
            "env-kwargs": {},
            "fast-batch-size": 16,
            "num-batches": 2000,
            "meta-batch-size": 1
        })
    if args.output_folder is not None:
        if not os.path.exists(args.output_folder):
            os.makedirs(args.output_folder)
        policy_filename = os.path.join(args.output_folder, 'policy.th')
        config_filename = os.path.join(args.output_folder, 'config.json')
        with open(config_filename, 'w') as f:
            config.update(vars(args))
            json.dump(config, f, indent=2)
    if args.seed is not None:
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    model_name = "maml"
    os.makedirs(f"{LOG_DIR}/{model_name}/{config['env-name']}/", exist_ok=True)
    run_num = len(os.listdir(f"{LOG_DIR}/{model_name}/{config['env-name']}/"))
    log_path = f"{LOG_DIR}/{model_name}/{config['env-name']}/logs_{run_num}.txt"

    env = gym.make(config['env-name'], **config.get('env-kwargs', {}))
    env.close()
    policy = get_policy_for_env(env,
                                hidden_sizes=config['hidden-sizes'],
                                nonlinearity=config['nonlinearity'])
    policy.share_memory()
    baseline = LinearFeatureBaseline(get_input_size(env))
    sampler = MultiTaskSampler(config['env-name'],
                               env_kwargs=config.get('env-kwargs', {}),
                               batch_size=config['fast-batch-size'],
                               policy=policy,
                               baseline=baseline,
                               env=env,
                               seed=args.seed,
                               num_workers=args.num_workers)
    metalearner = MAMLTRPO(policy,
                           fast_lr=config['fast-lr'],
                           first_order=config['first-order'],
                           device=args.device)
    num_iterations = 0
    total_rewards = []
    start = time.time()
    step = 0
    # for batch in range(config['num-batches']+1):
    while step <= 500000:
        tasks = sampler.sample_tasks(num_tasks=config['meta-batch-size'])
        futures = sampler.sample_async(tasks,
                                       num_steps=config['num-steps'],
                                       fast_lr=config['fast-lr'],
                                       gamma=config['gamma'],
                                       gae_lambda=config['gae-lambda'],
                                       device=args.device)
        logs = metalearner.step(
            *futures,
            max_kl=config['max-kl'],
            cg_iters=config['cg-iters'],
            cg_damping=config['cg-damping'],
            ls_max_steps=config['ls-max-steps'],
            ls_backtrack_ratio=config['ls-backtrack-ratio'])
        train_episodes, valid_episodes = sampler.sample_wait(futures)
        num_iterations += sum(
            sum(episode.lengths) for episode in train_episodes[0])
        num_iterations += sum(
            sum(episode.lengths) for episode in valid_episodes)
        logs.update(tasks=tasks,
                    num_iterations=num_iterations,
                    train_returns=get_returns(train_episodes[0]),
                    valid_returns=get_returns(valid_episodes))
        # Save policy
        old_step = step
        step += 250 if args.env_name in all_envs else train_episodes[0][0].lengths[0]
        if old_step == 0 or step // 1000 > old_step // 1000:
            rollouts = logs["valid_returns"][0]
            reward = np.mean(rollouts, -1)
            ep = step // 1000
            total_rewards.append(reward)
            string = f"Step: {int(1000*ep):7d}, Reward: {total_rewards[-1]:9.3f} [{np.std(rollouts):8.3f}], Avg: {np.mean(total_rewards, axis=0):9.3f} ({0.0:.3f}) <{get_time(start)}> ({{}})"
            print(string)
            with open(log_path, "a+") as f:
                f.write(f"{string}\n")
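LOG_DIR, all_envs, and get_time are module-level names that this excerpt does not show. Judging from its use in the log string, get_time formats the wall-clock time elapsed since start; a hypothetical stand-in (not the project's implementation) could be:

import time

def get_time(start):
    # Hypothetical helper: elapsed time since `start`, formatted as H:MM:SS.
    elapsed = int(time.time() - start)
    return '{}:{:02d}:{:02d}'.format(elapsed // 3600, (elapsed % 3600) // 60, elapsed % 60)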
Example #9
def main(args):
    with open(args.config, 'r') as f:
        config = json.load(f)

    if args.seed is not None:
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    # env = gym.make(config['env-name'], **config['env-kwargs'])
    env = gym.make(config['env-name'], **config.get('env-kwargs', {}))
    env.close()

    # Policy
    policy = get_policy_for_env(env,
                                hidden_sizes=config['hidden-sizes'],
                                nonlinearity=config['nonlinearity'])
    with open(args.policy, 'rb') as f:
        state_dict = torch.load(f, map_location=torch.device(args.device))
        policy.load_state_dict(state_dict)
    policy.share_memory()

    # Baseline
    baseline = LinearFeatureBaseline(get_input_size(env))

    # Sampler
    sampler = MultiTaskSampler(config['env-name'],
                               env_kwargs=config.get('env-kwargs', {}),
                               batch_size=config['fast-batch-size'],
                               policy=policy,
                               baseline=baseline,
                               env=env,
                               seed=args.seed,
                               num_workers=args.num_workers)

    logs = {'tasks': []}
    train_returns, valid_returns = [], []

    # Track the returns after each inner-gradient step (grad 0 .. Grad_Steps - 1)
    grad_returns = []
    for i in range(Grad_Steps):
        grad_returns.append([])

    for batch in trange(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        train_episodes, valid_episodes = sampler.sample(
            tasks,
            num_steps=config['num-steps'],
            fast_lr=config['fast-lr'],
            gamma=config['gamma'],
            gae_lambda=config['gae-lambda'],
            device=args.device)

        logs['tasks'].extend(tasks)

        # Record the returns after each inner-gradient step
        for i in range(Grad_Steps):
            grad_returns[i].append(get_returns(train_episodes[i]))
        for i in range(Grad_Steps):
            logs['grad' + str(i) + '_returns'] = np.concatenate(
                grad_returns[i], axis=0)

        train_returns.append(get_returns(train_episodes[0]))
        valid_returns.append(get_returns(valid_episodes))

    logs['train_returns'] = np.concatenate(train_returns, axis=0)
    logs['valid_returns'] = np.concatenate(valid_returns, axis=0)

    # Mean return after each inner-gradient step, followed by the post-adaptation (valid) mean
    value = [0] * (Grad_Steps + 1)
    for i in range(Grad_Steps):
        value[i] = logs['grad' + str(i) + '_returns'].mean()
    value[Grad_Steps] = logs['valid_returns'].mean()
    print(value)
    print(logs['valid_returns'].mean())

    with open(args.output, 'wb') as f:
        np.savez(f, **logs)
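Grad_Steps is a module-level constant that is not part of this excerpt. Since train_episodes holds one list of episodes per inner-gradient step, Grad_Steps must not exceed the config's num-steps value; a plausible definition (an assumption) is simply:

Grad_Steps = 1  # number of inner-gradient steps to report; must be <= config['num-steps']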
Example #10
def main(args):
    with open(args.config, 'r') as f:
        config = json.load(f)

    if args.seed is not None:
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    # env = gym.make(config['env-name'], **config['env-kwargs'])
    env = gym.make(config['env-name'], **config.get('env-kwargs', {}))
    env.close()

    # Policy
    policy = get_policy_for_env(env,
                                hidden_sizes=config['hidden-sizes'],
                                nonlinearity=config['nonlinearity'])

    with open(args.policy, 'rb') as f:
        state_dict = torch.load(f, map_location=torch.device(args.device))
        policy.load_state_dict(state_dict)
    policy.share_memory()

    # Baseline
    baseline = LinearFeatureBaseline(get_input_size(env))

    # Sampler
    sampler = MultiTaskSampler(config['env-name'],
                               env_kwargs=config.get('env-kwargs', {}),
                               batch_size=config['fast-batch-size'],
                               policy=policy,
                               baseline=baseline,
                               env=env,
                               seed=args.seed,
                               num_workers=args.num_workers)

    # TODO: online adaptation step-1
    metalearner = MAMLTRPO(policy,
                           fast_lr=config['fast-lr'],
                           first_order=config['first-order'],
                           device=args.device)
    # endtodo

    logs = {'tasks': []}
    train_returns, valid_returns = [], []
    num_iterations = 0
    for batch in trange(args.num_batches):
        """old version of test-my.py"""
        # tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        # train_episodes, valid_episodes = sampler.sample(tasks,
        #                                                 num_steps=config['num-steps'],
        #                                                 fast_lr=config['fast-lr'],
        #                                                 gamma=config['gamma'],
        #                                                 gae_lambda=config['gae-lambda'],
        #                                                 device=args.device)
        #
        # logs['tasks'].extend(tasks)
        # train_returns.append(get_returns(train_episodes[0]))
        # valid_returns.append(get_returns(valid_episodes))
        """new version of test-my-plus.py"""
        # TODO: online adaptation step-2
        tasks = sampler.sample_tasks(num_tasks=config['meta-batch-size'])
        futures = sampler.sample_async(tasks,
                                       num_steps=config['num-steps'],
                                       fast_lr=config['fast-lr'],
                                       gamma=config['gamma'],
                                       gae_lambda=config['gae-lambda'],
                                       device=args.device)
        logs = metalearner.step(
            *futures,
            max_kl=config['max-kl'],
            cg_iters=config['cg-iters'],
            cg_damping=config['cg-damping'],
            ls_max_steps=config['ls-max-steps'],
            ls_backtrack_ratio=config['ls-backtrack-ratio'])

        train_episodes, valid_episodes = sampler.sample_wait(futures)
        num_iterations += sum(
            sum(episode.lengths) for episode in train_episodes[0])
        num_iterations += sum(
            sum(episode.lengths) for episode in valid_episodes)
        logs.update(tasks=tasks,
                    num_iterations=num_iterations,
                    train_returns=get_returns(train_episodes[0]),
                    valid_returns=get_returns(valid_episodes))

        train_returns.append(get_returns(train_episodes[0]))
        valid_returns.append(get_returns(valid_episodes))

        # for name,param in policy.layer1.named_parameters():
        #     print(name,param)
        # endtodo

    logs['train_returns'] = np.concatenate(train_returns, axis=0)
    logs['valid_returns'] = np.concatenate(valid_returns, axis=0)
    print('name', args.output)
    with open(args.output, 'wb') as f:
        np.savez(f, **logs)
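Examples #1, #9, and #10 all store their logs with np.savez. A quick sketch of reading such a file back; the 'results.npz' filename is only an illustration, and allow_pickle is needed because the task list is saved as an object array:

import numpy as np

with np.load('results.npz', allow_pickle=True) as logs:
    print(logs['train_returns'].mean(), logs['valid_returns'].mean())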
Example #11
def main(args):
    with open(args.config, 'r') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    if args.output_folder is not None:
        if not os.path.exists(args.output_folder):
            os.makedirs(args.output_folder)
        policy_filename = os.path.join(args.output_folder, 'policy.th')
        config_filename = os.path.join(args.output_folder, 'config.json')

        with open(config_filename, 'w') as f:
            config.update(vars(args))
            json.dump(config, f, indent=2)

    # Set tb_writer
    args.log_name = "env-name::%s_num-steps::%s_fast-lr::%s_log" % (
        config["env-name"], config["num-steps"], config["fast-lr"])
    tb_writer = SummaryWriter("./{0}/tb_{1}_logs".format(args.output_folder, args.log_name))
    log = set_log(args)

    # Set seed
    if args.seed is not None:
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    env = make_env(config["env-name"])()
    env.close()

    # Policy
    policy = get_policy_for_env(
        env,
        hidden_sizes=config['hidden-sizes'],
        nonlinearity=config['nonlinearity'])
    policy.share_memory()

    # Baseline
    baseline = LinearFeatureBaseline(get_input_size(env))

    # Sampler
    sampler = MultiTaskSampler(
        config['env-name'],
        env_kwargs=config.get('env-kwargs', {}),
        batch_size=config['fast-batch-size'],
        policy=policy,
        baseline=baseline,
        env=env,
        seed=args.seed,
        num_workers=args.num_workers)

    metalearner = MAMLTRPO(
        policy,
        fast_lr=config['fast-lr'],
        first_order=config['first-order'],
        device=args.device)

    best_score = -np.inf

    for batch in range(config['num-batches']):
        tasks = sampler.sample_tasks(num_tasks=config['meta-batch-size'])

        futures = sampler.sample_async(
            tasks,
            num_steps=config['num-steps'],
            fast_lr=config['fast-lr'],
            gamma=config['gamma'],
            gae_lambda=config['gae-lambda'],
            device=args.device)

        metalearner.step(
            *futures,
            max_kl=config['max-kl'],
            cg_iters=config['cg-iters'],
            cg_damping=config['cg-damping'],
            ls_max_steps=config['ls-max-steps'],
            ls_backtrack_ratio=config['ls-backtrack-ratio'])

        # For logging
        train_episodes, valid_episodes = sampler.sample_wait(futures)
        train_score = np.mean(get_returns(train_episodes[0]))
        val_score = np.mean(get_returns(valid_episodes))

        log[args.log_name].info("At iteration {}, train_reward: {:.3f}".format(batch, train_score)) 
        tb_writer.add_scalars("reward/", {"train": train_score}, batch)

        log[args.log_name].info("At iteration {}, valid_reward: {:.3f}".format(batch, val_score)) 
        tb_writer.add_scalars("reward/", {"val": val_score}, batch)

        # Save policy
        if val_score > best_score:
            best_score = val_score
            log[args.log_name].info("Saving best valid score: {:.3f}".format(best_score)) 
            with open(policy_filename, 'wb') as f:
                torch.save(policy.state_dict(), f)
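set_log is a project-specific helper that this excerpt does not include; from its usage above it returns a mapping from args.log_name to a standard logging.Logger. A hypothetical stand-in consistent with that usage (not the original implementation):

import logging

def set_log(args):
    # Hypothetical stand-in: one console + file logger keyed by args.log_name.
    logger = logging.getLogger(args.log_name)
    logger.setLevel(logging.INFO)
    handler = logging.FileHandler('{0}/{1}.log'.format(args.output_folder, args.log_name))
    handler.setFormatter(logging.Formatter('%(asctime)s %(message)s'))
    logger.addHandler(handler)
    logger.addHandler(logging.StreamHandler())
    return {args.log_name: logger}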