Example #1
    def __init__(self,
                 index,
                 env_name,
                 env_kwargs,
                 batch_size,
                 observation_space,
                 action_space,
                 policy,
                 baseline,
                 seed,
                 task_queue,
                 train_queue,
                 valid_queue,
                 policy_lock):
        super(SamplerWorker, self).__init__()

        # Create one environment for each of the batch_size tasks in the batch (num_batches * batch_size in total)
        env_fns = [make_env(env_name, env_kwargs=env_kwargs)
                   for _ in range(batch_size)]
        self.envs = SyncVectorEnv(env_fns,
                                  observation_space=observation_space,
                                  action_space=action_space)
        self.envs.seed(None if (seed is None) else seed + index * batch_size)
        self.batch_size = batch_size
        self.policy = policy
        self.baseline = baseline

        self.task_queue = task_queue
        self.train_queue = train_queue
        self.valid_queue = valid_queue
        self.policy_lock = policy_lock
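
The constructor above only stores the queues and builds the vectorized environments; in this worker/queue pattern the sampling itself happens in the process's run() method, which is not shown here. A minimal sketch of what such a loop could look like, assuming the parent pushes (index, task) pairs onto task_queue, sends None as a shutdown sentinel, and that a sample() helper collects the episodes (all of these are assumptions, not part of the snippet):

    def run(self):
        # Hypothetical sketch of the worker loop, not the original implementation.
        while True:
            data = self.task_queue.get()
            if data is None:
                # Sentinel received: stop the worker.
                self.task_queue.task_done()
                break

            index, task = data
            self.envs.reset_task(task)

            # Read the shared policy parameters under the lock while sampling.
            with self.policy_lock:
                train_episodes, valid_episodes = self.sample(index)

            self.train_queue.put((index, train_episodes))
            self.valid_queue.put((index, valid_episodes))
            self.task_queue.task_done()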
Example #2
    def __init__(self,
                 env_name,
                 env_kwargs,
                 batch_size,
                 observation_space,
                 action_space,
                 policy,
                 baseline,
                 seed,
                 prior_policy,
                 task):

        # Create one environment for each of the batch_size tasks in the batch (num_batches * batch_size in total)
        env_fns = [make_env(env_name, env_kwargs=env_kwargs)
                   for _ in range(batch_size)]

        self.envs = SyncVectorEnv(env_fns,
                                  observation_space=observation_space,
                                  action_space=action_space)
        self.envs.seed(None if (seed is None) else seed + batch_size)
        self.batch_size = batch_size
        self.policy = policy
        self.baseline = baseline

        self.envs.reset_task(task)
        self.task = task
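
Both constructors call reset_task on the vectorized environment, so the underlying environments must be task-configurable. A rough, self-contained sketch of such an environment, where each task is a dict carrying a 2-D goal position (the exact task layout here is an assumption for illustration):

import gym
import numpy as np

class GoalTaskEnv(gym.Env):
    # Hypothetical multi-task environment: tasks are goal positions that
    # sample_tasks draws at random and reset_task installs in place.
    def sample_tasks(self, num_tasks):
        goals = np.random.uniform(-0.5, 0.5, size=(num_tasks, 2))
        return [{'goal': goal} for goal in goals]

    def reset_task(self, task):
        self._task = task
        self._goal = task['goal']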
Example #3
    def __init__(self,
                 env_name,
                 env_kwargs,
                 batch_size,
                 num_tasks,
                 policy,
                 baseline,
                 env=None,
                 seed=None):

        baseline = deepcopy(baseline)
        # Create one environment for each of the batch_size tasks in the batch (num_batches * batch_size in total)

        # env_fns = [make_env(env_name, env_kwargs=env_kwargs)
        #            for _ in range(batch_size)]
        # self.envs = SyncVectorEnv(env_fns,
        #                           observation_space=env.observation_space,
        #                           action_space=env.action_space)
        # self.envs.seed(None if (seed is None) else seed + index * batch_size)
        self.tasks = self.sample_tasks(env, num_tasks)
        env_fns = [make_env(env_name, env_kwargs=env_kwargs)]
        self.env = SyncVectorEnv(env_fns,
                                 observation_space=env.observation_space,
                                 action_space=env.action_space)
        self.env.seed(None if (seed is None) else seed)

        # self.env = env
        # self.env.seed(None if (seed is None) else seed)
        self.batch_size = batch_size
        self.policy = policy
        self.baseline = baseline
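
This constructor relies on a self.sample_tasks(env, num_tasks) method that is not shown. In pytorch-maml-rl-style samplers, task sampling is usually delegated to the environment itself; a minimal sketch under that assumption:

    def sample_tasks(self, env, num_tasks):
        # Assumes the unwrapped environment implements sample_tasks, as in
        # the multi-task environment sketch above.
        return env.unwrapped.sample_tasks(num_tasks)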
Example #4
def main(args):
    with open(args.config, 'r') as f:
        config = json.load(f)

    if args.seed is not None:
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    # The environment instance is only needed to read its observation and
    # action spaces; it is closed again immediately.
    env = make_env(config['env-name'])()
    env.close()

    # Policy
    policy = get_policy_for_env(env,
                                hidden_sizes=config['hidden-sizes'],
                                nonlinearity=config['nonlinearity'])
    with open(args.policy, 'rb') as f:
        state_dict = torch.load(f, map_location=torch.device(args.device))
        policy.load_state_dict(state_dict)
    policy.share_memory()

    # Baseline
    baseline = LinearFeatureBaseline(get_input_size(env))

    # Sampler
    sampler = MultiTaskSampler(config['env-name'],
                               env_kwargs=None,
                               batch_size=config['fast-batch-size'],
                               policy=policy,
                               baseline=baseline,
                               env=env,
                               seed=args.seed,
                               num_workers=args.num_workers,
                               is_meta_test=True)

    logs = {'tasks': []}
    train_returns, valid_returns = [], []
    for batch in range(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)

        train_episodes, valid_episodes = sampler.sample(
            tasks,
            num_steps=config['num-steps'],
            fast_lr=config['fast-lr'],
            gamma=config['gamma'],
            gae_lambda=config['gae-lambda'],
            device=args.device)

        logs['tasks'].extend(tasks)
        train_returns.append(get_returns(train_episodes[0]))
        print("train:", np.mean(get_returns(train_episodes[0])))
        valid_returns.append(get_returns(valid_episodes))
        print("valid:", np.mean(get_returns(valid_episodes)))

    logs['train_returns'] = np.concatenate(train_returns, axis=0)
    logs['valid_returns'] = np.concatenate(valid_returns, axis=0)

    with open(args.output, 'wb') as f:
        np.savez(f, **logs)
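
main reads args.config, args.policy, args.output, args.seed, args.meta_batch_size, args.num_batches, args.num_workers and args.device, so the script needs a matching command-line entry point. A minimal sketch (the flag defaults are assumptions; only the attribute names are taken from the code above):

if __name__ == '__main__':
    import argparse
    import multiprocessing as mp

    parser = argparse.ArgumentParser(description='Meta-test a trained MAML policy')
    parser.add_argument('--config', type=str, required=True,
                        help='path to the JSON configuration file')
    parser.add_argument('--policy', type=str, required=True,
                        help='path to the saved policy state dict')
    parser.add_argument('--output', type=str, required=True,
                        help='path of the .npz file to write the returns to')
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--meta-batch-size', type=int, default=40)
    parser.add_argument('--num-batches', type=int, default=10)
    parser.add_argument('--num-workers', type=int, default=mp.cpu_count() - 1)
    parser.add_argument('--device', type=str, default='cpu')

    main(parser.parse_args())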
Example #5
def main(args):
    with open(args.config, 'r') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    if args.output_folder is not None:
        if not os.path.exists(args.output_folder):
            os.makedirs(args.output_folder)
        policy_filename = os.path.join(args.output_folder, 'policy.th')
        config_filename = os.path.join(args.output_folder, 'config.json')

        with open(config_filename, 'w') as f:
            config.update(vars(args))
            json.dump(config, f, indent=2)

    # Set tb_writer
    args.log_name = "env-name::%s_num-steps::%s_fast-lr::%s_log" % (
        config["env-name"], config["num-steps"], config["fast-lr"])
    tb_writer = SummaryWriter("./{0}/tb_{1}_logs".format(args.output_folder, args.log_name))
    log = set_log(args)

    # Set seed
    if args.seed is not None:
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    # Only needed for its observation/action spaces; closed immediately.
    env = make_env(config["env-name"])()
    env.close()

    # Policy
    policy = get_policy_for_env(
        env,
        hidden_sizes=config['hidden-sizes'],
        nonlinearity=config['nonlinearity'])
    policy.share_memory()

    # Baseline
    baseline = LinearFeatureBaseline(get_input_size(env))

    # Sampler
    sampler = MultiTaskSampler(
        config['env-name'],
        env_kwargs=config.get('env-kwargs', {}),
        batch_size=config['fast-batch-size'],
        policy=policy,
        baseline=baseline,
        env=env,
        seed=args.seed,
        num_workers=args.num_workers)

    metalearner = MAMLTRPO(
        policy,
        fast_lr=config['fast-lr'],
        first_order=config['first-order'],
        device=args.device)

    best_score = -np.inf

    for batch in range(config['num-batches']):
        tasks = sampler.sample_tasks(num_tasks=config['meta-batch-size'])

        futures = sampler.sample_async(
            tasks,
            num_steps=config['num-steps'],
            fast_lr=config['fast-lr'],
            gamma=config['gamma'],
            gae_lambda=config['gae-lambda'],
            device=args.device)

        metalearner.step(
            *futures,
            max_kl=config['max-kl'],
            cg_iters=config['cg-iters'],
            cg_damping=config['cg-damping'],
            ls_max_steps=config['ls-max-steps'],
            ls_backtrack_ratio=config['ls-backtrack-ratio'])

        # For logging
        train_episodes, valid_episodes = sampler.sample_wait(futures)
        train_score = np.mean(get_returns(train_episodes[0]))
        val_score = np.mean(get_returns(valid_episodes))

        log[args.log_name].info("At iteration {}, train_reward: {:.3f}".format(batch, train_score)) 
        tb_writer.add_scalars("reward/", {"train": train_score}, batch)

        log[args.log_name].info("At iteration {}, valid_reward: {:.3f}".format(batch, val_score)) 
        tb_writer.add_scalars("reward/", {"val": val_score}, batch)

        # Save the best policy so far (policy_filename is only defined when
        # an output folder was provided)
        if (args.output_folder is not None) and (val_score > best_score):
            best_score = val_score
            log[args.log_name].info("Saving best valid score: {:.3f}".format(best_score))
            with open(policy_filename, 'wb') as f:
                torch.save(policy.state_dict(), f)
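
Both the test and training loops above summarize episodes with get_returns, which is not shown. A minimal sketch, assuming each per-task episode batch exposes a rewards tensor of shape (horizon, batch_size) that can be summed over time (this shape is an assumption, not stated in the snippets):

import numpy as np

def get_returns(episodes_per_task):
    # Hypothetical helper: sum each task's rewards over time and stack the
    # per-task return vectors into a (num_tasks, batch_size) array.
    return np.array([episode.rewards.sum(dim=0).cpu().numpy()
                     for episode in episodes_per_task])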