Example #1
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME
    logger.configure(dir=exp_dir,
                     format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last_gap',
                     snapshot_gap=50)
    with open(exp_dir + '/params.json', 'w') as f:
        json.dump(kwargs, f, indent=2, sort_keys=True, cls=ClassEncoder)

    # Instantiate classes
    set_seed(kwargs['seed'])

    baseline = kwargs['baseline']()

    env = normalize(kwargs['env']())  # instantiate the env class and apply the normalize wrapper

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),  # flattened observation dimension
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=kwargs['meta_batch_size'],
        hidden_sizes=kwargs['hidden_sizes'],
        learn_std=kwargs['learn_std'],
        hidden_nonlinearity=kwargs['hidden_nonlinearity'],
        output_nonlinearity=kwargs['output_nonlinearity'],
    )

    # (a pre-trained policy could be restored here before training)

    sampler = MAMLSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=kwargs['rollouts_per_meta_task'],
        meta_batch_size=kwargs['meta_batch_size'],
        max_path_length=kwargs['max_path_length'],
        parallel=kwargs['parallel'],
        envs_per_task=1,
    )

    sample_processor = MAMLSampleProcessor(
        baseline=baseline,
        discount=kwargs['discount'],
        gae_lambda=kwargs['gae_lambda'],
        normalize_adv=kwargs['normalize_adv'],
        positive_adv=kwargs['positive_adv'],
    )

    algo = TRPOMAML(
        policy=policy,
        step_size=kwargs['step_size'],
        inner_type=kwargs['experiment_tuple'][1],
        inner_lr=kwargs['inner_lr'],
        meta_batch_size=kwargs['meta_batch_size'],
        num_inner_grad_steps=kwargs['num_inner_grad_steps'],
        exploration=kwargs['experiment_tuple'][2],
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=kwargs['n_itr'],
        num_inner_grad_steps=kwargs['num_inner_grad_steps'],
    )

    trainer.train()
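
# --- Hypothetical usage sketch (not part of the original example) ---
# run_experiment() reads its whole configuration from **kwargs plus a module-level
# EXP_NAME constant. The call below only illustrates which keys the function expects;
# the env class and all concrete values here are assumptions, not the project's defaults.
EXP_NAME = 'trpo_maml_demo'  # placeholder experiment name

run_experiment(
    seed=1,
    baseline=LinearFeatureBaseline,      # baseline class (instantiated inside)
    env=MyMetaEnv,                       # placeholder env class, wrapped with normalize()
    rollouts_per_meta_task=20,
    max_path_length=100,
    parallel=True,
    discount=0.99,
    gae_lambda=1.0,
    normalize_adv=True,
    positive_adv=False,
    hidden_sizes=(64, 64),
    learn_std=True,
    hidden_nonlinearity='tanh',          # assumed; the policy may expect e.g. a tf op instead
    output_nonlinearity=None,
    inner_lr=0.1,
    step_size=0.01,
    n_itr=1001,
    meta_batch_size=40,
    num_inner_grad_steps=1,
    experiment_tuple=('trpo_maml', 'likelihood_ratio', False),  # (name, inner_type, exploration)
)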
Example #2
            'learn_std': True,  # whether to learn the standard deviation of the Gaussian policy

            # ProMP config
            'inner_lr': 0.1,  # adaptation step size
            'learning_rate': 1e-3,  # meta-policy gradient step size
            'num_promp_steps': 5,  # number of ProMP steps without re-sampling
            'clip_eps': 0.3,  # clipping range
            'target_inner_step': 0.01,
            'init_inner_kl_penalty': 5e-4,
            # whether to use an adaptive or fixed KL-penalty coefficient
            'adaptive_inner_kl_penalty': False,
            'n_itr': 1001,  # number of overall training iterations
            'meta_batch_size': 40,  # number of sampled meta-tasks per iteration
            'num_inner_grad_steps': 1,  # number of inner / adaptation gradient steps
        }

    # configure logger
    logger.configure(dir=args.dump_path,
                     format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last_gap')

    # dump run configuration before starting training
    with open(args.dump_path + '/params.json', 'w') as f:
        json.dump(config, f, cls=ClassEncoder)

    # start the actual algorithm
    main(config)
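
# --- Hypothetical sketch (not part of the original example) ---
# All three examples pass cls=ClassEncoder to json.dump so that config values which are
# classes or functions (e.g. the baseline/env classes in Example #1) can still be written
# to params.json. The encoder below only illustrates that idea; it is not the project's
# actual ClassEncoder implementation.
import json


class ClassEncoder(json.JSONEncoder):
    def default(self, o):
        # serialize classes and other callables by their qualified name
        if isinstance(o, type) or callable(o):
            name = getattr(o, '__name__', o.__class__.__name__)
            return {'$class': getattr(o, '__module__', '') + '.' + name}
        # anything else json cannot handle natively falls back to its repr
        return repr(o)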
Example #3
def main(args=None):
    idx = int(time.time())
    args = parse_args(args)

    config = {
        'seed': args.seed,
        'baseline': 'LinearFeatureBaseline',
        'env': 'ReachWorld',  # not used
        'rollouts_per_meta_task': args.rollout_per_meta_task,
        'max_path_length': args.max_path_length,  # 100
        'parallel': not args.seq,
        'discount': args.discount,
        'gae_lambda': args.gae_lambda,
        'normalize_adv': True,
        'hidden_sizes': args.hidden_sizes,
        'inner_lr': args.inner_lr,  # adaptation step size
        'learning_rate': args.learning_rate,  # meta-policy gradient step size
        'num_promp_steps': args.num_promp_steps,  # number of ProMP steps without re-sampling
        'clip_eps': args.clip_eps,  # clipping range
        'target_inner_step': args.target_inner_step,
        'init_inner_kl_penalty': args.init_inner_kl_penalty,
        # whether to use an adaptive or fixed KL-penalty coefficient
        'adaptive_inner_kl_penalty': args.adaptive_inner_kl_penalty,
        'n_itr': args.n_itr,  # number of overall training iterations
        'meta_batch_size': args.meta_batch_size,  # number of sampled meta-tasks per iteration
        'num_inner_grad_steps': args.num_inner_grad_steps,  # number of inner / adaptation gradient steps
    }

    # configure logger
    logger.configure(dir=args.dump_path,
                     format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last_gap')

    # dump run configuration before starting training
    with open(args.dump_path + '/params.json', 'w') as f:
        json.dump(config, f, cls=ClassEncoder)

    set_seed(config['seed'])

    baseline = globals()[config['baseline']]()  # look up the baseline class by name and instantiate it

    env = get_env()
    # env = normalize(env)  # optionally apply the normalize wrapper to the env

    if isinstance(env.action_space, gym.spaces.Box):
        action_dim = np.prod(env.action_space.shape)
    elif isinstance(env.action_space, gym.spaces.Discrete):
        action_dim = env.action_space.n
    else:
        raise NotImplementedError('unknown action space, cannot get action dim')

    policy = MetaCategoricalMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=action_dim,
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
    )

    sampler = MetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],  # rollouts sampled per task
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )

    sample_processor = MetaSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
    )

    algo = ProMP(
        policy=policy,
        inner_lr=config['inner_lr'],
        meta_batch_size=config['meta_batch_size'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        learning_rate=config['learning_rate'],
        num_ppo_steps=config['num_promp_steps'],
        clip_eps=config['clip_eps'],
        target_inner_step=config['target_inner_step'],
        init_inner_kl_penalty=config['init_inner_kl_penalty'],
        adaptive_inner_kl_penalty=config['adaptive_inner_kl_penalty'],
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
    )

    trainer.train()
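
# --- Hypothetical sketch (not part of the original example) ---
# parse_args() and get_env() are referenced above but not shown. The parser below is only
# a guess at the shape of parse_args(), reconstructed from the args.* attributes that the
# config dict reads; defaults marked "placeholder" are assumptions, the rest mirror the
# literal values shown in Example #2.
import argparse


def parse_args(args=None):
    parser = argparse.ArgumentParser(description='ProMP meta-RL training')
    parser.add_argument('--dump_path', type=str, default='./data/promp')        # placeholder
    parser.add_argument('--seed', type=int, default=1)                          # placeholder
    parser.add_argument('--rollout_per_meta_task', type=int, default=20)        # placeholder
    parser.add_argument('--max_path_length', type=int, default=100)
    parser.add_argument('--seq', action='store_true',
                        help='sample sequentially instead of in parallel')
    parser.add_argument('--discount', type=float, default=0.99)                 # placeholder
    parser.add_argument('--gae_lambda', type=float, default=1.0)                # placeholder
    parser.add_argument('--hidden_sizes', type=int, nargs='+', default=[64, 64])  # placeholder
    parser.add_argument('--inner_lr', type=float, default=0.1)
    parser.add_argument('--learning_rate', type=float, default=1e-3)
    parser.add_argument('--num_promp_steps', type=int, default=5)
    parser.add_argument('--clip_eps', type=float, default=0.3)
    parser.add_argument('--target_inner_step', type=float, default=0.01)
    parser.add_argument('--init_inner_kl_penalty', type=float, default=5e-4)
    parser.add_argument('--adaptive_inner_kl_penalty', action='store_true')
    parser.add_argument('--n_itr', type=int, default=1001)
    parser.add_argument('--meta_batch_size', type=int, default=40)
    parser.add_argument('--num_inner_grad_steps', type=int, default=1)
    return parser.parse_args(args)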