Exemplo n.º 1
0
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME
    logger.configure(dir=exp_dir,
                     format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last')
    json.dump(kwargs,
              open(exp_dir + '/params.json', 'w'),
              indent=2,
              sort_keys=True,
              cls=ClassEncoder)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get(
        'gpu_frac', 0.95)
    sess = tf.Session(config=config)
    with sess.as_default() as sess:

        # Instantiate classes
        set_seed(kwargs['seed'])

        baseline = kwargs['baseline']()

        env = normalize(kwargs['env']())

        policy = GaussianMLPPolicy(
            name="policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            hidden_sizes=kwargs['hidden_sizes'],
            learn_std=kwargs['learn_std'],
            hidden_nonlinearity=kwargs['hidden_nonlinearity'],
            output_nonlinearity=kwargs['output_nonlinearity'],
            init_std=kwargs['init_std'],
            squashed=kwargs['squashed'])

        # Load policy here

        sampler = Sampler(
            env=env,
            policy=policy,
            num_rollouts=kwargs['num_rollouts'],
            max_path_length=kwargs['max_path_length'],
            n_parallel=kwargs['n_parallel'],
        )

        sample_processor = SingleSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )

        algo = TRPO(
            policy=policy,
            step_size=kwargs['step_size'],
        )

        trainer = Trainer(
            algo=algo,
            policy=policy,
            env=env,
            sampler=sampler,
            sample_processor=sample_processor,
            n_itr=kwargs['n_itr'],
            sess=sess,
        )

        trainer.train()
Exemplo n.º 2
0
def run_base(exp_dir, **kwargs):
    config = ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get(
        'gpu_frac', 0.95)

    # Instantiate classes
    set_seed(kwargs['seed'])

    baseline = kwargs['baseline']()

    if kwargs['env'] == 'Ant':
        env = normalize(AntEnv())
        simulation_sleep = 0.05 * kwargs['num_rollouts'] * kwargs[
            'max_path_length'] * kwargs['simulation_sleep_frac']
    elif kwargs['env'] == 'HalfCheetah':
        env = normalize(HalfCheetahEnv())
        simulation_sleep = 0.05 * kwargs['num_rollouts'] * kwargs[
            'max_path_length'] * kwargs['simulation_sleep_frac']
    elif kwargs['env'] == 'Hopper':
        env = normalize(HopperEnv())
        simulation_sleep = 0.008 * kwargs['num_rollouts'] * kwargs[
            'max_path_length'] * kwargs['simulation_sleep_frac']
    elif kwargs['env'] == 'Walker2d':
        env = normalize(Walker2dEnv())
        simulation_sleep = 0.008 * kwargs['num_rollouts'] * kwargs[
            'max_path_length'] * kwargs['simulation_sleep_frac']
    else:
        raise NotImplementedError

    policy = GaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        hidden_sizes=kwargs['policy_hidden_sizes'],
        learn_std=kwargs['policy_learn_std'],
        hidden_nonlinearity=kwargs['policy_hidden_nonlinearity'],
        output_nonlinearity=kwargs['policy_output_nonlinearity'],
    )

    dynamics_model = MLPDynamicsEnsemble(
        'dynamics-ensemble',
        env=env,
        num_models=kwargs['num_models'],
        hidden_nonlinearity=kwargs['dyanmics_hidden_nonlinearity'],
        hidden_sizes=kwargs['dynamics_hidden_sizes'],
        output_nonlinearity=kwargs['dyanmics_output_nonlinearity'],
        learning_rate=kwargs['dynamics_learning_rate'],
        batch_size=kwargs['dynamics_batch_size'],
        buffer_size=kwargs['dynamics_buffer_size'],
        rolling_average_persitency=kwargs['rolling_average_persitency'],
    )
    '''-------- dumps and reloads -----------------'''

    baseline_pickle = pickle.dumps(baseline)
    env_pickle = pickle.dumps(env)

    receiver, sender = Pipe()
    p = Process(
        target=init_vars,
        name="init_vars",
        args=(sender, config, policy, dynamics_model),
        daemon=True,
    )
    p.start()
    policy_pickle, dynamics_model_pickle = receiver.recv()
    receiver.close()
    '''-------- following classes depend on baseline, env, policy, dynamics_model -----------'''

    worker_data_feed_dict = {
        'env_sampler': {
            'num_rollouts': kwargs['num_rollouts'],
            'max_path_length': kwargs['max_path_length'],
            'n_parallel': kwargs['n_parallel'],
        },
        'dynamics_sample_processor': {
            'discount': kwargs['discount'],
            'gae_lambda': kwargs['gae_lambda'],
            'normalize_adv': kwargs['normalize_adv'],
            'positive_adv': kwargs['positive_adv'],
        },
    }

    worker_model_feed_dict = {}

    worker_policy_feed_dict = {
        'model_sampler': {
            'num_rollouts': kwargs['imagined_num_rollouts'],
            'max_path_length': kwargs['max_path_length'],
            'deterministic': kwargs['deterministic'],
        },
        'model_sample_processor': {
            'discount': kwargs['discount'],
            'gae_lambda': kwargs['gae_lambda'],
            'normalize_adv': kwargs['normalize_adv'],
            'positive_adv': kwargs['positive_adv'],
        },
        'algo': {
            'learning_rate': kwargs['learning_rate'],
            'clip_eps': kwargs['clip_eps'],
            'max_epochs': kwargs['num_ppo_steps'],
        }
    }

    trainer = ParallelTrainer(
        exp_dir=exp_dir,
        algo_str=kwargs['algo'],
        policy_pickle=policy_pickle,
        env_pickle=env_pickle,
        baseline_pickle=baseline_pickle,
        dynamics_model_pickle=dynamics_model_pickle,
        feed_dicts=[
            worker_data_feed_dict, worker_model_feed_dict,
            worker_policy_feed_dict
        ],
        n_itr=kwargs['n_itr'],
        flags_need_query=kwargs['flags_need_query'],
        config=config,
        simulation_sleep=simulation_sleep,
    )

    trainer.train()
Exemplo n.º 3
0
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME + '/' + kwargs.get(
        'exp_name', '')
    logger.configure(dir=exp_dir,
                     format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last')
    json.dump(kwargs,
              open(exp_dir + '/params.json', 'w'),
              indent=2,
              sort_keys=True,
              cls=ClassEncoder)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get(
        'gpu_frac', 0.95)
    sess = tf.Session(config=config)
    with sess.as_default() as sess:

        # Instantiate classes
        set_seed(kwargs['seed'])

        baseline = kwargs['baseline']()

        if not kwargs['use_images']:
            env = normalize(kwargs['env']())

        else:
            vae = VAE(latent_dim=8)
            env = image_wrapper(normalize(kwargs['env']()),
                                vae=vae,
                                latent_dim=32)

        policy = NNPolicy(
            name="policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            hidden_sizes=kwargs['hidden_sizes'],
            normalization=None,
        )

        dynamics_model = MLPDynamicsEnsemble(
            'dynamics-ensemble',
            env=env,
            num_models=kwargs['num_models'],
            hidden_nonlinearity=kwargs['dyanmics_hidden_nonlinearity'],
            hidden_sizes=kwargs['dynamics_hidden_sizes'],
            output_nonlinearity=kwargs['dyanmics_output_nonlinearity'],
            learning_rate=kwargs['dynamics_learning_rate'],
            batch_size=kwargs['dynamics_batch_size'],
            buffer_size=kwargs['dynamics_buffer_size'],
        )

        # dynamics_model = None
        assert kwargs['rollouts_per_policy'] % kwargs['num_models'] == 0

        env_sampler = Sampler(
            env=env,
            policy=policy,
            num_rollouts=kwargs['num_rollouts'],
            max_path_length=kwargs['max_path_length'],
            n_parallel=kwargs['num_rollouts'],
        )

        # TODO: I'm not sure if it works with more than one rollout per model

        model_sampler = ARSSampler(
            env=env,
            policy=policy,
            dynamics_model=dynamics_model,
            rollouts_per_policy=kwargs['rollouts_per_policy'],
            max_path_length=kwargs['horizon'],
            num_deltas=kwargs['num_deltas'],
            n_parallel=1,
        )

        dynamics_sample_processor = ModelSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )

        ars_sample_processor = ARSSamplerProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
            uncertainty_coeff=kwargs['uncertainty_coeff'])

        algo = RandomSearchOptimizer(policy=policy,
                                     learning_rate=kwargs['learning_rate'],
                                     num_deltas=kwargs['num_deltas'],
                                     percentile=kwargs['percentile'])

        trainer = Trainer(
            algo=algo,
            policy=policy,
            env=env,
            model_sampler=model_sampler,
            env_sampler=env_sampler,
            ars_sample_processor=ars_sample_processor,
            dynamics_sample_processor=dynamics_sample_processor,
            dynamics_model=dynamics_model,
            num_deltas=kwargs['num_deltas'],
            n_itr=kwargs['n_itr'],
            dynamics_model_max_epochs=kwargs['dynamics_max_epochs'],
            log_real_performance=kwargs['log_real_performance'],
            steps_per_iter=kwargs['steps_per_iter'],
            delta_std=kwargs['delta_std'],
            sess=sess,
            initial_random_samples=True,
            sample_from_buffer=kwargs['sample_from_buffer'])

        trainer.train()
Exemplo n.º 4
0
def run_experiment(**kwargs):

    exp_dir = os.getcwd(
    ) + '/data/parallel_mb_ppo/' + EXP_NAME + '/' + kwargs.get('exp_name', '')
    print("\n---------- running experiment {} ---------------------------".
          format(exp_dir))
    logger.configure(dir=exp_dir,
                     format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last')
    json.dump(kwargs,
              open(exp_dir + '/params.json', 'w'),
              indent=2,
              sort_keys=True,
              cls=ClassEncoder)
    config = ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get(
        'gpu_frac', 0.95)

    # Instantiate classes
    set_seed(kwargs['seed'])

    baseline = kwargs['baseline']()

    env = normalize(kwargs['env']())  # Wrappers?

    policy = GaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        hidden_sizes=kwargs['policy_hidden_sizes'],
        learn_std=kwargs['policy_learn_std'],
        hidden_nonlinearity=kwargs['policy_hidden_nonlinearity'],
        output_nonlinearity=kwargs['policy_output_nonlinearity'],
    )

    dynamics_model = MLPDynamicsEnsemble(
        'dynamics-ensemble',
        env=env,
        num_models=kwargs['num_models'],
        hidden_nonlinearity=kwargs['dyanmics_hidden_nonlinearity'],
        hidden_sizes=kwargs['dynamics_hidden_sizes'],
        output_nonlinearity=kwargs['dyanmics_output_nonlinearity'],
        learning_rate=kwargs['dynamics_learning_rate'],
        batch_size=kwargs['dynamics_batch_size'],
        buffer_size=kwargs['dynamics_buffer_size'],
    )
    '''-------- dumps and reloads -----------------'''

    baseline_pickle = pickle.dumps(baseline)
    env_pickle = pickle.dumps(env)

    receiver, sender = Pipe()
    p = Process(
        target=init_vars,
        name="init_vars",
        args=(sender, config, policy, dynamics_model),
        daemon=True,
    )
    p.start()
    policy_pickle, dynamics_model_pickle = receiver.recv()
    receiver.close()
    '''-------- following classes depend on baseline, env, policy, dynamics_model -----------'''

    worker_data_feed_dict = {
        'env_sampler': {
            'num_rollouts': kwargs['num_rollouts'],
            'max_path_length': kwargs['max_path_length'],
            'n_parallel': kwargs['n_parallel'],
        },
        'dynamics_sample_processor': {
            'discount': kwargs['discount'],
            'gae_lambda': kwargs['gae_lambda'],
            'normalize_adv': kwargs['normalize_adv'],
            'positive_adv': kwargs['positive_adv'],
        },
    }

    worker_model_feed_dict = {}

    worker_policy_feed_dict = {
        'model_sampler': {
            'num_rollouts': kwargs['imagined_num_rollouts'],
            'max_path_length': kwargs['max_path_length'],
            'dynamics_model': dynamics_model,
            'deterministic': kwargs['deterministic'],
        },
        'model_sample_processor': {
            'discount': kwargs['discount'],
            'gae_lambda': kwargs['gae_lambda'],
            'normalize_adv': kwargs['normalize_adv'],
            'positive_adv': kwargs['positive_adv'],
        },
        'algo': {
            'learning_rate': kwargs['learning_rate'],
            'clip_eps': kwargs['clip_eps'],
            'max_epochs': kwargs['num_ppo_steps'],
        }
    }

    trainer = ParallelTrainer(
        policy_pickle=policy_pickle,
        env_pickle=env_pickle,
        baseline_pickle=baseline_pickle,
        dynamics_model_pickle=dynamics_model_pickle,
        feed_dicts=[
            worker_data_feed_dict, worker_model_feed_dict,
            worker_policy_feed_dict
        ],
        n_itr=kwargs['n_itr'],
        dynamics_model_max_epochs=kwargs['dynamics_max_epochs'],
        log_real_performance=kwargs['log_real_performance'],
        steps_per_iter=kwargs['steps_per_iter'],
        flags_need_query=kwargs['flags_need_query'],
        config=config,
        simulation_sleep=kwargs['simulation_sleep'],
    )

    trainer.train()
Exemplo n.º 5
0
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME
    logger.configure(dir=exp_dir,
                     format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last_gap',
                     snapshot_gap=50)
    json.dump(kwargs,
              open(exp_dir + '/params.json', 'w'),
              indent=2,
              sort_keys=True,
              cls=ClassEncoder)

    # Instantiate classes
    set_seed(kwargs['seed'])

    baseline = kwargs['baseline']()

    env = normalize(kwargs['env']())  # Wrappers?

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),  # Todo...?
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=kwargs['meta_batch_size'],
        hidden_sizes=kwargs['hidden_sizes'],
        learn_std=kwargs['learn_std'],
        hidden_nonlinearity=kwargs['hidden_nonlinearity'],
        output_nonlinearity=kwargs['output_nonlinearity'],
    )

    # Load policy here

    sampler = MAMLSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=kwargs['rollouts_per_meta_task'],
        meta_batch_size=kwargs['meta_batch_size'],
        max_path_length=kwargs['max_path_length'],
        parallel=kwargs['parallel'],
    )

    sample_processor = DiceMAMLSampleProcessor(
        baseline=baseline,
        max_path_length=kwargs['max_path_length'],
        discount=kwargs['discount'],
        normalize_adv=kwargs['normalize_adv'],
        positive_adv=kwargs['positive_adv'],
    )

    algo = DICEMAML(policy=policy,
                    max_path_length=kwargs['max_path_length'],
                    meta_batch_size=kwargs['meta_batch_size'],
                    num_inner_grad_steps=kwargs['num_inner_grad_steps'],
                    inner_lr=kwargs['inner_lr'],
                    learning_rate=kwargs['learning_rate'])

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=kwargs['n_itr'],
        num_inner_grad_steps=kwargs['num_inner_grad_steps'],
    )

    trainer.train()
Exemplo n.º 6
0
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME + kwargs.get('exp_name', '')
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'], snapshot_mode='last')
    json.dump(kwargs, open(exp_dir + '/params.json', 'w'), indent=2, sort_keys=True, cls=ClassEncoder)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get('gpu_frac', 0.95)
    sess = tf.Session(config=config)
    with sess.as_default() as sess:
        # Instantiate classes
        set_seed(kwargs['seed'])

        baseline = kwargs['baseline']()

        env = normalize(kwargs['env']()) # Wrappers?

        policy = GaussianMLPPolicy(
            name="meta-policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            hidden_sizes=kwargs['policy_hidden_sizes'],
            learn_std=kwargs['policy_learn_std'],
            hidden_nonlinearity=kwargs['policy_hidden_nonlinearity'],
            output_nonlinearity=kwargs['policy_output_nonlinearity'],
        )

        dynamics_model = MLPDynamicsEnsemble('dynamics-ensemble',
                                             env=env,
                                             num_models=kwargs['num_models'],
                                             hidden_nonlinearity=kwargs['dyanmics_hidden_nonlinearity'],
                                             hidden_sizes=kwargs['dynamics_hidden_sizes'],
                                             output_nonlinearity=kwargs['dyanmics_output_nonlinearity'],
                                             learning_rate=kwargs['dynamics_learning_rate'],
                                             batch_size=kwargs['dynamics_batch_size'],
                                             buffer_size=kwargs['dynamics_buffer_size'],
                                             rolling_average_persitency=kwargs['rolling_average_persitency']
                                             )

        env_sampler = Sampler(
            env=env,
            policy=policy,
            num_rollouts=kwargs['num_rollouts'],
            max_path_length=kwargs['max_path_length'],
            n_parallel=kwargs['n_parallel'],
        )

        model_sampler = METRPOSampler(
            env=env,
            policy=policy,
            dynamics_model=dynamics_model,
            num_rollouts=kwargs['imagined_num_rollouts'],
            max_path_length=kwargs['max_path_length'],
            deterministic=kwargs['deterministic'],
        )

        dynamics_sample_processor = ModelSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )

        model_sample_processor = SampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )

        algo = TRPO(
            policy=policy,
            step_size=kwargs['step_size'],
        )

        trainer = Trainer(
            algo=algo,
            policy=policy,
            env=env,
            model_sampler=model_sampler,
            env_sampler=env_sampler,
            model_sample_processor=model_sample_processor,
            dynamics_sample_processor=dynamics_sample_processor,
            dynamics_model=dynamics_model,
            n_itr=kwargs['n_itr'],
            dynamics_model_max_epochs=kwargs['dynamics_max_epochs'],
            log_real_performance=kwargs['log_real_performance'],
            steps_per_iter=kwargs['steps_per_iter'],
            sample_from_buffer=kwargs['sample_from_buffer'],
            sess=sess,
        )

        trainer.train()
Exemplo n.º 7
0
def run_experiment(**config):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME
    logger.configure(dir=exp_dir,
                     format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last_gap',
                     snapshot_gap=50)
    json.dump(config,
              open(exp_dir + '/params.json', 'w'),
              indent=2,
              sort_keys=True,
              cls=ClassEncoder)

    # Instantiate classes
    set_seed(config['seed'])

    baseline = config['baseline']()

    env = normalize(config['env']())  # Wrappers?

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
        learn_std=config['learn_std'],
        hidden_nonlinearity=config['hidden_nonlinearity'],
        output_nonlinearity=config['output_nonlinearity'],
    )

    # Load policy here

    sampler = MetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )

    sample_processor = MAMLSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
        positive_adv=config['positive_adv'],
    )

    algo = PPOMAML(
        policy=policy,
        inner_lr=config['inner_lr'],
        meta_batch_size=config['meta_batch_size'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        learning_rate=config['learning_rate'],
        num_ppo_steps=config['num_ppo_steps'],
        num_minibatches=config['num_minibatches'],
        clip_eps=config['clip_eps'],
        clip_outer=config['clip_outer'],
        target_outer_step=config['target_outer_step'],
        target_inner_step=config['target_inner_step'],
        init_outer_kl_penalty=config['init_outer_kl_penalty'],
        init_inner_kl_penalty=config['init_inner_kl_penalty'],
        adaptive_outer_kl_penalty=config['adaptive_outer_kl_penalty'],
        adaptive_inner_kl_penalty=config['adaptive_inner_kl_penalty'],
        anneal_factor=config['anneal_factor'],
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
    )

    trainer.train()
Exemplo n.º 8
0
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME
    logger.configure(dir=exp_dir,
                     format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last_gap',
                     snapshot_gap=50)
    json.dump(kwargs,
              open(exp_dir + '/params.json', 'w'),
              indent=2,
              sort_keys=True,
              cls=ClassEncoder)

    # Instantiate classes
    set_seed(kwargs['seed'])
    sess = tf.Session()

    with sess.as_default() as sess:
        config = json.load(open(osp.join(kwargs['path'], 'params.json'), 'r'))
        data = joblib.load(osp.join(kwargs['path'], 'params.pkl'))
        policy = data['policy']
        env = data['env']
        baseline = data['baseline']

        if kwargs['rollouts_per_meta_task'] is None:
            rollouts_per_meta_task = int(
                np.ceil(config['rollouts_per_meta_task'] /
                        config['meta_batch_size']))
        else:
            rollouts_per_meta_task = kwargs['rollouts_per_meta_task']

        sampler = MAMLSampler(
            env=env,
            policy=policy,
            rollouts_per_meta_task=rollouts_per_meta_task,
            meta_batch_size=config['meta_batch_size'],
            max_path_length=kwargs['max_path_length'],
            parallel=kwargs['parallel'],
        )

        sample_processor = SampleProcessor(
            baseline=baseline,
            discount=config['discount'],
            normalize_adv=config['normalize_adv'],
            positive_adv=config['positive_adv'],
        )

        algo = VPG(
            policy=policy,
            learning_rate=config['inner_lr'],
        )

        tester = Tester(
            algo=algo,
            policy=policy,
            env=env,
            sampler=sampler,
            sample_processor=sample_processor,
            n_itr=kwargs['n_itr'],
            sess=sess,
            task=None,
        )

        tester.train()
Exemplo n.º 9
0
def run_experiment(**kwargs):

    num = Num()
    exp_name = EXP_NAME + str(num.EXP_NUM)

    exp_dir = os.getcwd() + '/data/video_peg/' + EXP_NAME + kwargs.get(
        'exp_name', '')
    logger.configure(dir=exp_dir,
                     format_strs=['csv', 'stdout', 'log'],
                     snapshot_mode='all')  #change to all
    json.dump(kwargs,
              open(exp_dir + '/params.json', 'w'),
              indent=2,
              sort_keys=True,
              cls=ClassEncoder)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get(
        'gpu_frac', 0.95)
    sess = tf.Session(config=config)
    Num.EXP_NUM += 1
    with sess.as_default() as sess:

        # Instantiate classesLogger
        set_seed(kwargs['seed'])

        baseline = kwargs['baseline']()

        env = normalize(kwargs['env']())  # Wrappers?

        policy = MetaGaussianMLPPolicy(
            name="meta-policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            meta_batch_size=kwargs['meta_batch_size'],
            hidden_sizes=kwargs['policy_hidden_sizes'],
            learn_std=kwargs['policy_learn_std'],
            hidden_nonlinearity=kwargs['policy_hidden_nonlinearity'],
            output_nonlinearity=kwargs['policy_output_nonlinearity'],
        )

        dynamics_model = MLPDynamicsEnsemble(
            'dynamics-ensemble',
            env=env,
            num_models=kwargs['num_models'],
            hidden_nonlinearity=kwargs['dyanmics_hidden_nonlinearity'],
            hidden_sizes=kwargs['dynamics_hidden_sizes'],
            output_nonlinearity=kwargs['dyanmics_output_nonlinearity'],
            learning_rate=kwargs['dynamics_learning_rate'],
            batch_size=kwargs['dynamics_batch_size'],
            buffer_size=kwargs['dynamics_buffer_size'],
        )

        env_sampler = BaseSampler(
            env=env,
            policy=policy,
            # rollouts_per_meta_task=kwargs['real_env_rollouts_per_meta_task'],
            num_rollouts=kwargs['meta_batch_size'],
            max_path_length=kwargs['max_path_length'],
            sleep_reset=2.5,
            #parallel=kwargs['parallel'],
            # parallel=False
        )

        model_sampler = MBMPOSampler(
            env=env,
            policy=policy,
            rollouts_per_meta_task=kwargs['rollouts_per_meta_task'],
            meta_batch_size=kwargs['meta_batch_size'],
            max_path_length=kwargs['max_path_length'],
            dynamics_model=dynamics_model,
            deterministic=kwargs['deterministic'],
        )

        dynamics_sample_processor = ModelSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )

        model_sample_processor = MAMLSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )

        algo = TRPOMAML(
            policy=policy,
            step_size=kwargs['step_size'],
            inner_type=kwargs['inner_type'],
            inner_lr=kwargs['inner_lr'],
            meta_batch_size=kwargs['meta_batch_size'],
            num_inner_grad_steps=kwargs['num_inner_grad_steps'],
            exploration=kwargs['exploration'],
        )

        trainer = Trainer(
            algo=algo,
            policy=policy,
            env=env,
            model_sampler=model_sampler,
            env_sampler=env_sampler,
            model_sample_processor=model_sample_processor,
            dynamics_sample_processor=dynamics_sample_processor,
            dynamics_model=dynamics_model,
            n_itr=kwargs['n_itr'],
            num_inner_grad_steps=kwargs['num_inner_grad_steps'],
            dynamics_model_max_epochs=kwargs['dynamics_max_epochs'],
            log_real_performance=kwargs['log_real_performance'],
            meta_steps_per_iter=kwargs['meta_steps_per_iter'],
            sample_from_buffer=True,
            sess=sess,
        )

        trainer.train()
Exemplo n.º 10
0
def run_experiment(**kwargs):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get(
        'gpu_frac', 0.95)
    sess = tf.Session(config=config)

    with sess.as_default() as sess:
        exp_dir = os.getcwd() + '/data/' + EXP_NAME + '/' + kwargs.get(
            'exp_name', '')
        logger.configure(dir=exp_dir,
                         format_strs=['stdout', 'log', 'csv'],
                         snapshot_mode='last')
        json.dump(kwargs,
                  open(exp_dir + '/params.json', 'w'),
                  indent=2,
                  sort_keys=True,
                  cls=ClassEncoder)

        # Instantiate classes
        set_seed(kwargs['seed'])

        env = normalize(kwargs['env']())  # Wrappers?

        baseline = NNValueFun(
            'value-function',
            env,
            hidden_nonlinearity=kwargs['vfun_hidden_nonlinearity'],
            hidden_sizes=kwargs['vfun_hidden_sizes'],
            output_nonlinearity=kwargs['vfun_output_nonlinearity'],
            learning_rate=kwargs['vfun_learning_rate'],
            batch_size=kwargs['vfun_batch_size'],
            buffer_size=kwargs['vfun_buffer_size'],
            normalize_input=False,
        )

        policy = GaussianMLPPolicy(
            name="policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            hidden_sizes=kwargs['policy_hidden_sizes'],
            learn_std=kwargs['policy_learn_std'],
            output_nonlinearity=kwargs['policy_output_nonlinearity'],
        )

        dynamics_model = MLPDynamicsModel(
            'prob-dynamics',
            env=env,
            hidden_nonlinearity=kwargs['dyanmics_hidden_nonlinearity'],
            hidden_sizes=kwargs['dynamics_hidden_sizes'],
            output_nonlinearity=kwargs['dyanmics_output_nonlinearity'],
            learning_rate=kwargs['dynamics_learning_rate'],
            batch_size=kwargs['dynamics_batch_size'],
            buffer_size=kwargs['dynamics_buffer_size'],
            normalize_input=False,
        )

        assert kwargs['num_rollouts'] % kwargs['n_parallel'] == 0

        sampler = Sampler(
            env=env,
            policy=policy,
            num_rollouts=kwargs['num_rollouts'],
            max_path_length=kwargs['max_path_length'],
            n_parallel=kwargs['n_parallel'],
        )

        sample_processor = ModelSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )

        algo = SVG1(
            policy=policy,
            dynamics_model=dynamics_model,
            value_function=baseline,
            tf_reward=env.tf_reward,
            learning_rate=kwargs['svg_learning_rate'],
            num_grad_steps=kwargs['num_rollouts'] *
            kwargs['max_path_length'] // kwargs['svg_batch_size'],
            batch_size=kwargs['svg_batch_size'],
            discount=kwargs['discount'],
            kl_penalty=kwargs['kl_penalty'],
        )

        trainer = Trainer(
            algo=algo,
            policy=policy,
            env=env,
            sampler=sampler,
            sample_processor=sample_processor,
            dynamics_model=dynamics_model,
            value_function=baseline,
            n_itr=kwargs['n_itr'],
            dynamics_model_max_epochs=kwargs['dynamics_max_epochs'],
            vfun_max_epochs=kwargs['vfun_max_epochs'],
            sess=sess,
        )

        trainer.train()
Exemplo n.º 11
0
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'], snapshot_mode='last_gap', snapshot_gap=50)
    json.dump(kwargs, open(exp_dir + '/params.json', 'w'), indent=2, sort_keys=True, cls=ClassEncoder)

    # Instantiate classes
    set_seed(kwargs['seed'])

    baseline = kwargs['baseline']()

    env = normalize(kwargs['env']()) # Wrappers?

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=kwargs['meta_batch_size'],
        hidden_sizes=kwargs['policy_hidden_sizes'],
        learn_std=kwargs['policy_learn_std'],
        hidden_nonlinearity=kwargs['policy_hidden_nonlinearity'],
        output_nonlinearity=kwargs['policy_output_nonlinearity'],
    )

    dynamics_model = MLPDynamicsEnsemble('dynamics-ensemble',
                                         env=env,
                                         num_models=kwargs['num_models'],
                                         hidden_nonlinearity=kwargs['dyanmics_hidden_nonlinearity'],
                                         hidden_sizes=kwargs['dynamics_hidden_sizes'],
                                         output_nonlinearity=kwargs['dyanmics_output_nonlinearity'],
                                         learning_rate=kwargs['dynamics_learning_rate'],
                                         batch_size=kwargs['dynamics_batch_size'],
                                         buffer_size=kwargs['dynamics_buffer_size'],

                                         )
    env_sampler = SingleMetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=kwargs['real_env_rollouts_per_meta_task'],
        meta_batch_size=kwargs['meta_batch_size'],
        max_path_length=kwargs['max_path_length'],
        parallel=kwargs['parallel'],
    )

    model_sampler = MBMPOSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=kwargs['rollouts_per_meta_task'],
        meta_batch_size=kwargs['meta_batch_size'],
        max_path_length=kwargs['max_path_length'],
        dynamics_model=dynamics_model,
    )

    dynamics_sample_processor = ModelSampleProcessor(
        baseline=baseline,
        discount=kwargs['discount'],
        gae_lambda=kwargs['gae_lambda'],
        normalize_adv=kwargs['normalize_adv'],
        positive_adv=kwargs['positive_adv'],
    )

    model_sample_processor = MAMLSampleProcessor(
        baseline=baseline,
        discount=kwargs['discount'],
        gae_lambda=kwargs['gae_lambda'],
        normalize_adv=kwargs['normalize_adv'],
        positive_adv=kwargs['positive_adv'],
    )

    algo = TRPOMAML(
        policy=policy,
        step_size=kwargs['step_size'],
        inner_type=kwargs['inner_type'],
        inner_lr=kwargs['inner_lr'],
        meta_batch_size=kwargs['meta_batch_size'],
        num_inner_grad_steps=kwargs['num_inner_grad_steps'],
        exploration=kwargs['exploration'],
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        model_sampler=model_sampler,
        env_sampler=env_sampler,
        model_sample_processor=model_sample_processor,
        dynamics_sample_processor=dynamics_sample_processor,
        dynamics_model=dynamics_model,
        n_itr=kwargs['n_itr'],
        num_inner_grad_steps=kwargs['num_inner_grad_steps'],
        dynamics_model_max_epochs=kwargs['dynamics_max_epochs'],
        log_real_performance=kwargs['log_real_performance'],
        meta_steps_per_iter=kwargs['meta_steps_per_iter'],
        initial_random_samples=True,
        sample_from_buffer=True,
    )

    trainer.train()