示例#1
0
    def prepare_start(self, policy_pickle, env_pickle, baseline_pickle,
                      feed_dict, config, initial_random_samples):
        """Create the session, build the sampler/processor and do one step.

        A new ``tf.Session`` is created from ``config`` and stored on
        ``self.sess``; it is not closed by this method. Everything else
        runs with that session installed as the default session.

        Args:
            policy_pickle: pickled policy, handed to the sampler.
            env_pickle: pickled environment, stored on ``self.env`` and
                handed to the sampler.
            baseline_pickle: pickled baseline, handed to the sample
                processor.
            feed_dict: mapping whose ``'env_sampler'`` entry holds extra
                keyword arguments for ``Sampler`` and whose
                ``'dynamics_sample_processor'`` entry holds extra keyword
                arguments for ``ModelSampleProcessor``.
            config: forwarded to ``tf.Session(config=...)``.
            initial_random_samples: forwarded to ``self.step`` as its
                ``random`` argument.

        Returns:
            Always ``1``.
        """
        import tensorflow as tf

        session = tf.Session(config=config)
        self.sess = session
        with session.as_default():
            # ---------------------- Construct instances ----------------------

            from asynch_mb.samplers.sampler import Sampler
            from asynch_mb.samplers.mb_sample_processor import ModelSampleProcessor

            # NOTE(review): pickle.loads executes arbitrary code; the
            # payloads are presumably produced by the launching process.
            loaded_env = pickle.loads(env_pickle)
            loaded_policy = pickle.loads(policy_pickle)
            loaded_baseline = pickle.loads(baseline_pickle)

            # Variables are initialized only after all objects are unpickled.
            session.run(tf.initializers.global_variables())

            self.env = loaded_env

            sampler_kwargs = feed_dict['env_sampler']
            self.env_sampler = Sampler(
                env=loaded_env,
                policy=loaded_policy,
                **sampler_kwargs
            )

            processor_kwargs = feed_dict['dynamics_sample_processor']
            self.dynamics_sample_processor = ModelSampleProcessor(
                baseline=loaded_baseline,
                **processor_kwargs
            )

            # ------------------------- Step and push -------------------------

            first_batch = self.step(random=initial_random_samples)
            self.push(first_batch)

        return 1
示例#2
0
    def construct_from_feed_dict(
        self,
        policy_pickle,
        env_pickle,
        baseline_pickle,  # UNUSED
        dynamics_model_pickle,
        feed_dict,
    ):
        """Unpickle env and policy and build the sampler and sample processor.

        Args:
            policy_pickle: pickled policy, handed to the sampler.
            env_pickle: pickled environment, stored on ``self.env`` and
                handed to the sampler.
            baseline_pickle: not used by this method.
            dynamics_model_pickle: not read anywhere in this method either.
            feed_dict: mapping whose ``'sampler'`` entry holds extra keyword
                arguments for ``Sampler`` and whose ``'sample_processor'``
                entry holds all keyword arguments for
                ``ModelSampleProcessor``.
        """

        from asynch_mb.samplers.sampler import Sampler
        from asynch_mb.samplers.mb_sample_processor import ModelSampleProcessor

        # NOTE(review): pickle.loads executes arbitrary code; the payloads
        # are presumably produced by the launching process.
        loaded_env = pickle.loads(env_pickle)
        loaded_policy = pickle.loads(policy_pickle)

        self.env = loaded_env

        sampler_kwargs = feed_dict['sampler']
        self.env_sampler = Sampler(
            env=loaded_env, policy=loaded_policy, **sampler_kwargs)

        processor_kwargs = feed_dict['sample_processor']
        self.dynamics_sample_processor = ModelSampleProcessor(
            **processor_kwargs)
示例#3
0
def run_experiment(**kwargs):
    """Configure logging, build all components and run the trainer.

    The experiment directory is ``<cwd>/data/<EXP_NAME><exp_name>``. The
    full ``kwargs`` mapping is written there as ``params.json`` before any
    component is built.

    Keyword Args:
        exp_name (optional): suffix appended to the experiment directory
            name; defaults to ``''``.
        gpu_frac (optional): value for
            ``per_process_gpu_memory_fraction``; defaults to ``0.95``.
        seed: passed to ``set_seed``.
        baseline, env: zero-argument callables returning the baseline and
            the (un-normalized) environment.
        policy_hidden_sizes, policy_learn_std, policy_hidden_nonlinearity,
        policy_output_nonlinearity: forwarded to ``GaussianMLPPolicy``.
        num_models, dyanmics_hidden_nonlinearity, dynamics_hidden_sizes,
        dyanmics_output_nonlinearity, dynamics_learning_rate,
        dynamics_batch_size, dynamics_buffer_size,
        rolling_average_persitency: forwarded to ``MLPDynamicsEnsemble``.
        num_rollouts, max_path_length, n_parallel: forwarded to ``Sampler``.
        imagined_num_rollouts, deterministic: forwarded to
            ``METRPOSampler`` (together with ``max_path_length``).
        discount, gae_lambda, normalize_adv, positive_adv: forwarded to
            both sample processors.
        step_size: forwarded to ``TRPO``.
        n_itr, dynamics_max_epochs, log_real_performance, steps_per_iter,
        sample_from_buffer: forwarded to ``Trainer``.

    Raises:
        KeyError: if any non-optional key listed above is missing.
    """
    exp_dir = os.getcwd() + '/data/' + EXP_NAME + kwargs.get('exp_name', '')
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'], snapshot_mode='last')

    # Use a context manager so the parameter dump is flushed and the file
    # handle is released even if the encoder raises part-way through.
    with open(exp_dir + '/params.json', 'w') as params_file:
        json.dump(kwargs, params_file, indent=2, sort_keys=True, cls=ClassEncoder)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get('gpu_frac', 0.95)
    sess = tf.Session(config=config)
    with sess.as_default() as sess:
        # Instantiate classes
        set_seed(kwargs['seed'])

        baseline = kwargs['baseline']()

        env = normalize(kwargs['env']())  # Wrappers?

        policy = GaussianMLPPolicy(
            name="meta-policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            hidden_sizes=kwargs['policy_hidden_sizes'],
            learn_std=kwargs['policy_learn_std'],
            hidden_nonlinearity=kwargs['policy_hidden_nonlinearity'],
            output_nonlinearity=kwargs['policy_output_nonlinearity'],
        )

        # NOTE: the misspelled keys ('dyanmics_*', 'rolling_average_persitency')
        # are part of the existing configuration interface; callers pass them
        # with exactly these spellings, so they must not be "corrected" here.
        dynamics_model = MLPDynamicsEnsemble('dynamics-ensemble',
                                             env=env,
                                             num_models=kwargs['num_models'],
                                             hidden_nonlinearity=kwargs['dyanmics_hidden_nonlinearity'],
                                             hidden_sizes=kwargs['dynamics_hidden_sizes'],
                                             output_nonlinearity=kwargs['dyanmics_output_nonlinearity'],
                                             learning_rate=kwargs['dynamics_learning_rate'],
                                             batch_size=kwargs['dynamics_batch_size'],
                                             buffer_size=kwargs['dynamics_buffer_size'],
                                             rolling_average_persitency=kwargs['rolling_average_persitency']
                                             )

        # Sampler that collects rollouts from the real environment.
        env_sampler = Sampler(
            env=env,
            policy=policy,
            num_rollouts=kwargs['num_rollouts'],
            max_path_length=kwargs['max_path_length'],
            n_parallel=kwargs['n_parallel'],
        )

        # Sampler that is additionally given the dynamics model.
        model_sampler = METRPOSampler(
            env=env,
            policy=policy,
            dynamics_model=dynamics_model,
            num_rollouts=kwargs['imagined_num_rollouts'],
            max_path_length=kwargs['max_path_length'],
            deterministic=kwargs['deterministic'],
        )

        dynamics_sample_processor = ModelSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )

        model_sample_processor = SampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )

        algo = TRPO(
            policy=policy,
            step_size=kwargs['step_size'],
        )

        trainer = Trainer(
            algo=algo,
            policy=policy,
            env=env,
            model_sampler=model_sampler,
            env_sampler=env_sampler,
            model_sample_processor=model_sample_processor,
            dynamics_sample_processor=dynamics_sample_processor,
            dynamics_model=dynamics_model,
            n_itr=kwargs['n_itr'],
            dynamics_model_max_epochs=kwargs['dynamics_max_epochs'],
            log_real_performance=kwargs['log_real_performance'],
            steps_per_iter=kwargs['steps_per_iter'],
            sample_from_buffer=kwargs['sample_from_buffer'],
            sess=sess,
        )

        trainer.train()
示例#4
0
class WorkerData(Worker):
    """Worker that samples the environment and forwards processed batches.

    Processed batches accumulate in ``self.samples_data_arr`` until
    ``push`` pickles the whole list onto ``self.queue_next``.

    Attributes such as ``queue``, ``queue_next``, ``verbose``,
    ``itr_counter``, ``n_itr`` and ``stop_cond`` are not defined here;
    presumably they are provided by the ``Worker`` base class.
    """

    def __init__(self, simulation_sleep, video=False):
        """
        Args:
            simulation_sleep: minimum duration of one ``step`` call in
                seconds; ``step`` sleeps for whatever time is left over.
            video: when truthy, the base class is initialised with
                ``snapshot_mode='gap'`` and a gap derived from
                ``simulation_sleep``; otherwise with its defaults.
        """
        base_kwargs = {}
        if video:
            base_kwargs = dict(
                snapshot_mode='gap',
                snapshot_gap=int(30 / 1250 / simulation_sleep),  # FIXME
            )
        super().__init__(**base_kwargs)

        self.simulation_sleep = simulation_sleep
        # The three components below are filled in by construct_from_feed_dict.
        self.env = None
        self.env_sampler = None
        self.dynamics_sample_processor = None
        # Batches produced by step() that have not been pushed yet.
        self.samples_data_arr = []

    def construct_from_feed_dict(
            self,
            policy_pickle,
            env_pickle,
            baseline_pickle,
            dynamics_model_pickle,
            feed_dict
    ):
        """Unpickle the components and build the sampler and processor.

        ``dynamics_model_pickle`` is accepted but not read. ``feed_dict``
        supplies extra keyword arguments under the keys ``'env_sampler'``
        and ``'dynamics_sample_processor'``.
        """

        from asynch_mb.samplers.sampler import Sampler
        from asynch_mb.samplers.mb_sample_processor import ModelSampleProcessor

        loaded_env = pickle.loads(env_pickle)
        loaded_policy = pickle.loads(policy_pickle)
        loaded_baseline = pickle.loads(baseline_pickle)

        self.env = loaded_env

        sampler_kwargs = feed_dict['env_sampler']
        self.env_sampler = Sampler(
            env=loaded_env, policy=loaded_policy, **sampler_kwargs)

        processor_kwargs = feed_dict['dynamics_sample_processor']
        self.dynamics_sample_processor = ModelSampleProcessor(
            baseline=loaded_baseline, **processor_kwargs)

    def prepare_start(self):
        """Take one item from ``self.queue``, step with it, then push."""
        first_random_flag = self.queue.get()
        self.step(first_random_flag)
        self.push()

    def step(self, random=False):
        """Collect and process one batch, pad the step time, save a snapshot.

        Args:
            random: forwarded to the sampler's ``obtain_samples``.
        """
        started_at = time.time()

        # ------------- Obtaining samples from the environment -------------

        if self.verbose:
            logger.log("Data is obtaining samples...")
        sampled_paths = self.env_sampler.obtain_samples(
            log=True, random=random, log_prefix='Data-EnvSampler-')

        # --------------- Processing environment samples -------------------

        if self.verbose:
            logger.log("Data is processing environment samples...")
        processed_batch = self.dynamics_sample_processor.process_samples(
            sampled_paths, log=True, log_prefix='Data-EnvTrajs-')

        self.samples_data_arr.append(processed_batch)
        elapsed = time.time() - started_at

        # Sleep for the remainder so a step lasts at least simulation_sleep.
        remaining = max(self.simulation_sleep - elapsed, 0)
        time.sleep(remaining)

        logger.logkv('Data-TimeStep', elapsed)
        logger.logkv('Data-TimeSleep', remaining)

        # save snapshot
        snapshot = self.get_itr_snapshot()
        logger.save_itr_params(self.itr_counter, snapshot)

    def _synch(self, policy_state_pickle):
        """Load a pickled policy state dict into the sampler's policy."""
        started_at = time.time()
        policy_state = pickle.loads(policy_state_pickle)
        assert isinstance(policy_state, dict)
        self.env_sampler.policy.set_shared_params(policy_state)
        elapsed = time.time() - started_at

        logger.logkv('Data-TimeSynch', elapsed)

    def push(self):
        """Pickle all pending batches onto ``queue_next`` and clear them."""
        started_at = time.time()
        payload = pickle.dumps(self.samples_data_arr)
        self.queue_next.put(payload)
        self.samples_data_arr = []
        elapsed = time.time() - started_at

        logger.logkv('Data-TimePush', elapsed)

    def set_stop_cond(self):
        """Set the stop condition once ``itr_counter`` reaches ``n_itr``."""
        limit_reached = self.itr_counter >= self.n_itr
        if limit_reached:
            self.stop_cond.set()

    def get_itr_snapshot(self):
        """
        Gets the current policy and env for storage
        """
        return {
            'itr': self.itr_counter,
            'policy': self.env_sampler.policy,
            'env': self.env,
        }
示例#5
0
def run_experiment(**kwargs):
    """Configure logging, build all components and run PPO training.

    The experiment directory is ``<cwd>/data/<EXP_NAME>``. The full
    ``kwargs`` mapping is written there as ``params.json`` before any
    component is built.

    Keyword Args:
        gpu_frac (optional): value for
            ``per_process_gpu_memory_fraction``; defaults to ``0.95``.
        seed: passed to ``set_seed``.
        baseline, env: zero-argument callables returning the baseline and
            the (un-normalized) environment.
        hidden_sizes, learn_std, hidden_nonlinearity, output_nonlinearity,
        init_std, squashed: forwarded to ``GaussianMLPPolicy``.
        num_rollouts, max_path_length, n_parallel: forwarded to ``Sampler``.
        discount, gae_lambda, normalize_adv, positive_adv: forwarded to
            ``SingleSampleProcessor``.
        learning_rate, clip_eps, num_ppo_steps, entropy_bonus: forwarded to
            ``PPO`` (``num_ppo_steps`` as ``max_epochs``).
        n_itr: forwarded to ``Trainer``.

    Raises:
        KeyError: if any non-optional key listed above is missing.
    """
    exp_dir = os.getcwd() + '/data/' + EXP_NAME
    logger.configure(dir=exp_dir,
                     format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last')

    # Use a context manager so the parameter dump is flushed and the file
    # handle is released even if the encoder raises part-way through.
    with open(exp_dir + '/params.json', 'w') as params_file:
        json.dump(kwargs,
                  params_file,
                  indent=2,
                  sort_keys=True,
                  cls=ClassEncoder)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get(
        'gpu_frac', 0.95)
    sess = tf.Session(config=config)
    with sess.as_default() as sess:

        # Instantiate classes
        set_seed(kwargs['seed'])

        baseline = kwargs['baseline']()

        env = normalize(kwargs['env']())

        policy = GaussianMLPPolicy(
            name="policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            hidden_sizes=kwargs['hidden_sizes'],
            learn_std=kwargs['learn_std'],
            hidden_nonlinearity=kwargs['hidden_nonlinearity'],
            output_nonlinearity=kwargs['output_nonlinearity'],
            init_std=kwargs['init_std'],
            squashed=kwargs['squashed'])

        # Load policy here

        sampler = Sampler(
            env=env,
            policy=policy,
            num_rollouts=kwargs['num_rollouts'],
            max_path_length=kwargs['max_path_length'],
            n_parallel=kwargs['n_parallel'],
        )

        sample_processor = SingleSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )

        algo = PPO(
            policy=policy,
            learning_rate=kwargs['learning_rate'],
            clip_eps=kwargs['clip_eps'],
            max_epochs=kwargs['num_ppo_steps'],
            entropy_bonus=kwargs['entropy_bonus'],
        )

        trainer = Trainer(
            algo=algo,
            policy=policy,
            env=env,
            sampler=sampler,
            sample_processor=sample_processor,
            n_itr=kwargs['n_itr'],
            sess=sess,
        )

        trainer.train()
示例#6
0
class WorkerData(Worker):
    """Worker that samples the environment and broadcasts batches via ray.

    Policy parameters are pulled from ``policy_ps`` and every processed
    batch is sent to each handle in ``data_buffers``.

    ``verbose`` and ``step_counter`` are not defined here; presumably they
    are provided by the ``Worker`` base class.
    """

    def __init__(self, policy_ps, data_buffers, time_sleep, name, exp_dir,
                 n_itr, stop_cond):
        """
        Args:
            policy_ps: handle exposing ``pull.remote()``; its result is
                expected to be a dict of policy parameters.
            data_buffers: iterable of handles exposing ``push.remote()``.
            time_sleep: minimum duration of one ``step`` call in seconds.
            name, exp_dir, n_itr, stop_cond: forwarded unchanged to the
                base-class initializer.
        """
        super().__init__(name, exp_dir, n_itr, stop_cond)
        self.policy_ps = policy_ps
        self.data_buffers = data_buffers
        self.time_sleep = time_sleep
        # The three components below are filled in by prepare_start.
        self.env = None
        self.env_sampler = None
        self.dynamics_sample_processor = None

    def prepare_start(self, policy_pickle, env_pickle, baseline_pickle,
                      feed_dict, config, initial_random_samples):
        """Create the session, build the components and do the first step.

        ``feed_dict`` supplies extra keyword arguments under the keys
        ``'env_sampler'`` and ``'dynamics_sample_processor'``;
        ``initial_random_samples`` is forwarded to ``step`` as ``random``.
        The session is stored on ``self.sess`` and not closed here.

        Returns:
            Always ``1``.
        """
        import tensorflow as tf

        session = tf.Session(config=config)
        self.sess = session
        with session.as_default():
            # ---------------------- Construct instances ----------------------

            from asynch_mb.samplers.sampler import Sampler
            from asynch_mb.samplers.mb_sample_processor import ModelSampleProcessor

            loaded_env = pickle.loads(env_pickle)
            loaded_policy = pickle.loads(policy_pickle)
            loaded_baseline = pickle.loads(baseline_pickle)

            # Variables are initialized only after all objects are unpickled.
            session.run(tf.initializers.global_variables())

            self.env = loaded_env

            sampler_kwargs = feed_dict['env_sampler']
            self.env_sampler = Sampler(
                env=loaded_env, policy=loaded_policy, **sampler_kwargs)

            processor_kwargs = feed_dict['dynamics_sample_processor']
            self.dynamics_sample_processor = ModelSampleProcessor(
                baseline=loaded_baseline, **processor_kwargs)

            # ------------------------- Step and push -------------------------

            first_batch = self.step(random=initial_random_samples)
            self.push(first_batch)

        return 1

    def step_wrapper(self):
        """Pull fresh policy parameters, step once and push the batch.

        Returns:
            The tuple ``(1, 1)``.
        """
        self.pull()
        batch = self.step()
        self.push(batch)
        return 1, 1

    def step(self, random=False):
        """Collect and process one batch, padding the step to ``time_sleep``.

        Args:
            random: forwarded to the sampler's ``obtain_samples``.

        Returns:
            Whatever the sample processor's ``process_samples`` returns.
        """
        started_at = time.time()

        # ------------- Obtaining samples from the environment -------------

        if self.verbose:
            logger.log("Data is obtaining samples...")
        sampled_paths = self.env_sampler.obtain_samples(
            log=True, random=random, log_prefix='Data-EnvSampler-')

        # --------------- Processing environment samples -------------------

        if self.verbose:
            logger.log("Data is processing environment samples...")
        processed_batch = self.dynamics_sample_processor.process_samples(
            sampled_paths, log=True, log_prefix='Data-EnvTrajs-')

        elapsed = time.time() - started_at

        # Sleep for the remainder so a step lasts at least time_sleep.
        remaining = max(self.time_sleep - elapsed, 0)
        time.sleep(remaining)

        logger.logkv('Data-TimeStep', elapsed)
        logger.logkv('Data-TimeSleep', remaining)

        return processed_batch

    def pull(self):
        """Fetch policy parameters from ``policy_ps`` and load them."""
        started_at = time.time()
        pending_params = self.policy_ps.pull.remote()
        policy_params = ray.get(pending_params)
        assert isinstance(policy_params, dict)
        self.env_sampler.policy.set_shared_params(policy_params)
        logger.logkv('Data-TimePull', time.time() - started_at)

    def push(self, samples_data):
        """Send ``samples_data`` to every data buffer."""
        started_at = time.time()
        # Store the batch once with ray.put and hand the same reference to
        # every buffer; the results of the remote calls are not waited on.
        shared_ref = ray.put(samples_data)
        for buffer_handle in self.data_buffers:
            buffer_handle.push.remote(shared_ref)
        logger.logkv('Data-TimePush', time.time() - started_at)

    def set_stop_cond(self):
        """Set the remote stop condition once ``step_counter`` reaches ``n_itr``."""
        limit_reached = self.step_counter >= self.n_itr
        if limit_reached:
            ray.get(self.stop_cond.set.remote())
示例#7
0
class WorkerData(Worker):
    """Worker that samples the environment using a policy with a dynamics model.

    Processed batches accumulate in ``self.samples_data_arr`` until
    ``push`` pickles the whole list onto ``self.queue_next``.

    Attributes such as ``queue``, ``queue_next``, ``verbose``,
    ``itr_counter``, ``n_itr`` and ``stop_cond`` are not defined here;
    presumably they are provided by the ``Worker`` base class.
    """

    def __init__(self, simulation_sleep):
        """
        Args:
            simulation_sleep: minimum duration of one ``step`` call in
                seconds; ``step`` sleeps for whatever time is left over.
        """
        super().__init__()
        self.simulation_sleep = simulation_sleep
        # The three components below are filled in by construct_from_feed_dict.
        self.env = None
        self.env_sampler = None
        self.dynamics_sample_processor = None
        # Batches produced by step() that have not been pushed yet.
        self.samples_data_arr = []

    def construct_from_feed_dict(
        self,
        policy_pickle,
        env_pickle,
        baseline_pickle,  # UNUSED
        dynamics_model_pickle,
        feed_dict,
    ):
        """Unpickle env and policy and build the sampler and processor.

        ``baseline_pickle`` and ``dynamics_model_pickle`` are accepted but
        not read. ``feed_dict`` supplies keyword arguments under the keys
        ``'sampler'`` and ``'sample_processor'``.
        """

        from asynch_mb.samplers.sampler import Sampler
        from asynch_mb.samplers.mb_sample_processor import ModelSampleProcessor

        loaded_env = pickle.loads(env_pickle)
        loaded_policy = pickle.loads(policy_pickle)

        self.env = loaded_env

        sampler_kwargs = feed_dict['sampler']
        self.env_sampler = Sampler(
            env=loaded_env, policy=loaded_policy, **sampler_kwargs)

        processor_kwargs = feed_dict['sample_processor']
        self.dynamics_sample_processor = ModelSampleProcessor(
            **processor_kwargs)

    def prepare_start(self):
        """Take one item from ``self.queue``, step with it, then push."""
        first_flags = self.queue.get()
        self.step(first_flags)
        self.push()

    def step(self, random_sinusoid=(False, False)):
        """Collect and process one batch, padding the step to ``simulation_sleep``.

        Args:
            random_sinusoid: indexable pair; element 0 is forwarded to the
                sampler as ``random`` and element 1 as ``sinusoid``.
        """
        started_at = time.time()

        # On iteration 1, if the policy's dynamics model has no normalization
        # yet, block on the queue and replace the model with the unpickled one.
        awaiting_model = (
            self.itr_counter == 1
            and self.env_sampler.policy.dynamics_model.normalization is None
        )
        if awaiting_model:
            if self.verbose:
                logger.log('Data starts first step...')
            received_model = pickle.loads(self.queue.get())
            self.env_sampler.policy.dynamics_model = received_model
            if self.verbose:
                logger.log('Data first step done...')

        # ------------- Obtaining samples from the environment -------------

        if self.verbose:
            logger.log("Data is obtaining samples...")
        sampled_paths = self.env_sampler.obtain_samples(
            log=True,
            random=random_sinusoid[0],
            sinusoid=random_sinusoid[1],
            log_prefix='Data-EnvSampler-',
        )

        # --------------- Processing environment samples -------------------

        if self.verbose:
            logger.log("Data is processing samples...")
        processed_batch = self.dynamics_sample_processor.process_samples(
            sampled_paths, log=True, log_prefix='Data-EnvTrajs-')

        self.samples_data_arr.append(processed_batch)
        elapsed = time.time() - started_at

        # Sleep for the remainder so a step lasts at least simulation_sleep.
        remaining = max(self.simulation_sleep - elapsed, 0)
        time.sleep(remaining)

        logger.logkv('Data-TimeStep', elapsed)
        logger.logkv('Data-TimeSleep', remaining)

    def _synch(self, dynamics_model_state_pickle):
        """Load a pickled state dict into the policy's dynamics model."""
        started_at = time.time()
        dynamics_model_state = pickle.loads(dynamics_model_state_pickle)
        assert isinstance(dynamics_model_state, dict)
        target_model = self.env_sampler.policy.dynamics_model
        target_model.set_shared_params(dynamics_model_state)
        elapsed = time.time() - started_at

        logger.logkv('Data-TimeSynch', elapsed)

    def push(self):
        """Pickle all pending batches onto ``queue_next`` and clear them."""
        started_at = time.time()
        payload = pickle.dumps(self.samples_data_arr)
        self.queue_next.put(payload)
        self.samples_data_arr = []
        elapsed = time.time() - started_at

        logger.logkv('Data-TimePush', elapsed)

    def set_stop_cond(self):
        """Set the stop condition once ``itr_counter`` reaches ``n_itr``."""
        limit_reached = self.itr_counter >= self.n_itr
        if limit_reached:
            self.stop_cond.set()