Example #1
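A curriculum trainer (PyTorch-style, note the torch.no_grad block): each iteration collects experience with a teacher feedback dictionary, optimizes the policy, optionally distills into a supervised model, evaluates rollouts, saves videos and snapshots, and advances the curriculum level once the success/accuracy checks pass.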
    def train(self):
        """
        Trains policy on env using algo

        Pseudocode:
            for itr in n_itr:
                for step in num_inner_grad_steps:
                    sampler.sample()
                    algo.compute_updated_dists()
                algo.optimize_policy()
                sampler.update_goals()
        """
        start_time = time.time()
        rollout_time = 0
        success_rate = 0
        itrs_on_level = 0

        for itr in range(self.start_itr, self.n_itr):

            # high_entropy = False
            # # At the very beginning of a new level, have high entropy
            # if itrs_on_level < 3:
            #     high_entropy = True
            # # At the very beginning, have high entropy
            # if self.curriculum_step == 0 and itrs_on_level < 20:
            #     high_entropy = True
            # # If the model never succeeds
            # if itrs_on_level > 20 and success_rate < .2:
            #     high_entropy = True
            # logger.logkv("HighE", high_entropy)
            logger.logkv("ItrsOnLEvel", itrs_on_level)
            itrs_on_level += 1

            itr_start_time = time.time()
            logger.log("\n ---------------- Iteration %d ----------------" % itr)
            logger.log("Sampling set of tasks/goals for this meta-batch...")

            """ -------------------- Sampling --------------------------"""

            logger.log("Obtaining samples...")
            time_env_sampling_start = time.time()
            samples_data, episode_logs = self.algo.collect_experiences(self.teacher_train_dict)
            assert len(samples_data.action.shape) == 1, (samples_data.action.shape)
            time_collection = time.time() - time_env_sampling_start
            time_training_start = time.time()
            # if high_entropy:
            #     entropy = self.algo.entropy_coef# * 10
            # else:
            #     entropy = self.algo.entropy_coef
            summary_logs = self.algo.optimize_policy(samples_data, teacher_dict=self.teacher_train_dict)
            time_training = time.time() - time_training_start
            self._log(episode_logs, summary_logs, tag="Train")
            logger.logkv('Curriculum Step', self.curriculum_step)
            advance_curriculum = self.check_advance_curriculum(episode_logs, summary_logs)
            logger.logkv('Train/Advance', int(advance_curriculum))
            time_env_sampling = time.time() - time_env_sampling_start
            #
            # """ ------------------ Reward Predictor Splicing ---------------------"""
            rp_start_time = time.time()
            # samples_data = self.use_reward_predictor(samples_data)  # TODO: update
            rp_splice_time = time.time() - rp_start_time

            """ ------------------ End Reward Predictor Splicing ---------------------"""

            """ ------------------ Policy Update ---------------------"""

            logger.log("Optimizing policy...")
            # # This needs to take all samples_data so that it can construct graph for meta-optimization.
            time_rp_train_start = time.time()
            # self.train_rp(samples_data)
            time_rp_train = time.time() - time_rp_train_start

            """ ------------------ Distillation ---------------------"""
            if False:  # TODO: remove False once I've checked things work with dictionaries
            # if self.supervised_model is not None and advance_curriculum:
                time_distill_start = time.time()
                for _ in range(3):  # TODO: tune this!
                    distill_log = self.distill(samples_data, is_training=True)
                for k, v in distill_log.items():
                    logger.logkv(f"Distill/{k}_Train", v)
                distill_time = time.time() - time_distill_start
                advance_curriculum = distill_log['Accuracy'] >= self.accuracy_threshold
                logger.logkv('Distill/Advance', int(advance_curriculum))
            else:
                distill_time = 0

            """ ------------------ Policy rollouts ---------------------"""
            run_policy_time = 0
            if False:  # TODO: remove False once I've checked things work with dictionaries
            # if advance_curriculum or (itr % self.eval_every == 0) or (itr == self.n_itr - 1):  # TODO: collect rollouts with and without the teacher
                train_advance_curriculum = advance_curriculum
                with torch.no_grad():
                    if self.supervised_model is not None:
                        # Distilled model
                        time_run_supervised_start = time.time()
                        self.sampler.supervised_model.reset(dones=[True] * len(samples_data.obs))
                        logger.log("Running supervised model")
                        advance_curriculum_sup = self.run_supervised(self.il_trainer.acmodel, self.no_teacher_dict, "DRollout/")
                        run_supervised_time = time.time() - time_run_supervised_start
                    else:
                        run_supervised_time = 0
                        advance_curriculum_sup = True
                    # Original Policy
                    time_run_policy_start = time.time()
                    self.algo.acmodel.reset(dones=[True] * len(samples_data.obs))
                    logger.log("Running model with teacher")
                    advance_curriculum_policy = self.run_supervised(self.algo.acmodel, self.teacher_train_dict, "Rollout/")
                    run_policy_time = time.time() - time_run_policy_start

                    advance_curriculum = advance_curriculum_policy and advance_curriculum_sup and train_advance_curriculum
                    print("ADvancing curriculum???", advance_curriculum)

                    logger.logkv('Advance', int(advance_curriculum))
            else:
                run_supervised_time = 0
                # advance_curriculum = False

            """ ------------------- Logging Stuff --------------------------"""
            logger.logkv('Itr', itr)
            logger.logkv('n_timesteps', self.sampler.total_timesteps_sampled)

            logger.logkv('Time/Total', time.time() - start_time)
            logger.logkv('Time/Itr', time.time() - itr_start_time)

            logger.logkv('Curriculum Percent', self.curriculum_step / len(self.env.levels_list))

            process = psutil.Process(os.getpid())
            memory_use = process.memory_info().rss / float(2 ** 20)
            logger.logkv('Memory MiB', memory_use)

            logger.log(self.exp_name)

            logger.logkv('Time/Sampling', time_env_sampling)
            logger.logkv('Time/Training', time_training)
            logger.logkv('Time/Collection', time_collection)
            logger.logkv('Time/RPUse', rp_splice_time)
            logger.logkv('Time/RPTrain', time_rp_train)
            logger.logkv('Time/RunwTeacher', run_policy_time)
            logger.logkv('Time/Distillation', distill_time)
            logger.logkv('Time/RunDistilled', run_supervised_time)
            logger.logkv('Time/VidRollout', rollout_time)
            logger.dumpkvs()

            """ ------------------ Video Saving ---------------------"""

            should_save_video = (itr % self.save_videos_every == 0) or (itr == self.n_itr - 1) or advance_curriculum
            if should_save_video:
                # TODO: consider generating videos with every element in the powerset of feedback types
                time_rollout_start = time.time()
                if self.supervised_model is not None:
                    self.il_trainer.acmodel.reset(dones=[True])
                    self.save_videos(itr, self.il_trainer.acmodel, save_name='distilled_video_stoch',
                                     num_rollouts=5,
                                     teacher_dict=self.teacher_train_dict,
                                     save_video=should_save_video, log_prefix="DVidRollout/Stoch", stochastic=True)
                    self.il_trainer.acmodel.reset(dones=[True])
                    self.save_videos(itr, self.il_trainer.acmodel, save_name='distilled_video_det',
                                     num_rollouts=5,
                                     teacher_dict=self.no_teacher_dict,
                                     save_video=should_save_video, log_prefix="DVidRollout/Det", stochastic=False)

                self.algo.acmodel.reset(dones=[True])
                self.save_videos(self.curriculum_step, self.algo.acmodel, save_name='withTeacher_video_stoch',
                                 num_rollouts=5,
                                 teacher_dict=self.teacher_train_dict,
                                 save_video=should_save_video, log_prefix="VidRollout/Stoch", stochastic=True)
                self.algo.acmodel.reset(dones=[True])
                self.save_videos(self.curriculum_step, self.algo.acmodel, save_name='withTeacher_video_det',
                                 num_rollouts=5,
                                 teacher_dict=self.no_teacher_dict,
                                 save_video=should_save_video, log_prefix="VidRollout/Det", stochastic=False)
                rollout_time = time.time() - time_rollout_start
            else:
                rollout_time = 0

            params = self.get_itr_snapshot(itr)
            step = self.curriculum_step

            if self.log_and_save:
                if (itr % self.save_every == 0) or (itr == self.n_itr - 1) or advance_curriculum:
                    logger.log("Saving snapshot...")
                    logger.save_itr_params(itr, step, params)
                    logger.log("Saved")

            if advance_curriculum and self.advance_levels:
                self.curriculum_step += 1
                self.sampler.advance_curriculum()
                self.algo.advance_curriculum()
                # self.algo.set_optimizer()
                itrs_on_level = 0


        logger.log("Training finished")
    def train(self):
        """
        Trains policy on env using algo

        Pseudocode:
            for itr in n_itr:
                for step in num_inner_grad_steps:
                    sampler.sample()
                    algo.compute_updated_dists()
                algo.optimize_policy()
                sampler.update_goals()
        """
        with self.sess.as_default() as sess:

            # initialize uninitialized vars  (only initialize vars that were not loaded)
            # uninit_vars = [var for var in tf.global_variables() if not sess.run(tf.is_variable_initialized(var))]
            # sess.run(tf.variables_initializer(uninit_vars))
            sess.run(tf.global_variables_initializer())

            start_time = time.time()
            for itr in range(self.start_itr, self.n_itr):
                itr_start_time = time.time()
                logger.log(
                    "\n ---------------- Iteration %d ----------------" % itr)

                time_env_sampling_start = time.time()

                if itr == 0:
                    logger.log(
                        "Obtaining random samples from the environment...")
                    self.env_sampler.total_samples *= self.num_rollouts_per_iter
                    env_paths = self.env_sampler.obtain_samples(
                        log=True,
                        random=self.initial_random_samples,
                        log_prefix='Data-EnvSampler-',
                        verbose=True)
                    self.env_sampler.total_samples /= self.num_rollouts_per_iter

                time_env_samp_proc = time.time()
                samples_data = self.dynamics_sample_processor.process_samples(
                    env_paths, log=True, log_prefix='Data-EnvTrajs-')
                self.env.log_diagnostics(env_paths, prefix='Data-EnvTrajs-')
                logger.record_tabular('Data-TimeEnvSampleProc',
                                      time.time() - time_env_samp_proc)

                buffer = samples_data if self.sample_from_buffer else None
                ''' --------------- fit dynamics model --------------- '''
                logger.log("Training dynamics model for %i epochs ..." %
                           self.dynamics_model_max_epochs)
                time_fit_start = time.time()
                self.dynamics_model.fit(samples_data['observations'],
                                        samples_data['actions'],
                                        samples_data['next_observations'],
                                        epochs=self.dynamics_model_max_epochs,
                                        verbose=False,
                                        log_tabular=True,
                                        prefix='Model-')

                logger.record_tabular('Model-TimeModelFit',
                                      time.time() - time_fit_start)

                env_paths = []
                for id_rollout in range(self.num_rollouts_per_iter):
                    times_dyn_sampling = []
                    times_dyn_sample_processing = []
                    times_optimization = []
                    times_step = []

                    grad_steps_per_rollout = self.grad_steps_per_rollout
                    for step in range(grad_steps_per_rollout):

                        # logger.log("\n ---------------- Grad-Step %d ----------------" % int(grad_steps_per_rollout*itr*self.num_rollouts_per_iter,
                        #                                                             + id_rollout * grad_steps_per_rollout + step))
                        step_start_time = time.time()
                        """ -------------------- Sampling --------------------------"""

                        logger.log("Obtaining samples from the model...")
                        time_env_sampling_start = time.time()
                        paths = self.model_sampler.obtain_samples(
                            log=True, log_prefix='Policy-', buffer=buffer)
                        sampling_time = time.time() - time_env_sampling_start
                        """ ----------------- Processing Samples ---------------------"""

                        logger.log("Processing samples from the model...")
                        time_proc_samples_start = time.time()
                        samples_data = self.model_sample_processor.process_samples(
                            paths, log='all', log_prefix='Policy-')
                        proc_samples_time = time.time(
                        ) - time_proc_samples_start

                        if type(paths) is list:
                            self.log_diagnostics(paths, prefix='Policy-')
                        else:
                            self.log_diagnostics(sum(paths.values(), []),
                                                 prefix='Policy-')
                        """ ------------------ Policy Update ---------------------"""

                        logger.log("Optimizing policy...")
                        time_optimization_step_start = time.time()
                        self.algo.optimize_policy(samples_data)
                        optimization_time = time.time(
                        ) - time_optimization_step_start

                        times_dyn_sampling.append(sampling_time)
                        times_dyn_sample_processing.append(proc_samples_time)
                        times_optimization.append(optimization_time)
                        times_step.append(time.time() - step_start_time)

                    logger.log(
                        "Obtaining random samples from the environment...")
                    env_paths.extend(
                        self.env_sampler.obtain_samples(
                            log=True,
                            log_prefix='Data-EnvSampler-',
                            verbose=True))

                logger.record_tabular('Data-TimeEnvSampling',
                                      time.time() - time_env_sampling_start)
                logger.log("Processing environment samples...")
                """ ------------------- Logging Stuff --------------------------"""
                logger.logkv('Iteration', itr)
                logger.logkv('n_timesteps',
                             self.env_sampler.total_timesteps_sampled)
                logger.logkv('Policy-TimeSampleProc',
                             np.sum(times_dyn_sample_processing))
                logger.logkv('Policy-TimeSampling', np.sum(times_dyn_sampling))
                logger.logkv('Policy-TimeAlgoOpt', np.sum(times_optimization))
                logger.logkv('Policy-TimeStep', np.sum(times_step))

                logger.logkv('Time', time.time() - start_time)
                logger.logkv('ItrTime', time.time() - itr_start_time)

                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr)
                logger.save_itr_params(itr, params)
                logger.log("Saved")

                logger.dumpkvs()
                if itr == 0:
                    sess.graph.finalize()

            logger.logkv('Trainer-TimeTotal', time.time() - start_time)

        logger.log("Training finished")
        self.sess.close()
Example #3
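A TensorFlow trainer that, each iteration, fits a dynamics model and a value function on freshly collected environment samples and then runs a single policy optimization step.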
    def train(self):
        """
        Trains policy on env using algo

        Pseudocode:
            for itr in n_itr:
                for step in num_inner_grad_steps:
                    sampler.sample()
                    algo.compute_updated_dists()
                algo.optimize_policy()
                sampler.update_goals()
        """
        with self.sess.as_default() as sess:

            # initialize uninitialized vars  (only initialize vars that were not loaded)
            uninit_vars = [
                var for var in tf.global_variables()
                if not sess.run(tf.is_variable_initialized(var))
            ]
            sess.run(tf.variables_initializer(uninit_vars))

            start_time = time.time()
            for itr in range(self.start_itr, self.n_itr):
                itr_start_time = time.time()
                logger.log(
                    "\n ---------------- Iteration %d ----------------" % itr)

                time_env_sampling_start = time.time()

                logger.log(
                    "Obtaining samples from the environment using the policy..."
                )
                env_paths = self.sampler.obtain_samples(log=True,
                                                        log_prefix='')

                logger.record_tabular('Time-EnvSampling',
                                      time.time() - time_env_sampling_start)
                logger.log("Processing environment samples...")

                # first processing just for logging purposes
                time_env_samp_proc = time.time()
                samples_data = self.sample_processor.process_samples(
                    env_paths, log=True, log_prefix='EnvTrajs-')

                logger.record_tabular('Time-EnvSampleProc',
                                      time.time() - time_env_samp_proc)
                ''' --------------- fit dynamics model --------------- '''

                time_fit_start = time.time()

                logger.log("Training dynamics model for %i epochs ..." %
                           self.dynamics_model_max_epochs)
                self.dynamics_model.fit(samples_data['observations'],
                                        samples_data['actions'],
                                        samples_data['next_observations'],
                                        epochs=self.dynamics_model_max_epochs,
                                        verbose=False,
                                        log_tabular=True,
                                        early_stopping=True,
                                        compute_normalization=False)

                logger.log("Training the value function for %i epochs ..." %
                           self.vfun_max_epochs)
                self.value_function.fit(samples_data['observations'],
                                        samples_data['returns'],
                                        epochs=self.vfun_max_epochs,
                                        verbose=False,
                                        log_tabular=True,
                                        compute_normalization=False)

                logger.log("Training the policy ...")
                self.algo.optimize_policy(samples_data)

                logger.record_tabular('Time-ModelFit',
                                      time.time() - time_fit_start)
                """ ------------------- Logging Stuff --------------------------"""
                logger.logkv('Itr', itr)
                logger.logkv('n_timesteps',
                             self.sampler.total_timesteps_sampled)

                logger.logkv('Time', time.time() - start_time)
                logger.logkv('ItrTime', time.time() - itr_start_time)

                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr)
                self.log_diagnostics(env_paths, '')
                logger.save_itr_params(itr, params)
                logger.log("Saved")

                logger.dumpkvs()
                if itr == 0:
                    sess.graph.finalize()

        logger.log("Training finished")
        self.sess.close()
Example #4
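A TensorFlow model-based trainer that keeps a buffer inside the dynamics model and interleaves one-epoch model fits with batches of policy-gradient steps on model-generated samples.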
    def train(self):
        """
        Trains policy on env using algo

        Pseudocode:
            for itr in n_itr:
                for step in num_inner_grad_steps:
                    sampler.sample()
                    algo.compute_updated_dists()
                algo.optimize_policy()
                sampler.update_goals()
        """
        with self.sess.as_default() as sess:

            # initialize uninitialized vars  (only initialize vars that were not loaded)
            # uninit_vars = [var for var in tf.global_variables() if not sess.run(tf.is_variable_initialized(var))]
            # sess.run(tf.variables_initializer(uninit_vars))
            sess.run(tf.global_variables_initializer())

            start_time = time.time()
            for itr in range(self.start_itr, self.n_itr):
                itr_start_time = time.time()
                logger.log(
                    "\n ---------------- Iteration %d ----------------" % itr)

                time_env_sampling_start = time.time()

                if self.initial_random_samples and itr == 0:
                    logger.log(
                        "Obtaining random samples from the environment...")
                    env_paths = self.env_sampler.obtain_samples(
                        log=True, random=True, log_prefix='Data-EnvSampler-')

                else:
                    logger.log(
                        "Obtaining samples from the environment using the policy..."
                    )
                    env_paths = self.env_sampler.obtain_samples(
                        log=True, log_prefix='Data-EnvSampler-')

                # Add sleeping time to match parallel experiment
                # time.sleep(10)

                logger.record_tabular('Data-TimeEnvSampling',
                                      time.time() - time_env_sampling_start)
                logger.log("Processing environment samples...")

                # first processing just for logging purposes
                time_env_samp_proc = time.time()

                samples_data = self.dynamics_sample_processor.process_samples(
                    env_paths, log=True, log_prefix='Data-EnvTrajs-')

                self.env.log_diagnostics(env_paths, prefix='Data-EnvTrajs-')

                logger.record_tabular('Data-TimeEnvSampleProc',
                                      time.time() - time_env_samp_proc)
                ''' --------------- fit dynamics model --------------- '''

                time_fit_start = time.time()

                self.dynamics_model.update_buffer(
                    samples_data['observations'],
                    samples_data['actions'],
                    samples_data['next_observations'],
                    check_init=True)

                buffer = None if not self.sample_from_buffer else samples_data

                logger.record_tabular('Model-TimeModelFit',
                                      time.time() - time_fit_start)
                ''' --------------- MAML steps --------------- '''
                times_dyn_sampling = []
                times_dyn_sample_processing = []
                times_optimization = []
                times_step = []
                remaining_model_idx = list(
                    range(self.dynamics_model.num_models))
                valid_loss_rolling_average_prev = None

                with_new_data = True
                for id_step in range(self.repeat_steps):

                    for epoch in range(self.num_epochs_per_step):
                        logger.log(
                            "Training dynamics model for %i epochs ..." % 1)
                        remaining_model_idx, valid_loss_rolling_average = self.dynamics_model.fit_one_epoch(
                            remaining_model_idx,
                            valid_loss_rolling_average_prev,
                            with_new_data,
                            log_tabular=True,
                            prefix='Model-')
                        with_new_data = False

                    for step in range(self.num_grad_policy_per_step):

                        logger.log(
                            "\n ---------------- Grad-Step %d ----------------"
                            % int(itr * self.repeat_steps *
                                  self.num_grad_policy_per_step + id_step *
                                  self.num_grad_policy_per_step + step))
                        step_start_time = time.time()
                        """ -------------------- Sampling --------------------------"""

                        logger.log("Obtaining samples from the model...")
                        time_env_sampling_start = time.time()
                        paths = self.model_sampler.obtain_samples(
                            log=True, log_prefix='Policy-', buffer=buffer)
                        sampling_time = time.time() - time_env_sampling_start
                        """ ----------------- Processing Samples ---------------------"""

                        logger.log("Processing samples from the model...")
                        time_proc_samples_start = time.time()
                        samples_data = self.model_sample_processor.process_samples(
                            paths, log='all', log_prefix='Policy-')
                        proc_samples_time = time.time(
                        ) - time_proc_samples_start

                        if type(paths) is list:
                            self.log_diagnostics(paths, prefix='Policy-')
                        else:
                            self.log_diagnostics(sum(paths.values(), []),
                                                 prefix='Policy-')
                        """ ------------------ Policy Update ---------------------"""

                        logger.log("Optimizing policy...")
                        # This needs to take all samples_data so that it can construct graph for meta-optimization.
                        time_optimization_step_start = time.time()
                        self.algo.optimize_policy(samples_data)
                        optimization_time = time.time(
                        ) - time_optimization_step_start

                        times_dyn_sampling.append(sampling_time)
                        times_dyn_sample_processing.append(proc_samples_time)
                        times_optimization.append(optimization_time)
                        times_step.append(time.time() - step_start_time)
                """ ------------------- Logging Stuff --------------------------"""
                logger.logkv('Iteration', itr)
                logger.logkv('n_timesteps',
                             self.env_sampler.total_timesteps_sampled)
                logger.logkv('Policy-TimeSampleProc',
                             np.sum(times_dyn_sample_processing))
                logger.logkv('Policy-TimeSampling', np.sum(times_dyn_sampling))
                logger.logkv('Policy-TimeAlgoOpt', np.sum(times_optimization))
                logger.logkv('Policy-TimeStep', np.sum(times_step))

                logger.logkv('Time', time.time() - start_time)
                logger.logkv('ItrTime', time.time() - itr_start_time)

                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr)
                logger.save_itr_params(itr, params)
                logger.log("Saved")

                logger.dumpkvs()
                if itr == 0:
                    sess.graph.finalize()

            logger.logkv('Trainer-TimeTotal', time.time() - start_time)

        logger.log("Training finished")
        self.sess.close()
Example #5
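A model-based meta-learning trainer (MAML-style inner/outer updates): it fits a dynamics model on environment data, then runs inner adaptation steps and an outer meta-update on model samples, optionally logging pre- and post-adaptation performance on the real environment.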
    def train(self):
        """
        Trains policy on env using algo

        Pseudocode:
            for itr in n_itr:
                for step in num_inner_grad_steps:
                    sampler.sample()
                    algo.compute_updated_dists()
                algo.optimize_policy()
                sampler.update_goals()
        """
        with self.sess.as_default() as sess:

            # initialize uninitialized vars  (only initialize vars that were not loaded)
            # uninit_vars = [var for var in tf.global_variables() if not sess.run(tf.is_variable_initialized(var))]
            # sess.run(tf.variables_initializer(uninit_vars))
            sess.run(tf.global_variables_initializer())

            if type(self.meta_steps_per_iter) is tuple:
                meta_steps_per_iter = np.linspace(self.meta_steps_per_iter[0],
                                                  self.meta_steps_per_iter[1],
                                                  self.n_itr).astype(int)
            else:
                meta_steps_per_iter = [self.meta_steps_per_iter] * self.n_itr
            start_time = time.time()
            for itr in range(self.start_itr, self.n_itr):
                itr_start_time = time.time()
                logger.log(
                    "\n ---------------- Iteration %d ----------------" % itr)

                time_env_sampling_start = time.time()

                if self.initial_random_samples and itr == 0:
                    logger.log(
                        "Obtaining random samples from the environment...")
                    env_paths = self.env_sampler.obtain_samples(
                        log=True, random=True, log_prefix='EnvSampler-')

                else:
                    logger.log(
                        "Obtaining samples from the environment using the policy..."
                    )
                    env_paths = self.env_sampler.obtain_samples(
                        log=True, log_prefix='EnvSampler-')

                logger.record_tabular('Time-EnvSampling',
                                      time.time() - time_env_sampling_start)
                logger.log("Processing environment samples...")

                # first processing just for logging purposes
                time_env_samp_proc = time.time()
                if type(env_paths) is dict or type(
                        env_paths) is collections.OrderedDict:
                    env_paths = list(env_paths.values())
                    idxs = np.random.choice(range(len(env_paths)),
                                            size=self.num_rollouts_per_iter,
                                            replace=False)
                    env_paths = sum([env_paths[idx] for idx in idxs], [])

                elif type(env_paths) is list:
                    idxs = np.random.choice(range(len(env_paths)),
                                            size=self.num_rollouts_per_iter,
                                            replace=False)
                    env_paths = [env_paths[idx] for idx in idxs]

                else:
                    raise TypeError
                samples_data = self.dynamics_sample_processor.process_samples(
                    env_paths, log=True, log_prefix='EnvTrajs-')

                self.env.log_diagnostics(env_paths, prefix='EnvTrajs-')

                logger.record_tabular('Time-EnvSampleProc',
                                      time.time() - time_env_samp_proc)

                logger.record_tabular('Time-Data',
                                      time.time() - time_env_sampling_start)
                ''' --------------- fit dynamics model --------------- '''

                time_fit_start = time.time()

                logger.log("Training dynamics model for %i epochs ..." %
                           (self.dynamics_model_max_epochs))
                self.dynamics_model.fit(samples_data['observations'],
                                        samples_data['actions'],
                                        samples_data['next_observations'],
                                        epochs=self.dynamics_model_max_epochs,
                                        verbose=True,
                                        log_tabular=True)

                buffer = None if not self.sample_from_buffer else samples_data

                logger.record_tabular('Time-ModelFit',
                                      time.time() - time_fit_start)
                ''' ------------ log real performance --------------- '''

                if self.log_real_performance:
                    logger.log("Evaluating the performance of the real policy")
                    self.policy.switch_to_pre_update()
                    env_paths = self.env_sampler.obtain_samples(
                        log=True, log_prefix='PrePolicy-')
                    samples_data = self.model_sample_processor.process_samples(
                        env_paths, log='all', log_prefix='PrePolicy-')
                    self.algo._adapt(samples_data)
                    env_paths = self.env_sampler.obtain_samples(
                        log=True, log_prefix='PostPolicy-')
                    self.model_sample_processor.process_samples(
                        env_paths, log='all', log_prefix='PostPolicy-')
                ''' --------------- MAML steps --------------- '''

                times_dyn_sampling = []
                times_dyn_sample_processing = []
                times_meta_sampling = []
                times_inner_step = []
                times_total_inner_step = []
                times_outer_step = []
                times_maml_steps = []

                for meta_itr in range(meta_steps_per_iter[itr]):

                    logger.log(
                        "\n ---------------- Meta-Step %d ----------------" %
                        int(sum(meta_steps_per_iter[:itr]) + meta_itr))
                    self.policy.switch_to_pre_update(
                    )  # Switch to pre-update policy

                    all_samples_data, all_paths = [], []
                    list_sampling_time, list_inner_step_time, list_outer_step_time, list_proc_samples_time = [], [], [], []
                    time_maml_steps_start = time.time()
                    start_total_inner_time = time.time()
                    for step in range(self.num_inner_grad_steps + 1):
                        logger.log("\n ** Adaptation-Step %d **" % step)
                        """ -------------------- Sampling --------------------------"""

                        logger.log("Obtaining samples...")
                        time_env_sampling_start = time.time()
                        paths = self.model_sampler.obtain_samples(
                            log=True,
                            log_prefix='Step_%d-' % step,
                            buffer=buffer)
                        list_sampling_time.append(time.time() -
                                                  time_env_sampling_start)
                        all_paths.append(paths)
                        """ ----------------- Processing Samples ---------------------"""

                        logger.log("Processing samples...")
                        time_proc_samples_start = time.time()
                        samples_data = self.model_sample_processor.process_samples(
                            paths, log='all', log_prefix='Step_%d-' % step)
                        all_samples_data.append(samples_data)
                        list_proc_samples_time.append(time.time() -
                                                      time_proc_samples_start)

                        self.log_diagnostics(sum(list(paths.values()), []),
                                             prefix='Step_%d-' % step)
                        """ ------------------- Inner Policy Update --------------------"""

                        time_inner_step_start = time.time()
                        if step < self.num_inner_grad_steps:
                            logger.log("Computing inner policy updates...")
                            self.algo._adapt(samples_data)

                        list_inner_step_time.append(time.time() -
                                                    time_inner_step_start)
                    total_inner_time = time.time() - start_total_inner_time

                    time_maml_opt_start = time.time()
                    """ ------------------ Outer Policy Update ---------------------"""

                    logger.log("Optimizing policy...")
                    # This needs to take all samples_data so that it can construct graph for meta-optimization.
                    time_outer_step_start = time.time()
                    self.algo.optimize_policy(all_samples_data)

                    times_inner_step.append(list_inner_step_time)
                    times_total_inner_step.append(total_inner_time)
                    times_outer_step.append(time.time() -
                                            time_outer_step_start)
                    times_meta_sampling.append(np.sum(list_sampling_time))
                    times_dyn_sampling.append(list_sampling_time)
                    times_dyn_sample_processing.append(list_proc_samples_time)
                    times_maml_steps.append(time.time() -
                                            time_maml_steps_start)
                """ ------------------- Logging Stuff --------------------------"""
                logger.logkv('Itr', itr)
                if self.log_real_performance:
                    logger.logkv(
                        'n_timesteps',
                        self.env_sampler.total_timesteps_sampled /
                        (3 * self.policy.meta_batch_size) *
                        self.num_rollouts_per_iter)
                else:
                    logger.logkv(
                        'n_timesteps',
                        self.env_sampler.total_timesteps_sampled /
                        self.policy.meta_batch_size *
                        self.num_rollouts_per_iter)
                logger.logkv('AvgTime-OuterStep', np.mean(times_outer_step))
                logger.logkv('AvgTime-InnerStep', np.mean(times_inner_step))
                logger.logkv('AvgTime-TotalInner',
                             np.mean(times_total_inner_step))
                logger.logkv('AvgTime-SampleProc',
                             np.mean(times_dyn_sample_processing))
                logger.logkv('AvgTime-Sampling', np.mean(times_dyn_sampling))
                logger.logkv('AvgTime-MAMLSteps', np.mean(times_maml_steps))

                logger.logkv('Time', time.time() - start_time)
                logger.logkv('ItrTime', time.time() - itr_start_time)

                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr)
                logger.save_itr_params(itr, params)
                logger.log("Saved")

                logger.dumpkvs()
                if itr == 0:
                    sess.graph.finalize()

        logger.log("Training finished")
        self.sess.close()
Example #6
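A straightforward meta-batch trainer: sample tasks, collect and process rollouts, and run a single policy optimization per iteration.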
    def train(self):
        """
        Trains policy on env using algo

        Pseudocode:
            for itr in n_itr:
                for step in num_inner_grad_steps:
                    sampler.sample()
                    algo.compute_updated_dists()
                algo.optimize_policy()
                sampler.update_goals()
        """
        with self.sess.as_default() as sess:

            # initialize uninitialized vars  (only initialize vars that were not loaded)
            uninit_vars = [
                var for var in tf.global_variables()
                if not sess.run(tf.is_variable_initialized(var))
            ]
            sess.run(tf.variables_initializer(uninit_vars))

            start_time = time.time()
            for itr in range(self.start_itr, self.n_itr):
                self.sampler.update_tasks()
                itr_start_time = time.time()
                logger.log(
                    "\n ---------------- Iteration %d ----------------" % itr)
                logger.log(
                    "Sampling set of tasks/goals for this meta-batch...")
                """ -------------------- Sampling --------------------------"""

                logger.log("Obtaining samples...")
                time_env_sampling_start = time.time()
                paths = self.sampler.obtain_samples(log=True,
                                                    log_prefix='train-')
                sampling_time = time.time() - time_env_sampling_start
                """ ----------------- Processing Samples ---------------------"""

                logger.log("Processing samples...")
                time_proc_samples_start = time.time()
                samples_data = self.sample_processor.process_samples(
                    paths, log='all', log_prefix='train-')
                proc_samples_time = time.time() - time_proc_samples_start

                if type(paths) is list:
                    self.log_diagnostics(paths, prefix='train-')
                else:
                    self.log_diagnostics(sum(paths.values(), []),
                                         prefix='train-')
                """ ------------------ Policy Update ---------------------"""

                logger.log("Optimizing policy...")
                # This needs to take all samples_data so that it can construct graph for meta-optimization.
                time_optimization_step_start = time.time()
                self.algo.optimize_policy(samples_data)
                """ ------------------- Logging Stuff --------------------------"""
                logger.logkv('Itr', itr)
                logger.logkv('n_timesteps',
                             self.sampler.total_timesteps_sampled)

                logger.logkv('Time-Optimization',
                             time.time() - time_optimization_step_start)
                logger.logkv('Time-SampleProc', np.sum(proc_samples_time))
                logger.logkv('Time-Sampling', sampling_time)

                logger.logkv('Time', time.time() - start_time)
                logger.logkv('ItrTime', time.time() - itr_start_time)

                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr)
                logger.save_itr_params(itr, params)
                logger.log("Saved")

                logger.dumpkvs()
                if itr == 0:
                    sess.graph.finalize()

        logger.log("Training finished")
        self.sess.close()
Example #7
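A MAML-style trainer: per iteration it runs num_inner_grad_steps adaptation steps on freshly sampled data, then one outer meta-update over all collected samples.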
    def train(self):
        """
        Trains policy on env using algo

        Pseudocode:
            for itr in n_itr:
                for step in num_inner_grad_steps:
                    sampler.sample()
                    algo.compute_updated_dists()
                algo.optimize_policy()
                sampler.update_goals()
        """
        with self.sess.as_default() as sess:

            # initialize uninitialized vars  (only initialize vars that were not loaded)
            uninit_vars = [
                var for var in tf.global_variables()
                if not sess.run(tf.is_variable_initialized(var))
            ]
            sess.run(tf.variables_initializer(uninit_vars))

            start_time = time.time()
            for itr in range(self.start_itr, self.n_itr):
                itr_start_time = time.time()
                logger.log(
                    "\n ---------------- Iteration %d ----------------" % itr)
                logger.log(
                    "Sampling set of tasks/goals for this meta-batch...")

                self.sampler.update_tasks()
                self.policy.switch_to_pre_update(
                )  # Switch to pre-update policy

                all_samples_data, all_paths = [], []
                list_sampling_time, list_inner_step_time, list_outer_step_time, list_proc_samples_time = [], [], [], []
                start_total_inner_time = time.time()
                for step in range(self.num_inner_grad_steps + 1):
                    logger.log('** Step ' + str(step) + ' **')
                    """ -------------------- Sampling --------------------------"""

                    logger.log("Obtaining samples...")
                    time_env_sampling_start = time.time()
                    paths = self.sampler.obtain_samples(log=True,
                                                        log_prefix='Step_%d-' %
                                                        step)
                    list_sampling_time.append(time.time() -
                                              time_env_sampling_start)
                    all_paths.append(paths)
                    """ ----------------- Processing Samples ---------------------"""

                    logger.log("Processing samples...")
                    time_proc_samples_start = time.time()
                    samples_data = self.sample_processor.process_samples(
                        paths, log='all', log_prefix='Step_%d-' % step)
                    all_samples_data.append(samples_data)
                    list_proc_samples_time.append(time.time() -
                                                  time_proc_samples_start)

                    self.log_diagnostics(sum(list(paths.values()), []),
                                         prefix='Step_%d-' % step)
                    """ ------------------- Inner Policy Update --------------------"""

                    time_inner_step_start = time.time()
                    if step < self.num_inner_grad_steps:
                        logger.log("Computing inner policy updates...")
                        self.algo._adapt(samples_data)
                    # train_writer = tf.summary.FileWriter('/home/ignasi/Desktop/maml_zoo_graph',
                    #                                      sess.graph)
                    list_inner_step_time.append(time.time() -
                                                time_inner_step_start)
                total_inner_time = time.time() - start_total_inner_time

                time_maml_opt_start = time.time()
                """ ------------------ Outer Policy Update ---------------------"""

                logger.log("Optimizing policy...")
                # This needs to take all samples_data so that it can construct graph for meta-optimization.
                time_outer_step_start = time.time()
                self.algo.optimize_policy(all_samples_data)
                """ ------------------- Logging Stuff --------------------------"""
                logger.logkv('Itr', itr)
                logger.logkv('n_timesteps',
                             self.sampler.total_timesteps_sampled)

                logger.logkv('Time-OuterStep',
                             time.time() - time_outer_step_start)
                logger.logkv('Time-TotalInner', total_inner_time)
                logger.logkv('Time-InnerStep', np.sum(list_inner_step_time))
                logger.logkv('Time-SampleProc', np.sum(list_proc_samples_time))
                logger.logkv('Time-Sampling', np.sum(list_sampling_time))

                logger.logkv('Time', time.time() - start_time)
                logger.logkv('ItrTime', time.time() - itr_start_time)
                logger.logkv('Time-MAMLSteps',
                             time.time() - time_maml_opt_start)

                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr)
                logger.save_itr_params(itr, params)
                logger.log("Saved")

                logger.dumpkvs()
                if itr == 0:
                    sess.graph.finalize()

        logger.log("Training finished")
        self.sess.close()
Example #8
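A random-search trainer (ARS-style): it optionally fits a dynamics model, perturbs the policy with sampled deltas, evaluates the perturbations through the model sampler, and updates the policy from the resulting returns.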
    def train(self):
        """
        Trains policy on env using algo

        Pseudocode:
            for itr in n_itr:
                for step in num_inner_grad_steps:
                    sampler.sample()
                    algo.compute_updated_dists()
                algo.optimize_policy()
                sampler.update_goals()
        """
        with self.sess.as_default() as sess:

            # initialize uninitialized vars  (only initialize vars that were not loaded)
            uninit_vars = [
                var for var in tf.global_variables()
                if not sess.run(tf.is_variable_initialized(var))
            ]
            sess.run(tf.variables_initializer(uninit_vars))

            if type(self.steps_per_iter) is tuple:
                steps_per_iter = np.linspace(self.steps_per_iter[0],
                                             self.steps_per_iter[1],
                                             self.n_itr).astype(int)
            else:
                steps_per_iter = [self.steps_per_iter] * self.n_itr

            start_time = time.time()
            for itr in range(self.start_itr, self.n_itr):
                itr_start_time = time.time()
                logger.log(
                    "\n ---------------- Iteration %d ----------------" % itr)

                time_env_sampling_start = time.time()

                if self.initial_random_samples and itr == 0:
                    logger.log(
                        "Obtaining random samples from the environment...")
                    env_paths = self.env_sampler.obtain_samples(
                        log=True, random=True, log_prefix='EnvSampler-')

                else:
                    logger.log(
                        "Obtaining samples from the environment using the policy..."
                    )
                    env_paths = self.env_sampler.obtain_samples(
                        log=True, log_prefix='EnvSampler-')
                    self.policy.obs_filter.stats_increment()

                logger.record_tabular('Time-EnvSampling',
                                      time.time() - time_env_sampling_start)
                logger.log("Processing environment samples...")

                # first processing just for logging purposes
                time_env_samp_proc = time.time()
                samples_data = self.dynamics_sample_processor.process_samples(
                    env_paths, log=True, log_prefix='EnvTrajs-')

                self.env.log_diagnostics(env_paths, prefix='EnvTrajs-')

                logger.record_tabular('Time-EnvSampleProc',
                                      time.time() - time_env_samp_proc)
                ''' --------------- fit dynamics model --------------- '''

                time_fit_start = time.time()

                logger.log("Training dynamics model for %i epochs ..." %
                           self.dynamics_model_max_epochs)
                if self.dynamics_model is not None:
                    self.dynamics_model.fit(
                        samples_data['observations'],
                        samples_data['actions'],
                        samples_data['next_observations'],
                        epochs=self.dynamics_model_max_epochs,
                        verbose=False,
                        log_tabular=True,
                        compute_normalization=True)

                buffer = None if not self.sample_from_buffer else samples_data

                logger.record_tabular('Time-ModelFit',
                                      time.time() - time_fit_start)

                # returns = np.mean(samples_data['returns'])
                # if returns < self._last_returns:
                #     self.policy.set_params(self._prev_policy)
                #     self._last_returns = returns
                # self._prev_policy = self.policy.get_params()
                ''' ------------ log real performance --------------- '''

                # if self.log_real_performance:
                #     logger.log("Evaluating the performance of the real policy")
                #     env_paths = self.env_sampler.obtain_samples(log=True, log_prefix='RealPolicy-')
                #     _ = self.model_sample_processor.process_samples(env_paths, log='all', log_prefix='PrePolicy-')
                ''' --------------- RS steps --------------- '''

                times_dyn_sampling = []
                times_dyn_sample_processing = []
                times_itr = []
                times_rs_steps = []
                list_sampling_time = []
                list_proc_samples_time = []
                for rs_itr in range(steps_per_iter[itr]):
                    time_itr_start = time.time()
                    logger.log("\n -------------- RS-Step %d --------------" %
                               int(sum(steps_per_iter[:itr]) + rs_itr))
                    deltas = self.policy.get_deltas(self.num_deltas)
                    self.policy.set_deltas(deltas,
                                           delta_std=self.delta_std,
                                           symmetrical=True)
                    """ -------------------- Sampling --------------------------"""
                    logger.log("Obtaining samples...")
                    time_env_sampling_start = time.time()
                    samples_data = self.model_sampler.obtain_samples(
                        log=True, log_prefix='Models-', buffer=buffer)
                    list_sampling_time.append(time.time() -
                                              time_env_sampling_start)
                    """ ---------------------- Processing --------------------- """
                    # TODO: Add preprocessing of the state to see what sort of update rule between the models we want
                    samples_data = self.ars_sample_processor.process_samples(
                        samples_data, log=True, log_prefix='step%d-' % rs_itr)

                    if self.dynamics_model is None:
                        self.policy.stats_increment()
                    """ ------------------ Outer Policy Update ---------------------"""
                    logger.log("Optimizing policy...")
                    # This needs to take all samples_data so that it can construct graph for meta-optimization.
                    time_rs_start = time.time()
                    self.algo.optimize_policy(samples_data['returns'], deltas)

                    times_dyn_sampling.append(list_sampling_time)
                    times_dyn_sample_processing.append(list_proc_samples_time)
                    times_rs_steps.append(time.time() - time_rs_start)
                    times_itr.append(time.time() - time_itr_start)
                """ ------------------- Logging Stuff --------------------------"""
                logger.logkv('Itr', itr)
                if self.dynamics_model is None:
                    logger.logkv('n_timesteps',
                                 self.model_sampler.total_timesteps_sampled)
                else:
                    logger.logkv('n_timesteps',
                                 self.env_sampler.total_timesteps_sampled)

                logger.logkv('AvgTime-RS', np.mean(times_rs_steps))
                logger.logkv('AvgTime-SampleProc',
                             np.mean(times_dyn_sample_processing))
                logger.logkv('AvgTime-Sampling', np.mean(times_dyn_sampling))
                logger.logkv('AvgTime-ModelItr', np.mean(times_itr))

                logger.logkv('Time', time.time() - start_time)
                logger.logkv('ItrTime', time.time() - itr_start_time)

                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr)
                logger.save_itr_params(itr, params)
                logger.log("Saved")

                logger.dumpkvs()
                if itr == 0:
                    sess.graph.finalize()

        logger.log("Training finished")
        self.sess.close()
Example #9
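An off-policy trainer: it pre-fills a replay buffer with random exploration, then each iteration adds new samples to the buffer, runs deterministic evaluation rollouts, and performs several gradient steps of policy optimization from the buffer.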
    def train(self):
        """
        Trains policy on env using algo
        Pseudocode:
            for itr in n_itr:
                for step in num_inner_grad_steps:
                    sampler.sample()
                    algo.compute_updated_dists()
                algo.optimize_policy()
                sampler.update_goals()
        """

        with self.sess.as_default() as sess:
            # initialize uninitialized vars  (only initialize vars that were not loaded)
            uninit_vars = [
                var for var in tf.global_variables()
                if not sess.run(tf.is_variable_initialized(var))
            ]
            sess.run(tf.variables_initializer(uninit_vars))
            start_time = time.time()

            if self.start_itr == 0:
                self.algo._update_target(tau=1.0)
                if self.n_initial_exploration_steps > 0:
                    while self.replay_buffer._size < self.n_initial_exploration_steps:
                        paths = self.sampler.obtain_samples(
                            log=True, log_prefix='train-', random=True)
                        samples_data = self.sample_processor.process_samples(
                            paths, log='all', log_prefix='train-')
                        sample_num = samples_data['observations'].shape[0]
                        for i in range(sample_num):
                            self.replay_buffer.add_sample(
                                samples_data['observations'][i],
                                samples_data['actions'][i],
                                samples_data['rewards'][i],
                                samples_data['dones'][i],
                                samples_data['next_observations'][i],
                            )

            for itr in range(self.start_itr, self.n_itr):
                itr_start_time = time.time()
                logger.log(
                    "\n ---------------- Iteration %d ----------------" % itr)
                logger.log(
                    "Sampling set of tasks/goals for this meta-batch...")
                """ -------------------- Sampling --------------------------"""

                logger.log("Obtaining samples...")
                time_env_sampling_start = time.time()
                paths = self.sampler.obtain_samples(log=True,
                                                    log_prefix='train-')
                sampling_time = time.time() - time_env_sampling_start
                """ ----------------- Processing Samples ---------------------"""
                # check how the samples are processed
                logger.log("Processing samples...")
                time_proc_samples_start = time.time()
                samples_data = self.sample_processor.process_samples(
                    paths, log='all', log_prefix='train-')
                sample_num = samples_data['observations'].shape[0]
                for i in range(sample_num):
                    self.replay_buffer.add_sample(
                        samples_data['observations'][i],
                        samples_data['actions'][i], samples_data['rewards'][i],
                        samples_data['dones'][i],
                        samples_data['next_observations'][i])
                proc_samples_time = time.time() - time_proc_samples_start

                paths = self.sampler.obtain_samples(log=True,
                                                    log_prefix='eval-',
                                                    deterministic=True)
                _ = self.sample_processor.process_samples(paths,
                                                          log='all',
                                                          log_prefix='eval-')

                self.log_diagnostics(paths, prefix='train-')
                """ ------------------ Policy Update ---------------------"""

                logger.log("Optimizing policy...")

                time_optimization_step_start = time.time()
                self.algo.optimize_policy(self.replay_buffer,
                                          itr - self.start_itr,
                                          self.num_grad_steps)
                """ ------------------- Logging Stuff --------------------------"""
                logger.logkv('Itr', itr)
                logger.logkv('n_timesteps',
                             self.sampler.total_timesteps_sampled)

                logger.logkv('Time-Optimization',
                             time.time() - time_optimization_step_start)
                logger.logkv('Time-SampleProc', np.sum(proc_samples_time))
                logger.logkv('Time-Sampling', sampling_time)

                logger.logkv('Time', time.time() - start_time)
                logger.logkv('ItrTime', time.time() - itr_start_time)

                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr)
                logger.save_itr_params(itr, params)
                logger.log("Saved")

                logger.dumpkvs()
                if itr == 0:
                    sess.graph.finalize()

        logger.log("Training finished")
        self.sess.close()
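
All of the examples above instantiate the same skeleton sketched in their docstrings: sample trajectories, process them into training batches, optimize the policy, log timings, and save a snapshot. The sketch below distills that shared loop; the component objects (sampler, sample_processor, algo, logger, get_snapshot) and their method names are assumptions chosen to mirror the calls in the examples, not an actual library API.

import time


def train_loop(sampler, sample_processor, algo, logger, get_snapshot,
               start_itr=0, n_itr=100):
    """Minimal sketch of the loop the examples above share (assumed interfaces)."""
    start_time = time.time()
    for itr in range(start_itr, n_itr):
        itr_start_time = time.time()

        # 1) Sample trajectories from the environment (or a learned model).
        paths = sampler.obtain_samples(log=True, log_prefix='train-')

        # 2) Turn raw paths into training batches (advantages, returns, ...).
        samples_data = sample_processor.process_samples(paths, log='all',
                                                        log_prefix='train-')

        # 3) One or more gradient updates on the processed samples.
        algo.optimize_policy(samples_data)

        # 4) Log progress/timing and persist a snapshot of the run state.
        logger.logkv('Itr', itr)
        logger.logkv('Time', time.time() - start_time)
        logger.logkv('ItrTime', time.time() - itr_start_time)
        logger.save_itr_params(itr, get_snapshot(itr))
        logger.dumpkvs()

    logger.log("Training finished")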