예제 #1
0
    def train(self):
        """
        Sweep over the number of rollouts per meta-task and log sampling statistics.

        For every ``i`` in ``1..self.eff`` the sampler batch size is set to ``i``,
        a set of tasks is drawn with ``is_eval=True``, samples are obtained and
        processed, and the diagnostics plus the sampled-timestep counter are
        logged. The policy is never optimized here. The session is closed once
        the whole sweep is over.
        """
        for i in range(1, self.eff+1):

            with self.sess.as_default() as sess:

                logger.log("----------- Adaptation rollouts per meta-task = ", i, " -----------")
                self.sampler.update_batch_size(i)

                # initialize uninitialized vars  (only initialize vars that were not loaded);
                # re-checked in every round of the sweep
                uninit_vars = [var for var in tf.global_variables() if not sess.run(tf.is_variable_initialized(var))]
                sess.run(tf.variables_initializer(uninit_vars))

                self.task = self.env.sample_tasks(self.sampler.meta_batch_size, is_eval=True)
                self.sampler.set_tasks(self.task)

                logger.log("Sampling set of tasks/goals for this meta-batch...")

                # -------------------- Sampling --------------------------

                logger.log("Obtaining samples...")
                paths = self.sampler.obtain_samples(log=True, log_prefix='train-')

                # ----------------- Processing Samples ---------------------

                logger.log("Processing samples...")
                # The return value is not needed: there is no policy update in this sweep.
                self.sample_processor.process_samples(paths, log='all', log_prefix='train-')
                self.log_diagnostics(sum(paths.values(), []), prefix='train-')

                # ------------------- Logging Stuff --------------------------
                logger.logkv('n_timesteps', self.sampler.total_timesteps_sampled)

                logger.dumpkvs()

        # Emitted once after the whole sweep rather than once per batch size.
        logger.log("Training finished")
        self.sess.close()
예제 #2
0
    def train(self):
        """
        Trains policy on env using algo

        Every iteration samples a new set of tasks, collects and processes
        rollouts, runs one policy optimization step and then measures the
        average undiscounted return on the test-task split with the sampler
        batch size temporarily set to 3.

        Pseudocode::

            for itr in n_itr:
                sampler.set_tasks(env.sample_tasks())
                sampler.sample()
                algo.optimize_policy()
                evaluate on the test split (logging only)
        """
        with self.sess.as_default() as sess:

            # initialize uninitialized vars  (only initialize vars that were not loaded)
            uninit_vars = [
                var for var in tf.global_variables()
                if not sess.run(tf.is_variable_initialized(var))
            ]
            sess.run(tf.variables_initializer(uninit_vars))

            start_time = time.time()
            for itr in range(self.start_itr, self.n_itr):
                self.task = self.env.sample_tasks(self.sampler.meta_batch_size)
                self.sampler.set_tasks(self.task)
                itr_start_time = time.time()
                logger.log(
                    "\n ---------------- Iteration %d ----------------" % itr)
                logger.log(
                    "Sampling set of tasks/goals for this meta-batch...")

                # -------------------- Sampling --------------------------

                logger.log("Obtaining samples...")
                time_env_sampling_start = time.time()
                paths = self.sampler.obtain_samples(log=True,
                                                    log_prefix='train-')
                sampling_time = time.time() - time_env_sampling_start

                # ----------------- Processing Samples ---------------------

                logger.log("Processing samples...")
                time_proc_samples_start = time.time()
                samples_data = self.sample_processor.process_samples(
                    paths, log='all', log_prefix='train-')
                proc_samples_time = time.time() - time_proc_samples_start

                self.log_diagnostics(sum(paths.values(), []), prefix='train-')

                # ------------------ Policy Update ---------------------

                logger.log("Optimizing policy...")
                # This needs to take all samples_data so that it can construct graph for meta-optimization.
                time_optimization_step_start = time.time()
                self.algo.optimize_policy(samples_data)
                # Stop the clock right here so that the test-split rollouts
                # below are not reported as optimization time.
                optimization_time = time.time() - time_optimization_step_start

                # ------------------ Test-split Performance for logging ---------------------

                logger.log("Testing on test-tasks split for logging...")

                # Remember the training batch size; it is restored after testing.
                sampler_batch_size = self.sampler.batch_size
                self.sampler.update_batch_size(3)

                undiscounted_returns = []
                for i in range(0, self.env.NUM_EVAL,
                               self.sampler.meta_batch_size):
                    # NOTE(review): the original author remarks that this loop
                    # runs only once (i == 0) in their setup because
                    # meta_batch_size is 100 there - confirm against the config.

                    self.sampler.update_tasks(
                        test=True, start_from=i)  # sample from test split!

                    logger.log("On Test: Obtaining samples...")
                    paths = self.sampler.obtain_samples(log=False, test=True)

                    logger.log("On Test: Processing Samples...")
                    self.log_diagnostics(sum(list(paths.values()), []),
                                         prefix='test-')

                    # ------------------- Logging Returns --------------------
                    paths = self.sample_processor.gao_paths(paths)
                    undiscounted_returns.extend(
                        [sum(path["rewards"]) for path in paths])

                test_average_return = np.mean(undiscounted_returns)
                self.sampler.update_batch_size(sampler_batch_size)

                # ------------------- Logging Stuff --------------------------

                logger.logkv('Itr', itr)
                logger.logkv('n_timesteps',
                             self.sampler.total_timesteps_sampled)

                logger.logkv('test-AverageReturn', test_average_return)

                logger.logkv('Time-Optimization', optimization_time)
                logger.logkv('Time-SampleProc', np.sum(proc_samples_time))
                logger.logkv('Time-Sampling', sampling_time)

                logger.logkv('Time', time.time() - start_time)
                logger.logkv('ItrTime', time.time() - itr_start_time)

                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr)
                logger.save_itr_params(itr, params)
                logger.log("Saved")

                logger.dumpkvs()
                if itr == 0:
                    sess.graph.finalize()

        logger.log("Training finished")
        self.sess.close()
예제 #3
0
    def train(self):
        """
        Run the meta-training loop.

        Each iteration switches the policy back to its pre-update state, then
        alternates sampling, sample processing and inner adaptation for
        ``num_inner_grad_steps`` steps, followed by one more sampling pass
        without adaptation. The data of all steps is handed to the outer
        optimization. Timings and a parameter snapshot are logged per
        iteration, and the session is closed when the loop is done.
        """
        with self.sess.as_default() as session:

            # Only variables that are not initialized yet get initialized.
            pending_vars = []
            for variable in tf.global_variables():
                if not session.run(tf.is_variable_initialized(variable)):
                    pending_vars.append(variable)
            session.run(tf.variables_initializer(pending_vars))

            t_train_begin = time.time()
            for itr in range(self.start_itr, self.n_itr):
                t_itr_begin = time.time()
                logger.log("\n ---------------- Iteration %d ----------------" % itr)
                logger.log("Sampling set of tasks/goals for this meta-batch...")

                # Start every iteration from the pre-update policy.
                self.policy.switch_to_pre_update()

                samples_per_step = []
                paths_per_step = []
                sampling_durations = []
                inner_durations = []
                processing_durations = []

                t_inner_begin = time.time()
                for step in range(self.num_inner_grad_steps+1):
                    logger.log('** Step ' + str(step) + ' **')
                    step_prefix = 'Step_%d-' % step

                    # ---- sampling ----
                    logger.log("Obtaining samples...")
                    t_sampling_begin = time.time()
                    paths = self.sampler.obtain_samples(log=True, log_prefix=step_prefix)
                    sampling_durations.append(time.time() - t_sampling_begin)
                    paths_per_step.append(paths)

                    # ---- processing ----
                    logger.log("Processing samples...")
                    t_processing_begin = time.time()
                    samples_data = self.sample_processor.process_samples(
                        paths, log='all', log_prefix=step_prefix)
                    samples_per_step.append(samples_data)
                    processing_durations.append(time.time() - t_processing_begin)

                    flat_paths = sum(list(paths.values()), [])
                    self.log_diagnostics(flat_paths, prefix=step_prefix)

                    # ---- inner adaptation (skipped on the last pass) ----
                    t_adapt_begin = time.time()
                    if step < self.num_inner_grad_steps:
                        logger.log("Computing inner policy updates...")
                        self.algo._adapt(samples_data)
                    inner_durations.append(time.time() - t_adapt_begin)
                inner_total = time.time() - t_inner_begin

                t_maml_begin = time.time()

                # ---- outer optimization over the data of all steps ----
                logger.log("Optimizing policy...")
                t_outer_begin = time.time()
                self.algo.optimize_policy(samples_per_step)

                # ---- logging ----
                logger.logkv('Itr', itr)
                logger.logkv('n_timesteps', self.sampler.total_timesteps_sampled)
                logger.logkv('Time-OuterStep', time.time() - t_outer_begin)
                logger.logkv('Time-TotalInner', inner_total)
                logger.logkv('Time-InnerStep', np.sum(inner_durations))
                logger.logkv('Time-SampleProc', np.sum(processing_durations))
                logger.logkv('Time-Sampling', np.sum(sampling_durations))

                logger.logkv('Time', time.time() - t_train_begin)
                logger.logkv('ItrTime', time.time() - t_itr_begin)
                logger.logkv('Time-MAMLSteps', time.time() - t_maml_begin)

                logger.log("Saving snapshot...")
                snapshot = self.get_itr_snapshot(itr)
                logger.save_itr_params(itr, snapshot)
                logger.log("Saved")

                logger.dumpkvs()

        logger.log("Training finished")
        self.sess.close()
예제 #4
0
    def train(self):
        """
        Evaluate adaptation on the test split for several rollout budgets.

        For each ``i`` in ``[4, 3, 2, 1]`` the test tasks are visited in chunks
        of ``self.sampler.meta_batch_size``. Per chunk the policy is switched
        to its pre-update state, adapted for ``num_inner_grad_steps`` steps with
        the batch size set through ``update_batch_size_v2(i)``, and finally
        sampled once more with batch size 2. The undiscounted returns of that
        last pass are averaged over all chunks and logged as ``return``
        together with ``x = i``.
        """

        # Kept only so the print below can report whether self.policy still
        # compares equal to the policy seen on entry.
        policy_0 = self.policy

        for i in [4, 3, 2, 1]:  # previously: range(1, self.eff+1)

            print("On", i, "self.policy == policy_0: ",
                  self.policy == policy_0)

            with self.sess.as_default() as sess:

                logger.log("----------- Adaptation rollouts per meta-task = ",
                           i, " -----------")

                # Collects post-adaptation returns across all test-task chunks for this i.
                undiscounted_returns = []
                for j in range(0, self.env.NUM_EVAL,
                               self.sampler.meta_batch_size):

                    logger.log("---------Testing on task", j, "~",
                               j + self.sampler.meta_batch_size - 1,
                               "---------")

                    # Earlier variant (disabled): initialize only the variables
                    # that were not loaded.
                    # uninit_vars = [var for var in tf.global_variables() if
                    #                not sess.run(tf.is_variable_initialized(var))]
                    # sess.run(tf.variables_initializer(uninit_vars))

                    # NOTE(review): despite its name, uninit_vars holds ALL
                    # global variables, so every variable is re-initialized for
                    # each chunk. Presumably this discards any loaded values -
                    # confirm this is intended.
                    uninit_vars = [var for var in tf.global_variables()]
                    sess.run(tf.variables_initializer(uninit_vars))

                    logger.log(
                        "Sampling set of tasks/goals for this meta-batch...")
                    self.sampler.update_tasks(
                        test=True, start_from=j)  # sample from test split!
                    self.policy.switch_to_pre_update(
                    )  # Switch to pre-update policy

                    for step in range(self.num_inner_grad_steps + 1):

                        # Adaptation passes use the swept batch size i; the final
                        # pass uses a fixed batch size of 2. The "step-0"/"step-1"
                        # wording in the log messages is hard-coded and only
                        # matches the step index when num_inner_grad_steps == 1.
                        if step < self.num_inner_grad_steps:
                            self.sampler.update_batch_size_v2(
                                i)
                            logger.log("On step-0: Obtaining samples...")
                        else:
                            self.sampler.update_batch_size(2)
                            logger.log("On step-1: Obtaining samples...")

                        paths = self.sampler.obtain_samples(
                            log=False,
                            test=True)  # log_prefix='test-Step_%d-' % step

                        logger.log("On Test: Processing Samples...")
                        samples_data = self.sample_processor.process_samples(
                            paths, log=False
                        )  # log='all', log_prefix='test-Step_%d-' % step
                        self.log_diagnostics(sum(list(paths.values()), []),
                                             prefix='test-Step_%d-' % step)
                        """ ------------------- Inner Policy Update / logging returns --------------------"""
                        if step < self.num_inner_grad_steps:
                            logger.log(
                                "On Test: Computing inner policy updates...")
                            self.algo._adapt(samples_data)
                        else:
                            # Final pass: record the summed rewards of every path.
                            paths = self.sample_processor.gao_paths(paths)
                            undiscounted_returns.extend(
                                [sum(path["rewards"]) for path in paths])

                # One log row per rollout budget: x = i, return = mean over all chunks.
                test_average_return = np.mean(undiscounted_returns)
                logger.logkv('x', i)
                logger.logkv('return', test_average_return)
                logger.dumpkvs()

            logger.log("------Testing rollouts per meta-task = ", i,
                       "finished------")
            '''
예제 #5
0
    def train(self):
        """
        Trains policy on env using algo and logs gradient-variance statistics

        Every iteration performs ``num_sapling_rounds`` sampling rounds. Each
        round runs the inner adaptation loop and stores the gradients returned
        by ``algo.compute_gradients``. In the last (non-"dry") round the spread
        of those gradients across rounds is logged, followed by the outer
        policy update and a snapshot.

        Pseudocode::

            for itr in n_itr:
                for round in num_sapling_rounds:
                    for step in num_inner_grad_steps:
                        sampler.sample()
                        algo._adapt()
                    gradients.append(algo.compute_gradients())
                log gradient statistics
                algo.optimize_policy()
        """
        with self.sess.as_default() as sess:

            # initialize uninitialized vars  (only initialize vars that were not loaded)
            uninit_vars = [
                var for var in tf.global_variables()
                if not sess.run(tf.is_variable_initialized(var))
            ]
            sess.run(tf.variables_initializer(uninit_vars))
            n_timesteps = 0

            start_time = time.time()
            for itr in range(self.start_itr, self.n_itr):
                itr_start_time = time.time()
                logger.log(
                    "\n ---------------- Iteration %d ----------------" % itr)

                gradients = []
                for i in range(self.num_sapling_rounds):
                    logger.log("\n ----- Sampling Round %d ---" % i)

                    # All rounds except the last are "dry": they only
                    # contribute gradients, without diagnostics, task updates
                    # or the outer policy update.
                    dry = i < self.num_sapling_rounds - 1

                    if not dry:
                        self.sampler.update_tasks()
                    self.policy.switch_to_pre_update()  # Switch to pre-update policy

                    all_samples_data, all_paths = [], []

                    for step in range(self.num_inner_grad_steps + 1):
                        logger.log('** Step ' + str(step) + ' **')

                        logger.log("Obtaining samples...")
                        paths = self.sampler.obtain_samples(
                            log=True, log_prefix='Step_%d-' % step)
                        all_paths.append(paths)

                        logger.log("Processing samples...")
                        samples_data = self.sample_processor.process_samples(
                            paths, log='all', log_prefix='Step_%d-' % step)
                        all_samples_data.append(samples_data)

                        if not dry:
                            self.log_diagnostics(sum(list(paths.values()), []),
                                                 prefix='Step_%d-' % step)

                        if step < self.num_inner_grad_steps:
                            logger.log("Computing inner policy updates...")
                            self.algo._adapt(samples_data)

                    # compute gradients
                    gradients.append(
                        self.algo.compute_gradients(all_samples_data))

                    if not dry:
                        # ------------ Compute and log gradient variance ------------
                        # compute variance of adaptation gradients
                        for step_id in range(self.num_inner_grad_steps):
                            meta_batch_size = len(gradients[0][0])
                            grad_mean, grad_std, grad_rstd = [], [], []
                            for task_id in range(meta_batch_size):
                                # one column per sampling round
                                stacked_grads = np.stack(
                                    [
                                        gradients[round_id][step_id][task_id]
                                        for round_id in range(
                                            self.num_sapling_rounds)
                                    ],
                                    axis=1)
                                std = np.std(stacked_grads, axis=1)
                                mean = np.abs(np.mean(stacked_grads, axis=1))
                                grad_mean.append(np.mean(mean))
                                grad_std.append(np.mean(std))
                                # NOTE(review): unlike the meta-gradient
                                # statistics below, no epsilon is added here,
                                # so a zero mean yields inf/nan.
                                grad_rstd.append(np.mean(std / mean))

                            # The key is formatted with step_id (it used to be
                            # logged under the literal 'Step_%i-GradientMean')
                            # and the value is averaged over all tasks instead
                            # of reusing the last task's `mean`.
                            logger.logkv('Step_%i-GradientMean' % step_id,
                                         np.mean(grad_mean))
                            logger.logkv('Step_%i-GradientStd' % step_id,
                                         np.mean(grad_std))
                            logger.logkv('Step_%i-GradientRStd' % step_id,
                                         np.mean(grad_rstd))

                        # compute variance of meta gradients
                        stacked_grads = np.stack(
                            [
                                gradients[round_id][self.num_inner_grad_steps]
                                for round_id in range(self.num_sapling_rounds)
                            ],
                            axis=1)
                        std = np.std(stacked_grads, axis=1)
                        mean = np.abs(np.mean(stacked_grads, axis=1))

                        meta_grad_std = np.mean(std)
                        meta_grad_rstd = np.mean(std / (mean + 1e-8))
                        meta_grad_rvar = np.mean(std**2 / (mean + 1e-8))

                        logger.logkv('Meta-GradientMean', np.mean(mean))
                        logger.logkv('Meta-GradientStd', meta_grad_std)
                        logger.logkv('Meta-GradientRStd', meta_grad_rstd)
                        logger.logkv('Meta-GradientRVariance', meta_grad_rvar)

                        # cosine distance of every round's meta gradient to
                        # the mean meta gradient across rounds
                        mean_meta_grad = np.mean(stacked_grads,
                                                 axis=1).reshape((-1, 1))
                        cosine_dists = cdist(np.transpose(stacked_grads),
                                             np.transpose(mean_meta_grad),
                                             metric='cosine')
                        mean_abs_cos_dist = np.mean(np.abs(cosine_dists))
                        mean_squared_cosine_dists = np.mean(cosine_dists**2)
                        mean_squared_cosine_dists_sqrt = np.sqrt(
                            mean_squared_cosine_dists)

                        logger.logkv('Meta-GradientCosAbs', mean_abs_cos_dist)
                        logger.logkv('Meta-GradientCosVar',
                                     mean_squared_cosine_dists)
                        logger.logkv('Meta-GradientCosStd',
                                     mean_squared_cosine_dists_sqrt)

                        # ------------------ Outer Policy Update ---------------------

                        logger.log("Optimizing policy...")
                        # This needs to take all samples_data so that it can construct graph for meta-optimization.
                        self.algo.optimize_policy(all_samples_data)

                        # ------------------- Logging Stuff --------------------------
                        n_timesteps += (self.num_inner_grad_steps +
                                        1) * self.sampler.total_samples
                        logger.logkv('n_timesteps', n_timesteps)

                        logger.log("Saving snapshot...")
                        params = self.get_itr_snapshot(itr)
                        logger.save_itr_params(itr, params)
                        logger.log("Saved")

                        logger.logkv('Itr', itr)
                        logger.logkv('Time', time.time() - start_time)
                        logger.logkv('ItrTime', time.time() - itr_start_time)

                logger.dumpkvs()

        logger.log("Training finished")
        self.sess.close()
예제 #6
0
    def train(self):
        """
        Trains policy on env using algo

        After the outer optimization step of every iteration the policy is
        evaluated on the test-task split twice: first with the sampler's
        current batch size (logged as ``test20-AverageReturn``), then with the
        batch size temporarily set to 2 (logged as ``test-AverageReturn``).

        Pseudocode::

            for itr in n_itr:
                sampler.update_tasks()
                for step in num_inner_grad_steps:
                    sampler.sample()
                    algo._adapt()
                algo.optimize_policy()
                evaluate on the test split (logging only)
        """
        with self.sess.as_default() as sess:

            # initialize uninitialized vars  (only initialize vars that were not loaded)
            uninit_vars = [
                var for var in tf.global_variables()
                if not sess.run(tf.is_variable_initialized(var))
            ]
            sess.run(tf.variables_initializer(uninit_vars))

            start_time = time.time()
            for itr in range(self.start_itr, self.n_itr):
                itr_start_time = time.time()
                logger.log(
                    "\n ---------------- Iteration %d ----------------" % itr)
                logger.log(
                    "Sampling set of tasks/goals for this meta-batch...")

                self.sampler.update_tasks()  # sample tasks!
                self.policy.switch_to_pre_update()  # Switch to pre-update policy

                all_samples_data, all_paths = [], []
                list_sampling_time, list_inner_step_time, list_proc_samples_time = [], [], []
                start_total_inner_time = time.time()
                for step in range(self.num_inner_grad_steps + 1):

                    logger.log('** Step ' + str(step) + ' **')

                    # -------------------- Sampling --------------------------

                    logger.log("Obtaining samples...")
                    time_env_sampling_start = time.time()
                    paths = self.sampler.obtain_samples(
                        log=True, log_prefix='Step_%d-' % step)
                    list_sampling_time.append(time.time() -
                                              time_env_sampling_start)
                    all_paths.append(paths)

                    # ----------------- Processing Samples ---------------------

                    logger.log("Processing samples...")
                    time_proc_samples_start = time.time()
                    samples_data = self.sample_processor.process_samples(
                        paths, log='all', log_prefix='Step_%d-' % step)
                    all_samples_data.append(samples_data)
                    list_proc_samples_time.append(time.time() -
                                                  time_proc_samples_start)

                    self.log_diagnostics(sum(list(paths.values()), []),
                                         prefix='Step_%d-' % step)

                    # ------------------- Inner Policy Update --------------------

                    time_inner_step_start = time.time()
                    if step < self.num_inner_grad_steps:
                        logger.log("Computing inner policy updates...")
                        self.algo._adapt(samples_data)
                    list_inner_step_time.append(time.time() -
                                                time_inner_step_start)
                total_inner_time = time.time() - start_total_inner_time

                time_maml_opt_start = time.time()

                # ------------------ Outer Policy Update ---------------------

                logger.log("Optimizing policy...")
                # This needs to take all samples_data so that it can construct graph for meta-optimization.
                time_outer_step_start = time.time()
                self.algo.optimize_policy(all_samples_data)
                # Take both optimization timings now; measuring them at log
                # time would also count the two test-split evaluations below.
                time_optimization_end = time.time()
                outer_step_time = time_optimization_end - time_outer_step_start
                maml_steps_time = time_optimization_end - time_maml_opt_start

                # ------------------ Test-split Performance for logging ---------------------

                logger.log(
                    "Testing on test-tasks split for logging, rollout_per_task = 20..."
                )
                test_average_return = np.mean(
                    self._collect_test_returns('test20'))
                logger.logkv('test20-AverageReturn', test_average_return)

                logger.log(
                    "Testing on test-tasks split for logging, rollout_per_task = 2..."
                )
                # Remember the training batch size; it is restored after testing.
                sampler_batch_size = self.sampler.batch_size
                self.sampler.update_batch_size(2)
                test_average_return = np.mean(
                    self._collect_test_returns('test'))
                self.sampler.update_batch_size(sampler_batch_size)

                # ------------------- Logging Stuff --------------------------
                logger.logkv('Itr', itr)
                logger.logkv('n_timesteps',
                             self.sampler.total_timesteps_sampled)

                logger.logkv('test-AverageReturn', test_average_return)

                logger.logkv('Time-OuterStep', outer_step_time)
                logger.logkv('Time-TotalInner', total_inner_time)
                logger.logkv('Time-InnerStep', np.sum(list_inner_step_time))
                logger.logkv('Time-SampleProc', np.sum(list_proc_samples_time))
                logger.logkv('Time-Sampling', np.sum(list_sampling_time))

                logger.logkv('Time', time.time() - start_time)
                logger.logkv('ItrTime', time.time() - itr_start_time)
                logger.logkv('Time-MAMLSteps', maml_steps_time)

                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr)
                logger.save_itr_params(itr, params)
                logger.log("Saved")

                logger.dumpkvs()

        logger.log("Training finished")
        self.sess.close()

    def _collect_test_returns(self, tag):
        """
        Adapt on every chunk of test-split tasks and return the undiscounted
        returns of the paths sampled after the last adaptation step.

        Args:
            tag (str): label used in the diagnostics prefix ``'<tag>-Step_<n>-'``

        Returns:
            (list): one summed reward per path, over all test-task chunks
        """
        undiscounted_returns = []
        for start in range(0, self.env.NUM_EVAL, self.sampler.meta_batch_size):
            self.sampler.update_tasks(test=True, start_from=start)  # sample from test split!
            self.policy.switch_to_pre_update()  # Switch to pre-update policy

            for step in range(self.num_inner_grad_steps + 1):
                logger.log("On Test: Obtaining samples...")
                paths = self.sampler.obtain_samples(log=False, test=True)

                logger.log("On Test: Processing Samples...")
                samples_data = self.sample_processor.process_samples(
                    paths, log=False)
                self.log_diagnostics(sum(list(paths.values()), []),
                                     prefix='%s-Step_%d-' % (tag, step))

                # Inner policy update on all but the last step; the last
                # step's paths provide the returns.
                if step < self.num_inner_grad_steps:
                    logger.log("On Test: Computing inner policy updates...")
                    self.algo._adapt(samples_data)
                else:
                    paths = self.sample_processor.gao_paths(paths)
                    undiscounted_returns.extend(
                        [sum(path["rewards"]) for path in paths])
        return undiscounted_returns
예제 #7
0
        for i in range(params['num_inner_grad_steps']):
            paths = sampler.obtain_samples(log=False)
            samples_data = sample_processor.process_samples(paths,
                                                            log=True,
                                                            log_prefix='%i_' %
                                                            i)
            env.log_diagnostics(sum(list(paths.values()), []),
                                prefix='%i_' % i)
            algo._adapt(samples_data)

        paths = sampler.obtain_samples(log=False)
        samples_data = sample_processor.process_samples(
            paths, log=True, log_prefix='%i_' % params['num_inner_grad_steps'])
        env.log_diagnostics(sum(list(paths.values()), []),
                            prefix='%i_' % params['num_inner_grad_steps'])
        logger.dumpkvs()
        images = []

        # Postupdate:
        for _ in range(args.num_trajs):
            task_i = np.random.choice(range(params['meta_batch_size']))
            env.set_task(tasks[task_i])
            print(tasks[task_i])
            obs = env.reset()
            for _ in range(params['max_path_length']):
                action, _ = policy.get_action(obs, task_i)
                obs, reward, done, _ = env.step(action)
                time.sleep(0.001)
                if done:
                    break