Example #1
    def train(self):
        """Overrides super.train to synchronize global vars."""

        if hasattr(self, "optimizer") and isinstance(self.optimizer,
                                                     PolicyOptimizer):
            self.global_vars["timestep"] = self.optimizer.num_steps_sampled
            self.optimizer.local_evaluator.set_global_vars(self.global_vars)
            for ev in self.optimizer.remote_evaluators:
                ev.set_global_vars.remote(self.global_vars)
            logger.debug("updated global vars: {}".format(self.global_vars))

        if (self.config.get("observation_filter", "NoFilter") != "NoFilter"
                and hasattr(self, "local_evaluator")):
            FilterManager.synchronize(
                self.local_evaluator.filters,
                self.remote_evaluators,
                update_remote=self.config["synchronize_filters"])
            logger.debug("synchronized filters: {}".format(
                self.local_evaluator.filters))

        result = Trainable.train(self)
        if self.config["callbacks"].get("on_train_result"):
            self.config["callbacks"]["on_train_result"]({
                "agent": self,
                "result": result,
            })
        return result
Example #2
    def _train(self):
        def postprocess_samples(batch):
            # Divide by the maximum of value.std() and 1e-4
            # to guard against the case where all values are equal
            value = batch["advantages"]
            standardized = (value - value.mean()) / max(1e-4, value.std())
            batch.data["advantages"] = standardized
            batch.shuffle()
            dummy = np.zeros_like(batch["advantages"])
            if not self.config["use_gae"]:
                batch.data["value_targets"] = dummy
                batch.data["vf_preds"] = dummy
        extra_fetches = self.optimizer.step(postprocess_fn=postprocess_samples)
        kl = np.array(extra_fetches["kl"]).mean(axis=1)[-1]
        total_loss = np.array(extra_fetches["total_loss"]).mean(axis=1)[-1]
        policy_loss = np.array(extra_fetches["policy_loss"]).mean(axis=1)[-1]
        vf_loss = np.array(extra_fetches["vf_loss"]).mean(axis=1)[-1]
        entropy = np.array(extra_fetches["entropy"]).mean(axis=1)[-1]

        newkl = self.local_evaluator.for_policy(lambda pi: pi.update_kl(kl))

        info = {
            "kl_divergence": kl,
            "kl_coefficient": newkl,
            "total_loss": total_loss,
            "policy_loss": policy_loss,
            "vf_loss": vf_loss,
            "entropy": entropy,
        }

        FilterManager.synchronize(
            self.local_evaluator.filters, self.remote_evaluators)
        res = collect_metrics(self.local_evaluator, self.remote_evaluators)
        res = res._replace(info=info)
        return res
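Several of the examples on this page (#2, #8, #23, #26 and #27) standardize advantages with the same max(1e-4, value.std()) clamp. The short NumPy snippet below is not taken from any of these projects; it only illustrates why the clamp matters when every advantage in a batch is equal:

import numpy as np


def standardized(value):
    # Dividing by max(std, 1e-4) avoids dividing by zero (and producing
    # NaNs) when all advantages in the batch are identical.
    return (value - value.mean()) / max(1e-4, value.std())


print(standardized(np.full(4, 2.0)))             # [0. 0. 0. 0.], not NaNs
print(standardized(np.array([1., 2., 3., 4.])))  # zero-mean, unit-variance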
Example #3
 def __setstate__(self, state):
     self.episodes_so_far = state['episodes_so_far']
     self.common.policy.set_flat_weights(state['weights'])
     self.common.policy.observation_filter = state["filter"]
     FilterManager.synchronize(
         {DEFAULT_POLICY_ID: self.common.policy.observation_filter},
         self._workers)
Example #4
 def __setstate__(self, state):
     self.episodes_so_far = state["episodes_so_far"]
     self.policy.set_weights(state["weights"])
     self.policy.set_filter(state["filter"])
     FilterManager.synchronize({
         "default": self.policy.get_filter()
     }, self.workers)
Example #5
 def __setstate__(self, state):
     self.episodes_so_far = state["episodes_so_far"]
     self.policy.set_weights(state["weights"])
     self.policy.set_filter(state["filter"])
     FilterManager.synchronize({
         DEFAULT_POLICY_ID: self.policy.get_filter()
     }, self._workers)
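Examples #3 through #6 restore trainer state inside __setstate__; the dictionary they consume is produced by a matching __getstate__. Below is a hypothetical sketch of that counterpart, using the same keys and the get_weights()/get_filter() accessors that appear in other examples on this page; the real implementations may save additional state.

def __getstate__(self):
    # Mirrors the keys consumed by __setstate__ in Examples #4-#6;
    # a sketch only, not code from any of the listed projects.
    return {
        "episodes_so_far": self.episodes_so_far,
        "weights": self.policy.get_weights(),
        "filter": self.policy.get_filter(),
    }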
Example #6
 def __setstate__(self, state):
     self.episodes_so_far = state["episodes_so_far"]
     self.policy.set_weights(state["weights"])
     self.policy.set_filter(state["filter"])
     FilterManager.synchronize({
         "default": self.policy.get_filter()
     }, self.workers)
Example #7
    def train(self):
        """Overrides super.train to synchronize global vars."""

        if hasattr(self, "optimizer") and isinstance(self.optimizer,
                                                     PolicyOptimizer):
            self.global_vars["timestep"] = self.optimizer.num_steps_sampled
            self.optimizer.local_evaluator.set_global_vars(self.global_vars)
            for ev in self.optimizer.remote_evaluators:
                ev.set_global_vars.remote(self.global_vars)
            logger.debug("updated global vars: {}".format(self.global_vars))

        if (self.config.get("observation_filter", "NoFilter") != "NoFilter"
                and hasattr(self, "local_evaluator")):
            FilterManager.synchronize(
                self.local_evaluator.filters,
                self.remote_evaluators,
                update_remote=self.config["synchronize_filters"])
            logger.debug("synchronized filters: {}".format(
                self.local_evaluator.filters))

        result = Trainable.train(self)
        if self.config["callbacks"].get("on_train_result"):
            self.config["callbacks"]["on_train_result"]({
                "agent": self,
                "result": result,
            })
        return result
Example #8
    def _train(self):
        def postprocess_samples(batch):
            # Divide by the maximum of value.std() and 1e-4
            # to guard against the case where all values are equal
            value = batch["advantages"]
            standardized = (value - value.mean()) / max(1e-4, value.std())
            batch.data["advantages"] = standardized
            batch.shuffle()
            dummy = np.zeros_like(batch["advantages"])
            if not self.config["use_gae"]:
                batch.data["value_targets"] = dummy
                batch.data["vf_preds"] = dummy

        extra_fetches = self.optimizer.step(postprocess_fn=postprocess_samples)

        final_metrics = np.array(extra_fetches).mean(axis=1)[-1, :].tolist()
        total_loss, policy_loss, vf_loss, kl, entropy = final_metrics
        self.local_evaluator.update_kl(kl)

        info = {
            "total_loss": total_loss,
            "policy_loss": policy_loss,
            "vf_loss": vf_loss,
            "kl_divergence": kl,
            "entropy": entropy,
            "kl_coefficient": self.local_evaluator.kl_coeff_val,
        }

        FilterManager.synchronize(self.local_evaluator.filters,
                                  self.remote_evaluators)
        res = self._fetch_metrics_from_remote_evaluators()
        res = res._replace(info=info)
        return res
Example #9
 def _train(self):
     self.optimizer.step()
     FilterManager.synchronize(self.local_evaluator.filters,
                               self.remote_evaluators)
     result = collect_metrics(self.local_evaluator, self.remote_evaluators)
     result = result._replace(info=self.optimizer.stats())
     return result
Example #10
    def test_synchronize(self):
        """Synchronize applies filter buffer onto own filter"""
        filt1 = MeanStdFilter(())
        for i in range(10):
            filt1(i)
        self.assertEqual(filt1.rs.n, 10)
        filt1.clear_buffer()
        self.assertEqual(filt1.buffer.n, 0)

        RemoteWorker = ray.remote(_MockWorker)
        remote_e = RemoteWorker.remote(sample_count=10)
        remote_e.sample.remote()

        FilterManager.synchronize(
            {
                "obs_filter": filt1,
                "rew_filter": filt1.copy()
            }, [remote_e])

        filters = ray.get(remote_e.get_filters.remote())
        obs_f = filters["obs_filter"]
        self.assertEqual(filt1.rs.n, 20)
        self.assertEqual(filt1.buffer.n, 0)
        self.assertEqual(obs_f.rs.n, filt1.rs.n)
        self.assertEqual(obs_f.buffer.n, filt1.buffer.n)
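The test above pins down the observable behavior of FilterManager.synchronize: the remote worker's buffered filter statistics are folded into the driver-side filter (rs.n goes from 10 to 20), the buffers end up empty, and the remote copy matches the merged local one. The following is a rough, hypothetical sketch of that pattern, not the library's actual implementation; the get_filters(flush_after=...), sync_filters() and apply_changes() names are assumptions about the worker and filter interfaces.

import ray


def synchronize_sketch(local_filters, remotes):
    """Hypothetical sketch of the synchronize pattern exercised above."""
    # 1. Pull buffered deltas from every remote worker, flushing their
    #    buffers in the process (assumed worker API).
    remote_filters = ray.get(
        [r.get_filters.remote(flush_after=True) for r in remotes])
    # 2. Fold each delta into the corresponding driver-side filter
    #    (assumed filter API).
    for worker_filters in remote_filters:
        for key, delta in worker_filters.items():
            local_filters[key].apply_changes(delta, with_buffer=False)
    # 3. Broadcast the merged filters back so all workers agree.
    merged_id = ray.put({k: f.copy() for k, f in local_filters.items()})
    for r in remotes:
        r.sync_filters.remote(merged_id)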
Example #11
 def _sync_filters_if_needed(self, workers):
     if self.config.get("observation_filter", "NoFilter") != "NoFilter":
         FilterManager.synchronize(
             workers.local_worker().filters,
             workers.remote_workers(),
             update_remote=self.config["synchronize_filters"])
         logger.debug("synchronized filters: {}".format(
             workers.local_worker().filters))
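Example #11 factors the synchronization into a reusable helper. A hypothetical sketch of how such a helper might be called from the surrounding train() method, mirroring the post-train placement used in Example #13 (this caller is an assumption for illustration, not code from the project):

def train(self):
    # Hypothetical caller: run one training iteration, then fold the
    # workers' filter updates back in, as Example #13 does after
    # Trainable.train().
    result = Trainable.train(self)
    self._sync_filters_if_needed(self.workers)
    return result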
Example #12
 def _train(self):
     prev_steps = self.optimizer.num_steps_sampled
     self.optimizer.step()
     FilterManager.synchronize(self.local_evaluator.filters,
                               self.remote_evaluators)
     result = self.optimizer.collect_metrics()
     result.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
                   prev_steps)
     return result
Example #13
    def train(self):
        """Overrides super.train to synchronize global vars."""

        if self._has_policy_optimizer():
            self.global_vars["timestep"] = self.optimizer.num_steps_sampled
            self.optimizer.workers.local_worker().set_global_vars(
                self.global_vars)
            for w in self.optimizer.workers.remote_workers():
                w.set_global_vars.remote(self.global_vars)
            logger.debug("updated global vars: {}".format(self.global_vars))

        result = None
        for _ in range(1 + MAX_WORKER_FAILURE_RETRIES):
            try:
                result = Trainable.train(self)
            except RayError as e:
                if self.config["ignore_worker_failures"]:
                    logger.exception(
                        "Error in train call, attempting to recover")
                    self._try_recover()
                else:
                    logger.info(
                        "Worker crashed during call to train(). To attempt to "
                        "continue training without the failed worker, set "
                        "`'ignore_worker_failures': True`.")
                    raise e
            except Exception as e:
                time.sleep(0.5)  # allow logs messages to propagate
                raise e
            else:
                break
        if result is None:
            raise RuntimeError("Failed to recover from worker crash")

        if (self.config.get("observation_filter", "NoFilter") != "NoFilter"
                and hasattr(self, "workers")
                and isinstance(self.workers, WorkerSet)):
            FilterManager.synchronize(
                self.workers.local_worker().filters,
                self.workers.remote_workers(),
                update_remote=self.config["synchronize_filters"])
            logger.debug("synchronized filters: {}".format(
                self.workers.local_worker().filters))

        if self._has_policy_optimizer():
            result["num_healthy_workers"] = len(
                self.optimizer.workers.remote_workers())

        if self.config["evaluation_interval"]:
            if self._iteration % self.config["evaluation_interval"] == 0:
                evaluation_metrics = self._evaluate()
                assert isinstance(evaluation_metrics, dict), \
                    "_evaluate() needs to return a dict."
                result.update(evaluation_metrics)

        return result
Example #14
 def _train(self):
     prev_steps = self.optimizer.num_steps_sampled
     start = time.time()
     while time.time() - start < self.config["min_iter_time_s"]:
         self.optimizer.step()
         FilterManager.synchronize(self.local_evaluator.filters,
                                   self.remote_evaluators)
     result = self.optimizer.collect_metrics()
     result.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
                   prev_steps)
     return result
Example #15
File: ppo.py Project: zianhu7/ray
 def _train(self):
     prev_steps = self.optimizer.num_steps_sampled
     fetches = self.optimizer.step()
     self.local_evaluator.for_policy(lambda pi: pi.update_kl(fetches["kl"]))
     FilterManager.synchronize(self.local_evaluator.filters,
                               self.remote_evaluators)
     res = self.optimizer.collect_metrics()
     res = res._replace(
         timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps,
         info=dict(fetches, **res.info))
     return res
Example #16
 def _train(self):
     prev_steps = self.optimizer.num_steps_sampled
     fetches = self.optimizer.step()
     if "kl" in fetches:
         # single-agent
         self.local_evaluator.for_policy(
             lambda pi: pi.update_kl(fetches["kl"]))
     else:
         # multi-agent
         self.local_evaluator.foreach_trainable_policy(
             lambda pi, pi_id: pi.update_kl(fetches[pi_id]["kl"]))
     FilterManager.synchronize(self.local_evaluator.filters,
                               self.remote_evaluators)
     res = self.optimizer.collect_metrics()
     res = res._replace(
         timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps,
         info=dict(fetches, **res.info))
     return res
Example #17
    def train(self):
        """Overrides super.train to synchronize global vars."""

        if hasattr(self, "optimizer") and isinstance(self.optimizer,
                                                     PolicyOptimizer):
            self.global_vars["timestep"] = self.optimizer.num_steps_sampled
            self.optimizer.local_evaluator.set_global_vars(self.global_vars)
            for ev in self.optimizer.remote_evaluators:
                ev.set_global_vars.remote(self.global_vars)

        if (self.config.get("observation_filter", "NoFilter") != "NoFilter"
                and hasattr(self, "local_evaluator")):
            FilterManager.synchronize(
                self.local_evaluator.filters,
                self.remote_evaluators,
                update_remote=self.config["synchronize_filters"])

        return Trainable.train(self)
Example #18
File: agent.py Project: zhy52/ray
    def train(self):
        """Overrides super.train to synchronize global vars."""

        if self._has_policy_optimizer():
            self.global_vars["timestep"] = self.optimizer.num_steps_sampled
            self.optimizer.local_evaluator.set_global_vars(self.global_vars)
            for ev in self.optimizer.remote_evaluators:
                ev.set_global_vars.remote(self.global_vars)
            logger.debug("updated global vars: {}".format(self.global_vars))

        result = None
        for _ in range(1 + MAX_WORKER_FAILURE_RETRIES):
            try:
                result = Trainable.train(self)
            except RayError as e:
                if self.config["ignore_worker_failures"]:
                    logger.exception(
                        "Error in train call, attempting to recover")
                    self._try_recover()
                else:
                    logger.info(
                        "Worker crashed during call to train(). To attempt to "
                        "continue training without the failed worker, set "
                        "`'ignore_worker_failures': True`.")
                    raise e
            else:
                break
        if result is None:
            raise RuntimeError("Failed to recover from worker crash")

        if (self.config.get("observation_filter", "NoFilter") != "NoFilter"
                and hasattr(self, "local_evaluator")):
            FilterManager.synchronize(
                self.local_evaluator.filters,
                self.remote_evaluators,
                update_remote=self.config["synchronize_filters"])
            logger.debug("synchronized filters: {}".format(
                self.local_evaluator.filters))

        if self._has_policy_optimizer():
            result["num_healthy_workers"] = len(
                self.optimizer.remote_evaluators)
        return result
Example #19
    def testSynchronize(self):
        """Synchronize applies filter buffer onto own filter"""
        filt1 = MeanStdFilter(())
        for i in range(10):
            filt1(i)
        self.assertEqual(filt1.rs.n, 10)
        filt1.clear_buffer()
        self.assertEqual(filt1.buffer.n, 0)

        RemoteEvaluator = ray.remote(_MockEvaluator)
        remote_e = RemoteEvaluator.remote(sample_count=10)
        remote_e.sample.remote()

        FilterManager.synchronize({
            "obs_filter": filt1,
            "rew_filter": filt1.copy()
        }, [remote_e])

        filters = ray.get(remote_e.get_filters.remote())
        obs_f = filters["obs_filter"]
        self.assertEqual(filt1.rs.n, 20)
        self.assertEqual(filt1.buffer.n, 0)
        self.assertEqual(obs_f.rs.n, filt1.rs.n)
        self.assertEqual(obs_f.buffer.n, filt1.buffer.n)
Example #20
    def _train(self):
        num_pairs = math.ceil(self.candidates_per_iteration / 2)
        candidates: list = ([((0, 0), None, None)]
                            * self.num_evals_per_iteration)
        for _ in range(num_pairs):
            offset = randint(0, sys.maxsize)
            candidates.append(((offset, 1), None, None))
            candidates.append(((offset, -1), None, None))

        num_candidates = len(candidates)
        dispatch_index: int = 0

        async def manage_worker(worker):
            nonlocal candidates, dispatch_index, result
            while dispatch_index < num_candidates:
                index = dispatch_index
                dispatch_index += 1

                candidate = candidates[index][0]

                score, length = await worker.evaluate.remote(candidate)
                candidates[index] = (candidate, score, length)

        async def run_workers():
            tasks = [
                asyncio.create_task(manage_worker(worker))
                for worker in self._workers
            ]
            for task in tasks:
                await task

        print('run workers begin')
        loop = asyncio.new_event_loop()
        loop.run_until_complete(run_workers())
        print('run workers complete')
        # synchronize on evaluations completed

        theta_evals = candidates[0:self.num_evals_per_iteration]
        candidate_evals = candidates[self.num_evals_per_iteration:]

        # update all workers
        update_completions = [
            worker.update.remote(candidate_evals) for worker in self._workers
        ]

        info = self.common.model_keeper.update(candidate_evals)
        self.common.policy.set_flat_weights(self.common.model_keeper.theta)

        # synchronize on updates completed
        for completion in update_completions:
            ray.get(completion)

        print('update workers complete')

        episodes_this_iteration = len(candidate_evals)
        self.episodes_so_far += episodes_this_iteration

        # Now sync the filters
        FilterManager.synchronize(
            {DEFAULT_POLICY_ID: self.common.policy.observation_filter},
            self._workers)

        def extract_columns(evaluations):
            rewards = np.fromiter(
                (evaluation[1] for evaluation in evaluations),
                dtype=np.float32)
            lengths = np.fromiter(
                (evaluation[2] for evaluation in evaluations), dtype=int)
            return rewards, lengths

        theta_rewards, theta_lengths = extract_columns(theta_evals)
        candidate_rewards, candidate_lengths = extract_columns(candidate_evals)

        def impute(func, a, min_size=1):
            return func(a) if a.size >= min_size else math.nan

        def accumulate_distribution_stats(name, a):
            nonlocal info
            info[name + 'mean'] = impute(np.mean, a)
            info[name + 'stdev'] = impute(np.std, a, 2)
            info[name + 'min'] = impute(np.min, a)
            info[name + 'max'] = impute(np.max, a)

        accumulate_distribution_stats('candidate_reward_', candidate_rewards)
        accumulate_distribution_stats('candidate_length_', candidate_lengths)

        accumulate_distribution_stats('best_reward_', theta_rewards)
        accumulate_distribution_stats('best_length_', theta_lengths)

        info['episodes_this_iter'] = episodes_this_iteration
        info['episodes_so_far'] = self.episodes_so_far

        result = {
            'episode_reward_mean': info['best_reward_mean'],
            'episode_len_mean': info['best_length_mean'],
            'timesteps_this_iter': np.sum(candidate_lengths),
            'episode_reward_max': info['best_reward_max'],
            'episode_reward_min': info['best_reward_min'],
            'episodes_this_iter': episodes_this_iteration,
            'episodes_total': self.episodes_so_far,
            'info': info,
        }

        return result
Example #21
    def _train(self):
        config = self.config

        theta = self.theta_dict[self.curr_parent]
        #print('theta shape is {}'.format(np.array(theta).shape))
        self.policy.set_weights(theta)
        assert theta.dtype == np.float32

        # Put the current policy weights in the object store.
        theta_id = ray.put(theta)
        # Use the actors to do rollouts, note that we pass in the ID of the
        # policy weights.
        results, num_episodes, num_timesteps, self.policymax, self.rewmax = self._collect_results(
            theta_id, config["episodes_per_batch"], config["train_batch_size"])

        all_noise_indices = []
        all_training_returns = []
        all_training_lengths = []
        all_eval_returns = []
        all_eval_lengths = []
        all_training_acc_returns = []
        all_eval_acc_returns = []
        all_policy_weight = []
        all_novelty = []
        all_rew_chgs = []
        all_entro_chgs = []
        all_distances = []

        # Loop over the results.
        for result in results:
            all_eval_returns += result.eval_returns
            all_eval_lengths += result.eval_lengths
            all_eval_acc_returns += result.eval_acc_returns

            all_noise_indices += result.noise_indices
            all_training_returns += result.noisy_returns
            all_training_lengths += result.noisy_lengths
            all_training_acc_returns += result.noisy_acc_returns

            all_policy_weight += result.policy_weights

            all_novelty += result.novelty

        assert len(all_eval_returns) == len(all_eval_lengths)
        assert (len(all_noise_indices) == len(all_training_returns) ==
                len(all_training_lengths))

        self.episodes_so_far += num_episodes

        # Assemble the results.
        eval_returns = np.array(all_eval_returns)
        eval_lengths = np.array(all_eval_lengths)
        noise_indices = np.array(all_noise_indices)
        noisy_returns = np.array(all_training_returns)
        noisy_lengths = np.array(all_training_lengths)
        novelty_entropy = np.array(all_novelty)

        eval_acc_returns = np.array(all_eval_acc_returns)
        noisy_acc_returns = np.array(all_training_acc_returns)

        # Process the returns.

        # Compute and take a step.

        #print('enter1')
        population = self.population
        returns_n2 = self.returns_n2
        ret = self.ret
        population.extend(all_policy_weight)
        #returns_n2.extend(all_training_returns)

        returns_n2.extend(all_training_acc_returns)
        ret.extend(all_training_returns)
        population.extend(self.policymax)
        returns_n2.extend(self.maxrew)
        ret.extend(self.maxrew2)

        population2 = np.array(population)
        returns2_n2 = np.array(returns_n2)
        ret2 = np.array(ret)

        returns2_n2, indices = np.unique(returns2_n2, return_index=True)
        #population2=population2[indices]

        ret2, ind = np.unique(ret2, return_index=True)
        population2 = population2[ind]
        #print('enter2')
        #print('population shape is {}'.format(population2.shape))
        print('returns_n2 is {}'.format(returns_n2))
        print('ret2 is {}'.format(ret2))
        print('returns2_n2 shape is {}'.format(returns2_n2.shape))
        if population2.shape[0] >= config["population_size"]:
            if len(returns2_n2.tolist()) >= config["population_size"]:
                idx = np.argpartition(
                    returns2_n2, (-config["population_size"],
                                  -1))[-1:-config["population_size"] - 1:-1]
                #population2 = population2[idx]
                returns2_n2 = returns2_n2[idx]

            if len(ret2.tolist()) >= config["population_size"]:
                idx2 = np.argpartition(
                    ret2, (-config["population_size"],
                           -1))[-1:-config["population_size"] - 1:-1]
                ret2 = ret2[idx2]
                population2 = population2[idx2]

        #print('enter3')
        theta = population2[0][0]
        #print('shape of theta is {}'.format(theta.shape))
        self.population = population2.tolist()
        self.returns_n2 = returns2_n2.tolist()
        self.ret = ret2.tolist()

        print("returns_n2 is {}".format(self.returns_n2))
        print("ret2 is {}".format(ret2))

        g = -1000
        update_ratio = -1000

        # Set the new weights in the local copy of the policy.
        self.policy.set_weights(theta)
        self.theta_dict[self.curr_parent] = self.policy.get_weights()

        # Store the rewards
        if len(all_eval_returns) > 0:
            if self.curr_parent == 0:
                self.reward_list1.append(np.mean(eval_returns))

            if self.curr_parent == 1:
                self.reward_list2.append(np.mean(eval_returns))

            if self.curr_parent == 2:
                self.reward_list3.append(np.mean(eval_returns))

        # Now sync the filters
        FilterManager.synchronize({"default": self.policy.get_filter()},
                                  self.workers)

        info = {
            "weights_norm": np.square(theta).sum(),
            "grad_norm": np.square(g).sum(),
            "update_ratio": update_ratio,
            "episodes_this_iter": noisy_lengths.size,
            "episodes_so_far": self.episodes_so_far,
        }
        #self.iteration

        if self.curr_parent == 0:
            self.reward_mean1 = np.mean(
                self.reward_list1[-self.report_length:])
            # if len(self.reward_list1)>=self.report_length:
            #   reward_max1=np.max(self.reward_list1[-self.report_length:])
            #   reward_min1=np.min(self.reward_list1[-self.report_length:])
        if self.curr_parent == 1:
            self.reward_mean2 = np.mean(
                self.reward_list2[-self.report_length:])
            # if len(self.reward_list2)>=self.report_length:
            #   reward_max2=np.max(self.reward_list2[-self.report_length:])
            #   reward_min2=np.min(self.reward_list2[-self.report_length:])
        if self.curr_parent == 2:
            self.reward_mean3 = np.mean(
                self.reward_list3[-self.report_length:])
        # if len(self.reward_list3)>=self.report_length:
        #  reward_max3=np.max(self.reward_list3[-self.report_length:])
        #  reward_min3=np.min(self.reward_list3[-self.report_length:])
        # reward_mean_noise=np.mean(all_training_returns)
        # reward_max_noise=np.max(all_training_returns)
        # reward_min_noise=np.min(all_training_returns)
        result = dict(
            #episode_reward_min1=reward_min1,
            episode_reward_mean1=self.reward_mean1,
            #episode_reward_max1=reward_max1,
            #episode_reward_min2=reward_min2,
            episode_reward_mean2=self.reward_mean2,
            #episode_reward_max2=reward_max2,
            #episode_reward_min3=reward_min3,
            episode_reward_mean3=self.reward_mean3,
            #episode_reward_max3=reward_max3,
            # noise_reward_min=reward_min_noise,
            # noise_reward_mean=reward_mean_noise,
            # noise_reward_max=reward_max_noise,
            episode_len_mean=eval_lengths.mean(),
            timesteps_this_iter=noisy_lengths.sum(),
            info=info)
        self.curr_parent = (self.curr_parent + 1) % config["pop_size"]

        return result
Example #22
 def _train(self):
     self.optimizer.step()
     FilterManager.synchronize(
         self.local_evaluator.filters, self.remote_evaluators)
     return collect_metrics(self.local_evaluator, self.remote_evaluators)
Example #23
    def train(self):
        agents = self.remote_evaluators
        config = self.config
        model = self.local_evaluator

        if (config["num_workers"] * config["min_steps_per_task"] >
                config["timesteps_per_batch"]):
            print(
                "WARNING: num_workers * min_steps_per_task > "
                "timesteps_per_batch. This means that the output of some "
                "tasks will be wasted. Consider decreasing "
                "min_steps_per_task or increasing timesteps_per_batch.")

        while self.global_step < self.config['num_batches']:

            iter_start = time.time()
            weights = ray.put(model.get_weights())
            [a.set_weights.remote(weights) for a in agents]

            samples = collect_samples(agents, config, self.local_evaluator)


            def standardized(value):
                # Divide by the maximum of value.std() and 1e-4
                # to guard against the case where all values are equal
                return (value - value.mean()) / max(1e-4, value.std())

            samples.data["advantages"] = standardized(samples["advantages"])

            rollouts_end = time.time()
            print("Computing policy (iterations=" + str(config["num_sgd_iter"]) +
                  ", stepsize=" + str(config["sgd_stepsize"]) + "):")
            samples.shuffle()
            shuffle_end = time.time()
            # tuples_per_device = model.load_data(
            #     samples, self.iteration == 0 and config["full_trace_data_load"])
            tuples_per_device = model.load_data(
                samples, config["full_trace_data_load"])
            load_end = time.time()
            rollouts_time = rollouts_end - iter_start
            shuffle_time = shuffle_end - rollouts_end
            load_time = load_end - shuffle_end
            sgd_time = 0
            for i in range(config["num_sgd_iter"]):
                sgd_start = time.time()
                batch_index = 0
                num_batches = (
                    int(tuples_per_device) // int(model.per_device_batch_size))
                permutation = np.random.permutation(num_batches)
                while batch_index < num_batches:
                    model.run_sgd_minibatch(permutation[batch_index] * model.per_device_batch_size,
                                            self.kl_coeff, False,
                                            self.file_writer)
                    batch_index += 1
                sgd_end = time.time()
                sgd_time += sgd_end - sgd_start

            self.global_step += 1
            # if kl > 2.0 * config["kl_target"]:
            #     self.kl_coeff *= 1.5
            # elif kl < 0.5 * config["kl_target"]:
            #     self.kl_coeff *= 0.5

            FilterManager.synchronize(
                self.local_evaluator.filters, self.remote_evaluators)

            info = {
                # "kl_divergence": kl,
                # "kl_coefficient": self.kl_coeff,
                "rollouts_time": rollouts_time,
                "shuffle_time": shuffle_time,
                "load_time": load_time,
                "sgd_time": sgd_time,
                "sample_throughput": len(samples["observations"]) / sgd_time
            }
            print(info)


            if self.global_step % self.config['batches_per_save'] == 0:
                self._save()

            if self.global_step % self.config['batches_per_evaluate'] == 0:
                pl, vl, ent, kl, e_pl, e_vl, e_ent, e_kl, rew, leng = \
                    self.local_evaluator.get_evaluate_metrics()

                stats = tf.Summary(value=[
                    tf.Summary.Value(tag="reward", simple_value=rew),
                    tf.Summary.Value(tag="episode_length", simple_value=leng),
                    tf.Summary.Value(tag="policy_loss", simple_value=pl),
                    tf.Summary.Value(tag="value_loss", simple_value=vl),
                    tf.Summary.Value(tag="entropy", simple_value=ent),
                    tf.Summary.Value(tag="kl", simple_value=kl),
                    tf.Summary.Value(tag="e_policy_loss", simple_value=e_pl),
                    tf.Summary.Value(tag="e_value_loss", simple_value=e_vl),
                    tf.Summary.Value(tag="e_entropy", simple_value=e_ent),
                    tf.Summary.Value(tag="e_kl", simple_value=e_kl),
                ],)
                self.file_writer.add_summary(stats, self.global_step)
Example #24
    def _train(self):
        config = self.config
        # Here the iteration starts
        # Create the random environment configurations for each iteration
        logger.info("Creating random environment configurations")
        # ceil(config["num_rollouts"]/config["num_workers"]*2)*config["num_workers"]
        #   is the exact number of environments created per iteration, as the count
        #   is incremented by 2 (one positive and one negative perturbation) for each
        #   worker. Each positive/negative perturbation pair could share the same
        #   environment, but that would correlate the environments with the number of
        #   rollouts and leave fewer rollouts with distinct perturbations.
        random_env_config_id = create_random_env_configs.remote(
            self.extra_config["num_randomized_envs"],
            self.domain_randomization_config)
        self.random_env_config = SharedRandomEnvConfigsTable(
            ray.get(random_env_config_id))
        for worker in self.workers:
            worker.setRandomEnvConfig.remote(random_env_config_id)

        theta = self.policy.get_weights()
        assert theta.dtype == np.float32

        # Put the current policy weights in the object store.
        theta_id = ray.put(theta)
        # Use the actors to do rollouts, note that we pass in the ID of the
        # policy weights.
        results, num_episodes, num_timesteps = self._collect_results(
            theta_id, config["num_rollouts"])

        all_noise_indices = []
        all_training_returns = []
        all_training_lengths = []
        all_eval_returns = []
        all_eval_lengths = []

        # Loop over the results.
        for result in results:
            all_eval_returns += result.eval_returns
            all_eval_lengths += result.eval_lengths

            all_noise_indices += result.noise_indices
            all_training_returns += result.noisy_returns
            all_training_lengths += result.noisy_lengths

        assert len(all_eval_returns) == len(all_eval_lengths)
        assert (len(all_noise_indices) == len(all_training_returns) ==
                len(all_training_lengths))

        self.episodes_so_far += num_episodes

        # Assemble the results.
        eval_returns = np.array(all_eval_returns)
        eval_lengths = np.array(all_eval_lengths)
        noise_indices = np.array(all_noise_indices)
        noisy_returns = np.array(all_training_returns)
        noisy_lengths = np.array(all_training_lengths)

        # keep only the best returns
        # select top performing directions if rollouts_used < num_rollouts
        max_rewards = np.max(noisy_returns, axis=1)
        if self.rollouts_used > self.num_rollouts:
            self.rollouts_used = self.num_rollouts

        percentile = 100 * (1 - (self.rollouts_used / self.num_rollouts))
        idx = np.arange(max_rewards.size)[
            max_rewards >= np.percentile(max_rewards, percentile)]
        noise_idx = noise_indices[idx]
        noisy_returns = noisy_returns[idx, :]


        # Compute and take a step (i.e. a step in updating theta, not an
        # environment action).
        g, count = utils.batched_weighted_sum(
            noisy_returns[:, 0] - noisy_returns[:, 1],
            (self.noise.get(index, self.policy.num_params)
             for index in noise_idx),
            batch_size=min(500, noisy_returns[:, 0].size))
        g /= noise_idx.size
        # scale the returns by their standard deviation
        if not np.isclose(np.std(noisy_returns), 0.0):
            g /= np.std(noisy_returns)
        assert (g.shape == (self.policy.num_params, )
                and g.dtype == np.float32)
        # Compute the new weights theta.
        theta, update_ratio = self.optimizer.update(-g)
        # Set the new weights in the local copy of the policy.
        self.policy.set_weights(theta)
        # update the reward list
        if len(all_eval_returns) > 0:
            self.reward_list.append(eval_returns.mean())

        # Now sync the filters
        FilterManager.synchronize({
            DEFAULT_POLICY_ID: self.policy.get_filter()
        }, self.workers)

        info = {
            "weights_norm": np.square(theta).sum(),
            "weights_std": np.std(theta),
            "grad_norm": np.square(g).sum(),
            "update_ratio": update_ratio,
            "episodes_this_iter": noisy_lengths.size,
            "episodes_so_far": self.episodes_so_far,
        }
        result = dict(
            episode_reward_mean=np.mean(
                self.reward_list[-self.report_length:]),
            episode_len_mean=eval_lengths.mean(),
            timesteps_this_iter=noisy_lengths.sum(),
            info=info)

        return result
Example #25
File: a3c.py Project: xieydd/ray
 def _train(self):
     self.optimizer.step()
     FilterManager.synchronize(self.local_evaluator.filters,
                               self.remote_evaluators)
     res = self._fetch_metrics_from_remote_evaluators()
     return res
Example #26
    def _train(self):
        agents = self.remote_evaluators
        config = self.config
        model = self.local_evaluator

        if (config["num_workers"] * config["min_steps_per_task"] >
                config["timesteps_per_batch"]):
            print(
                "WARNING: num_workers * min_steps_per_task > "
                "timesteps_per_batch. This means that the output of some "
                "tasks will be wasted. Consider decreasing "
                "min_steps_per_task or increasing timesteps_per_batch.")

        print("===> iteration", self.iteration)

        iter_start = time.time()
        weights = ray.put(model.get_weights())
        [a.set_weights.remote(weights) for a in agents]
        samples = collect_samples(agents, config, self.local_evaluator)

        def standardized(value):
            # Divide by the maximum of value.std() and 1e-4
            # to guard against the case where all values are equal
            return (value - value.mean()) / max(1e-4, value.std())

        samples.data["advantages"] = standardized(samples["advantages"])

        rollouts_end = time.time()
        print("Computing policy (iterations=" + str(config["num_sgd_iter"]) +
              ", stepsize=" + str(config["sgd_stepsize"]) + "):")
        names = [
            "iter", "total loss", "policy loss", "vf loss", "kl", "entropy"]
        print(("{:>15}" * len(names)).format(*names))
        samples.shuffle()
        shuffle_end = time.time()
        tuples_per_device = model.load_data(
            samples, self.iteration == 0 and config["full_trace_data_load"])
        load_end = time.time()
        rollouts_time = rollouts_end - iter_start
        shuffle_time = shuffle_end - rollouts_end
        load_time = load_end - shuffle_end
        sgd_time = 0
        for i in range(config["num_sgd_iter"]):
            sgd_start = time.time()
            batch_index = 0
            num_batches = (
                int(tuples_per_device) // int(model.per_device_batch_size))
            loss, policy_loss, vf_loss, kl, entropy = [], [], [], [], []
            permutation = np.random.permutation(num_batches)
            # Prepare to drop into the debugger
            if self.iteration == config["tf_debug_iteration"]:
                model.sess = tf_debug.LocalCLIDebugWrapperSession(model.sess)
            while batch_index < num_batches:
                full_trace = (
                    i == 0 and self.iteration == 0 and
                    batch_index == config["full_trace_nth_sgd_batch"])
                batch_loss, batch_policy_loss, batch_vf_loss, batch_kl, \
                    batch_entropy = model.run_sgd_minibatch(
                        permutation[batch_index] * model.per_device_batch_size,
                        self.kl_coeff, full_trace,
                        self.file_writer)
                loss.append(batch_loss)
                policy_loss.append(batch_policy_loss)
                vf_loss.append(batch_vf_loss)
                kl.append(batch_kl)
                entropy.append(batch_entropy)
                batch_index += 1
            loss = np.mean(loss)
            policy_loss = np.mean(policy_loss)
            vf_loss = np.mean(vf_loss)
            kl = np.mean(kl)
            entropy = np.mean(entropy)
            sgd_end = time.time()
            print(
                "{:>15}{:15.5e}{:15.5e}{:15.5e}{:15.5e}{:15.5e}".format(
                    i, loss, policy_loss, vf_loss, kl, entropy))

            values = []
            if i == config["num_sgd_iter"] - 1:
                metric_prefix = "ppo/sgd/final_iter/"
                values.append(tf.Summary.Value(
                    tag=metric_prefix + "kl_coeff",
                    simple_value=self.kl_coeff))
                values.extend([
                    tf.Summary.Value(
                        tag=metric_prefix + "mean_entropy",
                        simple_value=entropy),
                    tf.Summary.Value(
                        tag=metric_prefix + "mean_loss",
                        simple_value=loss),
                    tf.Summary.Value(
                        tag=metric_prefix + "mean_kl",
                        simple_value=kl)])
                if self.file_writer:
                    sgd_stats = tf.Summary(value=values)
                    self.file_writer.add_summary(sgd_stats, self.global_step)
            self.global_step += 1
            sgd_time += sgd_end - sgd_start
        if kl > 2.0 * config["kl_target"]:
            self.kl_coeff *= 1.5
        elif kl < 0.5 * config["kl_target"]:
            self.kl_coeff *= 0.5

        info = {
            "kl_divergence": kl,
            "kl_coefficient": self.kl_coeff,
            "rollouts_time": rollouts_time,
            "shuffle_time": shuffle_time,
            "load_time": load_time,
            "sgd_time": sgd_time,
            "sample_throughput": len(samples["observations"]) / sgd_time
        }

        FilterManager.synchronize(
            self.local_evaluator.filters, self.remote_evaluators)
        res = self._fetch_metrics_from_remote_evaluators()
        res = res._replace(info=info)
        return res
Example #27
    def _train(self):
        agents = self.remote_evaluators
        config = self.config
        model = self.local_evaluator

        print("===> iteration", self.iteration)

        iter_start = time.time()
        weights = ray.put(model.get_weights())
        [a.set_weights.remote(weights) for a in agents]
        samples = collect_samples(agents, config, self.local_evaluator)

        def standardized(value):
            # Divide by the maximum of value.std() and 1e-4
            # to guard against the case where all values are equal
            return (value - value.mean()) / max(1e-4, value.std())

        samples.data["advantages"] = standardized(samples["advantages"])

        rollouts_end = time.time()
        print("Computing policy (iterations=" + str(config["num_sgd_iter"]) +
              ", stepsize=" + str(config["sgd_stepsize"]) + "):")
        names = [
            "iter", "total loss", "policy loss", "vf loss", "kl", "entropy"
        ]
        print(("{:>15}" * len(names)).format(*names))
        samples.shuffle()
        shuffle_end = time.time()
        tuples_per_device = model.load_data(
            samples, self.iteration == 0 and config["full_trace_data_load"])
        load_end = time.time()
        rollouts_time = rollouts_end - iter_start
        shuffle_time = shuffle_end - rollouts_end
        load_time = load_end - shuffle_end
        sgd_time = 0
        for i in range(config["num_sgd_iter"]):
            sgd_start = time.time()
            batch_index = 0
            num_batches = (int(tuples_per_device) //
                           int(model.per_device_batch_size))
            loss, policy_loss, vf_loss, kl, entropy = [], [], [], [], []
            permutation = np.random.permutation(num_batches)
            # Prepare to drop into the debugger
            if self.iteration == config["tf_debug_iteration"]:
                model.sess = tf_debug.LocalCLIDebugWrapperSession(model.sess)
            while batch_index < num_batches:
                full_trace = (i == 0 and self.iteration == 0 and batch_index
                              == config["full_trace_nth_sgd_batch"])
                batch_loss, batch_policy_loss, batch_vf_loss, batch_kl, \
                    batch_entropy = model.run_sgd_minibatch(
                        permutation[batch_index] * model.per_device_batch_size,
                        self.kl_coeff, full_trace,
                        self.file_writer)
                loss.append(batch_loss)
                policy_loss.append(batch_policy_loss)
                vf_loss.append(batch_vf_loss)
                kl.append(batch_kl)
                entropy.append(batch_entropy)
                batch_index += 1
            loss = np.mean(loss)
            policy_loss = np.mean(policy_loss)
            vf_loss = np.mean(vf_loss)
            kl = np.mean(kl)
            entropy = np.mean(entropy)
            sgd_end = time.time()
            print("{:>15}{:15.5e}{:15.5e}{:15.5e}{:15.5e}{:15.5e}".format(
                i, loss, policy_loss, vf_loss, kl, entropy))

            values = []
            if i == config["num_sgd_iter"] - 1:
                metric_prefix = "ppo/sgd/final_iter/"
                values.append(
                    tf.Summary.Value(tag=metric_prefix + "kl_coeff",
                                     simple_value=self.kl_coeff))
                values.extend([
                    tf.Summary.Value(tag=metric_prefix + "mean_entropy",
                                     simple_value=entropy),
                    tf.Summary.Value(tag=metric_prefix + "mean_loss",
                                     simple_value=loss),
                    tf.Summary.Value(tag=metric_prefix + "mean_kl",
                                     simple_value=kl)
                ])
                if self.file_writer:
                    sgd_stats = tf.Summary(value=values)
                    self.file_writer.add_summary(sgd_stats, self.global_step)
            self.global_step += 1
            sgd_time += sgd_end - sgd_start
        if kl > 2.0 * config["kl_target"]:
            self.kl_coeff *= 1.5
        elif kl < 0.5 * config["kl_target"]:
            self.kl_coeff *= 0.5

        info = {
            "kl_divergence": kl,
            "kl_coefficient": self.kl_coeff,
            "rollouts_time": rollouts_time,
            "shuffle_time": shuffle_time,
            "load_time": load_time,
            "sgd_time": sgd_time,
            "sample_throughput": len(samples["observations"]) / sgd_time
        }

        FilterManager.synchronize(self.local_evaluator.filters,
                                  self.remote_evaluators)
        res = self._fetch_metrics_from_remote_evaluators()
        res = res._replace(info=info)

        return res
Example #28
    def _train(self):
        config = self.config

        theta = self.policy.get_weights()
        assert theta.dtype == np.float32

        # Put the current policy weights in the object store.
        theta_id = ray.put(theta)
        # Use the actors to do rollouts, note that we pass in the ID of the
        # policy weights.
        results, num_episodes, num_timesteps = self._collect_results(
            theta_id, config["num_rollouts"])

        all_noise_indices = []
        all_training_returns = []
        all_training_lengths = []
        all_eval_returns = []
        all_eval_lengths = []

        # Loop over the results.
        for result in results:
            all_eval_returns += result.eval_returns
            all_eval_lengths += result.eval_lengths

            all_noise_indices += result.noise_indices
            all_training_returns += result.noisy_returns
            all_training_lengths += result.noisy_lengths

        assert len(all_eval_returns) == len(all_eval_lengths)
        assert (len(all_noise_indices) == len(all_training_returns) ==
                len(all_training_lengths))

        self.episodes_so_far += num_episodes

        # Assemble the results.
        eval_returns = np.array(all_eval_returns)
        eval_lengths = np.array(all_eval_lengths)
        noise_indices = np.array(all_noise_indices)
        noisy_returns = np.array(all_training_returns)
        noisy_lengths = np.array(all_training_lengths)

        # keep only the best returns
        # select top performing directions if rollouts_used < num_rollouts
        max_rewards = np.max(noisy_returns, axis=1)
        if self.rollouts_used > self.num_rollouts:
            self.rollouts_used = self.num_rollouts

        percentile = 100 * (1 - (self.rollouts_used / self.num_rollouts))
        idx = np.arange(max_rewards.size)[
            max_rewards >= np.percentile(max_rewards, percentile)]
        noise_idx = noise_indices[idx]
        noisy_returns = noisy_returns[idx, :]

        # Compute and take a step.
        g, count = utils.batched_weighted_sum(
            noisy_returns[:, 0] - noisy_returns[:, 1],
            (self.noise.get(index, self.policy.num_params)
             for index in noise_idx),
            batch_size=min(500, noisy_returns[:, 0].size))
        g /= noise_idx.size
        # scale the returns by their standard deviation
        if not np.isclose(np.std(noisy_returns), 0.0):
            g /= np.std(noisy_returns)
        assert (g.shape == (self.policy.num_params, )
                and g.dtype == np.float32)
        # Compute the new weights theta.
        theta, update_ratio = self.optimizer.update(-g)
        # Set the new weights in the local copy of the policy.
        self.policy.set_weights(theta)
        # update the reward list
        if len(all_eval_returns) > 0:
            self.reward_list.append(eval_returns.mean())

        # Now sync the filters
        FilterManager.synchronize({
            "default": self.policy.get_filter()
        }, self.workers)

        info = {
            "weights_norm": np.square(theta).sum(),
            "weights_std": np.std(theta),
            "grad_norm": np.square(g).sum(),
            "update_ratio": update_ratio,
            "episodes_this_iter": noisy_lengths.size,
            "episodes_so_far": self.episodes_so_far,
        }
        result = dict(
            episode_reward_mean=np.mean(
                self.reward_list[-self.report_length:]),
            episode_len_mean=eval_lengths.mean(),
            timesteps_this_iter=noisy_lengths.sum(),
            info=info)

        return result
Example #29
    def _train(self):
        config = self.config

        theta = self.policy.get_weights()
        assert theta.dtype == np.float32

        # Put the current policy weights in the object store.
        theta_id = ray.put(theta)
        # Use the actors to do rollouts, note that we pass in the ID of the
        # policy weights.
        results, num_episodes, num_timesteps = self._collect_results(
            theta_id, config["episodes_per_batch"], config["train_batch_size"])

        all_noise_indices = []
        all_training_returns = []
        all_training_lengths = []
        all_eval_returns = []
        all_eval_lengths = []

        # Loop over the results.
        for result in results:
            all_eval_returns += result.eval_returns
            all_eval_lengths += result.eval_lengths

            all_noise_indices += result.noise_indices
            all_training_returns += result.noisy_returns
            all_training_lengths += result.noisy_lengths

        assert len(all_eval_returns) == len(all_eval_lengths)
        assert (len(all_noise_indices) == len(all_training_returns) ==
                len(all_training_lengths))

        self.episodes_so_far += num_episodes

        # Assemble the results.
        eval_returns = np.array(all_eval_returns)
        eval_lengths = np.array(all_eval_lengths)
        noise_indices = np.array(all_noise_indices)
        noisy_returns = np.array(all_training_returns)
        noisy_lengths = np.array(all_training_lengths)

        # Process the returns.
        if config["return_proc_mode"] == "centered_rank":
            proc_noisy_returns = utils.compute_centered_ranks(noisy_returns)
        else:
            raise NotImplementedError(config["return_proc_mode"])

        # Compute and take a step.
        g, count = utils.batched_weighted_sum(
            proc_noisy_returns[:, 0] - proc_noisy_returns[:, 1],
            (self.noise.get(index, self.policy.num_params)
             for index in noise_indices),
            batch_size=500)
        g /= noisy_returns.size
        assert (g.shape == (self.policy.num_params, ) and g.dtype == np.float32
                and count == len(noise_indices))
        # Compute the new weights theta.
        theta, update_ratio = self.optimizer.update(-g +
                                                    config["l2_coeff"] * theta)
        # Set the new weights in the local copy of the policy.
        self.policy.set_weights(theta)
        # Store the rewards
        if len(all_eval_returns) > 0:
            self.reward_list.append(np.mean(eval_returns))

        # Now sync the filters
        FilterManager.synchronize({
            "default": self.policy.get_filter()
        }, self.workers)

        info = {
            "weights_norm": np.square(theta).sum(),
            "grad_norm": np.square(g).sum(),
            "update_ratio": update_ratio,
            "episodes_this_iter": noisy_lengths.size,
            "episodes_so_far": self.episodes_so_far,
        }

        reward_mean = np.mean(self.reward_list[-self.report_length:])
        result = dict(
            episode_reward_mean=reward_mean,
            episode_len_mean=eval_lengths.mean(),
            timesteps_this_iter=noisy_lengths.sum(),
            info=info)

        return result
Example #30
    def _train(self):
        optimizer: CoordinatedOptimizer = self.common.optimizer
        candidates = optimizer.ask()
        best_message = optimizer.best()
        for i in range(self.num_evals_per_iteration):
            candidates.append(best_message)

        num_candidates = len(candidates)
        dispatch_index: int = num_candidates - 1

        async def manage_worker(worker):
            nonlocal candidates, dispatch_index, result
            while dispatch_index >= 0:
                index = dispatch_index
                dispatch_index -= 1
                candidate = candidates[index]

                score, length = await worker.evaluate.remote(candidate)
                candidates[index] = (score, candidate, length)

        async def run_workers():
            workers = self._workers
            num_workers = len(workers)
            tasks = [
                asyncio.create_task(manage_worker(workers[i % num_workers]))
                for i in range(self.request_interleaving * num_workers)
            ]
            for task in tasks:  # synchronize on evaluations completed
                await task

        loop = asyncio.new_event_loop()
        loop.run_until_complete(run_workers())

        theta_evals = candidates[-self.num_evals_per_iteration:]
        candidate_evals = candidates[0:-self.num_evals_per_iteration]

        # update all workers
        update_completions = [
            worker.tell.remote(candidate_evals) for worker in self._workers
        ]
        optimizer.tell_messages(candidate_evals)

        self.common.policy.set_flat_weights(optimizer.best_candidate())

        # synchronize on updates completed
        for completion in update_completions:
            ray.get(completion)

        episodes_this_iteration = len(candidate_evals)
        self.episodes_so_far += episodes_this_iteration

        # Now sync the filters
        FilterManager.synchronize(
            {DEFAULT_POLICY_ID: self.common.policy.observation_filter},
            self._workers)
        # FilterManager.synchronize({
        #     DEFAULT_POLICY_ID: self.common.policy.get_filter()
        #     }, self._workers)

        info = optimizer.status()

        def extract_columns(evaluations):
            rewards = np.fromiter(
                (evaluation[0] for evaluation in evaluations),
                dtype=np.float32)
            lengths = np.fromiter(
                (evaluation[2] for evaluation in evaluations), dtype=np.int64)
            return rewards, lengths

        theta_rewards, theta_lengths = extract_columns(theta_evals)
        candidate_rewards, candidate_lengths = extract_columns(candidate_evals)

        def impute(func, a, min_size=1):
            return func(a) if a.size >= min_size else math.nan

        def accumulate_distribution_stats(name, a):
            nonlocal info
            info[name + 'mean'] = impute(np.mean, a)
            info[name + 'stdev'] = impute(np.std, a, 2)
            info[name + 'min'] = impute(np.min, a)
            info[name + 'max'] = impute(np.max, a)

        accumulate_distribution_stats('candidate_reward_', candidate_rewards)
        accumulate_distribution_stats('candidate_length_', candidate_lengths)

        accumulate_distribution_stats('best_reward_', theta_rewards)
        accumulate_distribution_stats('best_length_', theta_lengths)

        info['episodes_this_iter'] = episodes_this_iteration
        info['episodes_so_far'] = self.episodes_so_far

        result = {
            'episode_reward_mean': info['best_reward_mean'],
            'episode_len_mean': info['best_length_mean'],
            'timesteps_this_iter': np.sum(candidate_lengths),
            'episode_reward_max': info['best_reward_max'],
            'episode_reward_min': info['best_reward_min'],
            'episodes_this_iter': episodes_this_iteration,
            'episodes_total': self.episodes_so_far,
            'info': info,
        }

        return result
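The asyncio section of this example fans the candidate evaluations out over a fixed pool of workers, keeps several requests in flight per worker, and writes each (score, candidate, length) result back into the shared candidates list. The sketch below reproduces just that dispatch pattern in a self-contained form; FakeWorker and the toy candidates are invented for illustration and stand in for the Ray actors used above.

import asyncio
import random

class FakeWorker:
    # Stand-in for a remote evaluation actor: scores a candidate after a delay.
    async def evaluate(self, candidate):
        await asyncio.sleep(random.random() * 0.01)
        return sum(candidate), len(candidate)

async def evaluate_all(candidates, workers, interleaving=2):
    dispatch_index = len(candidates) - 1

    async def manage_worker(worker):
        # Pop indices off the shared counter until every candidate is taken.
        nonlocal dispatch_index
        while dispatch_index >= 0:
            index = dispatch_index
            dispatch_index -= 1
            score, length = await worker.evaluate(candidates[index])
            candidates[index] = (score, candidates[index], length)

    # Several tasks per worker keep each worker busy while results come back.
    tasks = [
        asyncio.create_task(manage_worker(workers[i % len(workers)]))
        for i in range(interleaving * len(workers))
    ]
    for task in tasks:
        await task
    return candidates

candidates = [[random.random() for _ in range(3)] for _ in range(10)]
results = asyncio.run(evaluate_all(candidates, [FakeWorker() for _ in range(4)]))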
Example #31
    def step(self):
        config = self.config

        theta = self.policy.get_flat_weights()
        assert theta.dtype == np.float32
        assert len(theta.shape) == 1

        # Put the current policy weights in the object store.
        theta_id = ray.put(theta)
        # Use the actors to do rollouts, note that we pass in the ID of the
        # policy weights.
        results, num_episodes, num_timesteps = self._collect_results(
            theta_id, config["num_rollouts"])

        all_noise_indices = []
        all_training_returns = []
        all_training_lengths = []
        all_eval_returns = []
        all_eval_lengths = []

        # Loop over the results.
        for result in results:
            all_eval_returns += result.eval_returns
            all_eval_lengths += result.eval_lengths

            all_noise_indices += result.noise_indices
            all_training_returns += result.noisy_returns
            all_training_lengths += result.noisy_lengths

        assert len(all_eval_returns) == len(all_eval_lengths)
        assert (len(all_noise_indices) == len(all_training_returns) ==
                len(all_training_lengths))

        self.episodes_so_far += num_episodes

        # Assemble the results.
        eval_returns = np.array(all_eval_returns)
        eval_lengths = np.array(all_eval_lengths)
        noise_indices = np.array(all_noise_indices)
        noisy_returns = np.array(all_training_returns)
        noisy_lengths = np.array(all_training_lengths)

        # keep only the best returns
        # select top performing directions if rollouts_used < num_rollouts
        max_rewards = np.max(noisy_returns, axis=1)
        if self.rollouts_used > self.num_rollouts:
            self.rollouts_used = self.num_rollouts

        percentile = 100 * (1 - (self.rollouts_used / self.num_rollouts))
        idx = np.arange(max_rewards.size)[
            max_rewards >= np.percentile(max_rewards, percentile)]
        noise_idx = noise_indices[idx]
        noisy_returns = noisy_returns[idx, :]

        # Compute and take a step.
        g, count = utils.batched_weighted_sum(
            noisy_returns[:, 0] - noisy_returns[:, 1],
            (self.noise.get(index, self.policy.num_params)
             for index in noise_idx),
            batch_size=min(500, noisy_returns[:, 0].size))
        g /= noise_idx.size
        # scale the returns by their standard deviation
        if not np.isclose(np.std(noisy_returns), 0.0):
            g /= np.std(noisy_returns)
        assert (g.shape == (self.policy.num_params, )
                and g.dtype == np.float32)
        # Compute the new weights theta.
        theta, update_ratio = self.optimizer.update(-g)
        # Set the new weights in the local copy of the policy.
        self.policy.set_flat_weights(theta)
        # update the reward list
        if len(all_eval_returns) > 0:
            self.reward_list.append(eval_returns.mean())

        # Now sync the filters
        FilterManager.synchronize(
            {DEFAULT_POLICY_ID: self.policy.observation_filter}, self.workers)

        info = {
            "weights_norm": np.square(theta).sum(),
            "weights_std": np.std(theta),
            "grad_norm": np.square(g).sum(),
            "update_ratio": update_ratio,
            "episodes_this_iter": noisy_lengths.size,
            "episodes_so_far": self.episodes_so_far,
        }
        reward_mean = np.mean(self.reward_list[-self.report_length:])
        result = dict(
            episode_reward_mean=reward_mean,
            episode_len_mean=eval_lengths.mean(),
            timesteps_this_iter=noisy_lengths.sum(),
            info=info)

        return result
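The selection step above keeps only the perturbation directions whose better of the two antithetic returns lands in the top rollouts_used / num_rollouts fraction, and then scales the update by the standard deviation of the surviving returns, in the style of ARS. A small NumPy sketch of just that filtering and scaling, on made-up data, follows; dividing the per-direction weights by the standard deviation here is equivalent to dividing the summed gradient, as the snippet does.

import numpy as np

rng = np.random.default_rng(1)
num_rollouts, rollouts_used = 8, 4
noisy_returns = rng.normal(size=(num_rollouts, 2))  # [positive, negative] returns
noise_indices = np.arange(num_rollouts)

# Keep only directions whose best return is in the top rollouts_used fraction.
max_rewards = np.max(noisy_returns, axis=1)
percentile = 100 * (1 - rollouts_used / num_rollouts)
keep = max_rewards >= np.percentile(max_rewards, percentile)
top_returns = noisy_returns[keep, :]
top_indices = noise_indices[keep]

# Antithetic return differences, scaled by the std of the surviving returns.
weights = top_returns[:, 0] - top_returns[:, 1]
if not np.isclose(np.std(top_returns), 0.0):
    weights = weights / np.std(top_returns)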
Example #32
    def step_attempt(self):
        config = self.config

        theta = self.policy.get_flat_weights()
        assert theta.dtype == np.float32
        assert len(theta.shape) == 1

        # Put the current policy weights in the object store.
        theta_id = ray.put(theta)
        # Use the actors to do rollouts, note that we pass in the ID of the
        # policy weights.
        results, num_episodes, num_timesteps = self._collect_results(
            theta_id, config["episodes_per_batch"], config["train_batch_size"])

        all_noise_indices = []
        all_training_returns = []
        all_training_lengths = []
        all_eval_returns = []
        all_eval_lengths = []

        # Loop over the results.
        for result in results:
            all_eval_returns += result.eval_returns
            all_eval_lengths += result.eval_lengths

            all_noise_indices += result.noise_indices
            all_training_returns += result.noisy_returns
            all_training_lengths += result.noisy_lengths

        assert len(all_eval_returns) == len(all_eval_lengths)
        assert (len(all_noise_indices) == len(all_training_returns) ==
                len(all_training_lengths))

        self.episodes_so_far += num_episodes

        # Assemble the results.
        eval_returns = np.array(all_eval_returns)
        eval_lengths = np.array(all_eval_lengths)
        noise_indices = np.array(all_noise_indices)
        noisy_returns = np.array(all_training_returns)
        noisy_lengths = np.array(all_training_lengths)

        # Process the returns.
        if config["return_proc_mode"] == "centered_rank":
            proc_noisy_returns = utils.compute_centered_ranks(noisy_returns)
        else:
            raise NotImplementedError(config["return_proc_mode"])

        # Compute and take a step.
        g, count = utils.batched_weighted_sum(
            proc_noisy_returns[:, 0] - proc_noisy_returns[:, 1],
            (self.noise.get(index, self.policy.num_params)
             for index in noise_indices),
            batch_size=500)
        g /= noisy_returns.size
        assert (g.shape == (self.policy.num_params, ) and g.dtype == np.float32
                and count == len(noise_indices))
        # Compute the new weights theta.
        theta, update_ratio = self.optimizer.update(-g +
                                                    config["l2_coeff"] * theta)
        # Set the new weights in the local copy of the policy.
        self.policy.set_flat_weights(theta)
        # Store the rewards
        if len(all_eval_returns) > 0:
            self.reward_list.append(np.mean(eval_returns))

        # Now sync the filters
        FilterManager.synchronize(
            {DEFAULT_POLICY_ID: self.policy.observation_filter}, self._workers)

        info = {
            "weights_norm": np.square(theta).sum(),
            "grad_norm": np.square(g).sum(),
            "update_ratio": update_ratio,
            "episodes_this_iter": noisy_lengths.size,
            "episodes_so_far": self.episodes_so_far,
        }

        reward_mean = np.mean(self.reward_list[-self.report_length:])
        result = dict(episode_reward_mean=reward_mean,
                      episode_len_mean=eval_lengths.mean(),
                      timesteps_this_iter=noisy_lengths.sum(),
                      info=info)

        return result
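In the "centered_rank" return-processing mode, the raw returns are replaced by their ranks rescaled to the interval [-0.5, 0.5] before the weighted noise sum, which makes the update invariant to the scale of the rewards. The function below shows one common way such a centered-rank transform is written; it is an illustration of the idea and not necessarily identical to the utils.compute_centered_ranks called above.

import numpy as np

def centered_ranks_sketch(returns):
    # Rank all entries of the return array jointly, then rescale the ranks
    # linearly so they span [-0.5, 0.5] with zero mean.
    flat = returns.ravel()
    ranks = np.empty(flat.size, dtype=np.float32)
    ranks[flat.argsort()] = np.arange(flat.size, dtype=np.float32)
    return ranks.reshape(returns.shape) / (returns.size - 1) - 0.5

noisy_returns = np.array([[3.0, -1.0], [10.0, 2.0], [0.5, 7.5]])
print(centered_ranks_sketch(noisy_returns))
# -> roughly [[0.1, -0.5], [0.5, -0.1], [-0.3, 0.3]] (up to float precision)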
Example #33
    def _train(self):
        self.optimizer.step()
        FilterManager.synchronize(
            self.local_evaluator.filters, self.remote_evaluators)
        res = self._fetch_metrics_from_remote_evaluators()
        return res
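All of these snippets end by calling FilterManager.synchronize so that the observation statistics gathered on the remote workers are folded into the local filter and the merged result is pushed back out. The toy code below sketches the general shape of that round trip for a much simpler running-sum "filter"; RunningFilterSketch and synchronize_sketch are invented for illustration and are not RLlib's filter or FilterManager classes.

class RunningFilterSketch:
    # Toy running-sum "filter": tracks a global total plus a delta buffer of
    # observations seen since the last synchronization.
    def __init__(self):
        self.count, self.total = 0, 0.0
        self.delta_count, self.delta_total = 0, 0.0

    def observe(self, x):
        self.count += 1
        self.total += x
        self.delta_count += 1
        self.delta_total += x

    def pop_delta(self):
        delta = (self.delta_count, self.delta_total)
        self.delta_count, self.delta_total = 0, 0.0
        return delta

    def apply_delta(self, delta):
        self.count += delta[0]
        self.total += delta[1]

def synchronize_sketch(local_filter, remote_filters):
    # Merge every remote's un-synced observations into the local filter,
    # then push the merged totals back so all copies agree.
    for remote in remote_filters:
        local_filter.apply_delta(remote.pop_delta())
    for remote in remote_filters:
        remote.count, remote.total = local_filter.count, local_filter.total

local_filter = RunningFilterSketch()
remote_filters = [RunningFilterSketch() for _ in range(2)]
for value, remote in zip([1.0, 2.0], remote_filters):
    remote.observe(value)
synchronize_sketch(local_filter, remote_filters)
assert local_filter.count == 2 and all(r.count == 2 for r in remote_filters)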