def train(self): """Overrides super.train to synchronize global vars.""" if hasattr(self, "optimizer") and isinstance(self.optimizer, PolicyOptimizer): self.global_vars["timestep"] = self.optimizer.num_steps_sampled self.optimizer.local_evaluator.set_global_vars(self.global_vars) for ev in self.optimizer.remote_evaluators: ev.set_global_vars.remote(self.global_vars) logger.debug("updated global vars: {}".format(self.global_vars)) if (self.config.get("observation_filter", "NoFilter") != "NoFilter" and hasattr(self, "local_evaluator")): FilterManager.synchronize( self.local_evaluator.filters, self.remote_evaluators, update_remote=self.config["synchronize_filters"]) logger.debug("synchronized filters: {}".format( self.local_evaluator.filters)) result = Trainable.train(self) if self.config["callbacks"].get("on_train_result"): self.config["callbacks"]["on_train_result"]({ "agent": self, "result": result, }) return result
def _train(self):
    def postprocess_samples(batch):
        # Divide by the maximum of value.std() and 1e-4
        # to guard against the case where all values are equal.
        value = batch["advantages"]
        standardized = (value - value.mean()) / max(1e-4, value.std())
        batch.data["advantages"] = standardized
        batch.shuffle()
        dummy = np.zeros_like(batch["advantages"])
        if not self.config["use_gae"]:
            batch.data["value_targets"] = dummy
            batch.data["vf_preds"] = dummy

    extra_fetches = self.optimizer.step(postprocess_fn=postprocess_samples)
    kl = np.array(extra_fetches["kl"]).mean(axis=1)[-1]
    total_loss = np.array(extra_fetches["total_loss"]).mean(axis=1)[-1]
    policy_loss = np.array(extra_fetches["policy_loss"]).mean(axis=1)[-1]
    vf_loss = np.array(extra_fetches["vf_loss"]).mean(axis=1)[-1]
    entropy = np.array(extra_fetches["entropy"]).mean(axis=1)[-1]
    newkl = self.local_evaluator.for_policy(lambda pi: pi.update_kl(kl))

    info = {
        "kl_divergence": kl,
        "kl_coefficient": newkl,
        "total_loss": total_loss,
        "policy_loss": policy_loss,
        "vf_loss": vf_loss,
        "entropy": entropy,
    }

    FilterManager.synchronize(self.local_evaluator.filters,
                              self.remote_evaluators)
    res = collect_metrics(self.local_evaluator, self.remote_evaluators)
    res = res._replace(info=info)
    return res
def __setstate__(self, state):
    self.episodes_so_far = state["episodes_so_far"]
    self.common.policy.set_flat_weights(state["weights"])
    self.common.policy.observation_filter = state["filter"]
    FilterManager.synchronize(
        {DEFAULT_POLICY_ID: self.common.policy.observation_filter},
        self._workers)
def __setstate__(self, state): self.episodes_so_far = state["episodes_so_far"] self.policy.set_weights(state["weights"]) self.policy.set_filter(state["filter"]) FilterManager.synchronize({ "default": self.policy.get_filter() }, self.workers)
def __setstate__(self, state):
    self.episodes_so_far = state["episodes_so_far"]
    self.policy.set_weights(state["weights"])
    self.policy.set_filter(state["filter"])
    FilterManager.synchronize({
        DEFAULT_POLICY_ID: self.policy.get_filter()
    }, self._workers)
def _train(self):
    def postprocess_samples(batch):
        # Divide by the maximum of value.std() and 1e-4
        # to guard against the case where all values are equal.
        value = batch["advantages"]
        standardized = (value - value.mean()) / max(1e-4, value.std())
        batch.data["advantages"] = standardized
        batch.shuffle()
        dummy = np.zeros_like(batch["advantages"])
        if not self.config["use_gae"]:
            batch.data["value_targets"] = dummy
            batch.data["vf_preds"] = dummy

    extra_fetches = self.optimizer.step(postprocess_fn=postprocess_samples)
    final_metrics = np.array(extra_fetches).mean(axis=1)[-1, :].tolist()
    total_loss, policy_loss, vf_loss, kl, entropy = final_metrics
    self.local_evaluator.update_kl(kl)

    info = {
        "total_loss": total_loss,
        "policy_loss": policy_loss,
        "vf_loss": vf_loss,
        "kl_divergence": kl,
        "entropy": entropy,
        "kl_coefficient": self.local_evaluator.kl_coeff_val,
    }

    FilterManager.synchronize(self.local_evaluator.filters,
                              self.remote_evaluators)
    res = self._fetch_metrics_from_remote_evaluators()
    res = res._replace(info=info)
    return res
def _train(self):
    self.optimizer.step()
    FilterManager.synchronize(self.local_evaluator.filters,
                              self.remote_evaluators)
    result = collect_metrics(self.local_evaluator, self.remote_evaluators)
    result = result._replace(info=self.optimizer.stats())
    return result
def test_synchronize(self):
    """Synchronize applies filter buffer onto own filter."""
    filt1 = MeanStdFilter(())
    for i in range(10):
        filt1(i)
    self.assertEqual(filt1.rs.n, 10)
    filt1.clear_buffer()
    self.assertEqual(filt1.buffer.n, 0)

    RemoteWorker = ray.remote(_MockWorker)
    remote_e = RemoteWorker.remote(sample_count=10)
    remote_e.sample.remote()

    FilterManager.synchronize({
        "obs_filter": filt1,
        "rew_filter": filt1.copy()
    }, [remote_e])

    filters = ray.get(remote_e.get_filters.remote())
    obs_f = filters["obs_filter"]
    self.assertEqual(filt1.rs.n, 20)
    self.assertEqual(filt1.buffer.n, 0)
    self.assertEqual(obs_f.rs.n, filt1.rs.n)
    self.assertEqual(obs_f.buffer.n, filt1.buffer.n)
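# The test above pins down the contract of FilterManager.synchronize: the
# statistics buffered on each remote worker are folded into the driver's
# filter, the driver's own buffer stays empty, and the remote filters end up
# holding the same running statistics. The self-contained sketch below
# illustrates that flow with a toy counting filter and plain Python objects
# standing in for Ray actors; every class and function name here is
# illustrative and not part of RLlib's API.

class ToyFilter:
    """Counts observations; `buffer` holds counts not yet merged upstream."""

    def __init__(self):
        self.n = 0        # total observations folded into this filter
        self.buffer = 0   # observations seen since the last synchronize

    def __call__(self, x):
        self.n += 1
        self.buffer += 1

    def apply_changes(self, other, with_buffer=False):
        # Fold another filter's buffered counts into our totals.
        self.n += other.buffer
        if with_buffer:
            self.buffer += other.buffer

    def clear_buffer(self):
        self.buffer = 0

    def copy(self):
        new = ToyFilter()
        new.n, new.buffer = self.n, self.buffer
        return new


class ToyWorker:
    """Stands in for a remote evaluator: samples, reports and accepts filters."""

    def __init__(self, sample_count):
        self.filters = {"obs_filter": ToyFilter()}
        self.sample_count = sample_count

    def sample(self):
        for _ in range(self.sample_count):
            self.filters["obs_filter"](0.0)

    def get_filters(self, flush_after=False):
        snapshot = {k: f.copy() for k, f in self.filters.items()}
        if flush_after:
            for f in self.filters.values():
                f.clear_buffer()
        return snapshot

    def sync_filters(self, new_filters):
        self.filters = {k: f.copy() for k, f in new_filters.items()}


def toy_synchronize(local_filters, workers):
    """Merge each worker's buffered stats into the local filters, then push
    the merged filters back so every worker holds identical state."""
    for worker in workers:
        remote = worker.get_filters(flush_after=True)
        for key, local_filter in local_filters.items():
            local_filter.apply_changes(remote[key], with_buffer=False)
    for worker in workers:
        worker.sync_filters(local_filters)


if __name__ == "__main__":
    driver_filter = ToyFilter()
    for i in range(10):
        driver_filter(i)
    driver_filter.clear_buffer()

    worker = ToyWorker(sample_count=10)
    worker.sample()

    toy_synchronize({"obs_filter": driver_filter}, [worker])
    assert driver_filter.n == 20 and driver_filter.buffer == 0
    assert worker.filters["obs_filter"].n == 20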
def _sync_filters_if_needed(self, workers):
    if self.config.get("observation_filter", "NoFilter") != "NoFilter":
        FilterManager.synchronize(
            workers.local_worker().filters,
            workers.remote_workers(),
            update_remote=self.config["synchronize_filters"])
        logger.debug("synchronized filters: {}".format(
            workers.local_worker().filters))
def _train(self):
    prev_steps = self.optimizer.num_steps_sampled
    self.optimizer.step()
    FilterManager.synchronize(self.local_evaluator.filters,
                              self.remote_evaluators)
    result = self.optimizer.collect_metrics()
    result.update(
        timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps)
    return result
def train(self): """Overrides super.train to synchronize global vars.""" if self._has_policy_optimizer(): self.global_vars["timestep"] = self.optimizer.num_steps_sampled self.optimizer.workers.local_worker().set_global_vars( self.global_vars) for w in self.optimizer.workers.remote_workers(): w.set_global_vars.remote(self.global_vars) logger.debug("updated global vars: {}".format(self.global_vars)) result = None for _ in range(1 + MAX_WORKER_FAILURE_RETRIES): try: result = Trainable.train(self) except RayError as e: if self.config["ignore_worker_failures"]: logger.exception( "Error in train call, attempting to recover") self._try_recover() else: logger.info( "Worker crashed during call to train(). To attempt to " "continue training without the failed worker, set " "`'ignore_worker_failures': True`.") raise e except Exception as e: time.sleep(0.5) # allow logs messages to propagate raise e else: break if result is None: raise RuntimeError("Failed to recover from worker crash") if (self.config.get("observation_filter", "NoFilter") != "NoFilter" and hasattr(self, "workers") and isinstance(self.workers, WorkerSet)): FilterManager.synchronize( self.workers.local_worker().filters, self.workers.remote_workers(), update_remote=self.config["synchronize_filters"]) logger.debug("synchronized filters: {}".format( self.workers.local_worker().filters)) if self._has_policy_optimizer(): result["num_healthy_workers"] = len( self.optimizer.workers.remote_workers()) if self.config["evaluation_interval"]: if self._iteration % self.config["evaluation_interval"] == 0: evaluation_metrics = self._evaluate() assert isinstance(evaluation_metrics, dict), \ "_evaluate() needs to return a dict." result.update(evaluation_metrics) return result
def _train(self):
    prev_steps = self.optimizer.num_steps_sampled
    start = time.time()
    while time.time() - start < self.config["min_iter_time_s"]:
        self.optimizer.step()
    FilterManager.synchronize(self.local_evaluator.filters,
                              self.remote_evaluators)
    result = self.optimizer.collect_metrics()
    result.update(
        timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps)
    return result
def _train(self):
    prev_steps = self.optimizer.num_steps_sampled
    fetches = self.optimizer.step()
    self.local_evaluator.for_policy(lambda pi: pi.update_kl(fetches["kl"]))
    FilterManager.synchronize(self.local_evaluator.filters,
                              self.remote_evaluators)
    res = self.optimizer.collect_metrics()
    res = res._replace(
        timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps,
        info=dict(fetches, **res.info))
    return res
def _train(self):
    prev_steps = self.optimizer.num_steps_sampled
    fetches = self.optimizer.step()
    if "kl" in fetches:
        # single-agent
        self.local_evaluator.for_policy(
            lambda pi: pi.update_kl(fetches["kl"]))
    else:
        # multi-agent
        self.local_evaluator.foreach_trainable_policy(
            lambda pi, pi_id: pi.update_kl(fetches[pi_id]["kl"]))
    FilterManager.synchronize(self.local_evaluator.filters,
                              self.remote_evaluators)
    res = self.optimizer.collect_metrics()
    res = res._replace(
        timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps,
        info=dict(fetches, **res.info))
    return res
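# The PPO snippets above delegate the KL bookkeeping to pi.update_kl(kl). The
# underlying adaptive-penalty rule is spelled out in the older _train loops
# later in this listing: grow the coefficient when the measured KL overshoots
# the target, shrink it when it undershoots. A minimal standalone sketch of
# that rule follows; the function name and signature are illustrative, not
# RLlib's API.

def update_kl_coeff(kl_coeff, sampled_kl, kl_target):
    """Return the adjusted KL penalty coefficient."""
    if sampled_kl > 2.0 * kl_target:
        kl_coeff *= 1.5  # policy moved too far from the old policy
    elif sampled_kl < 0.5 * kl_target:
        kl_coeff *= 0.5  # policy barely moved; relax the penalty
    return kl_coeff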
def train(self):
    """Overrides super.train to synchronize global vars."""

    if hasattr(self, "optimizer") and isinstance(self.optimizer,
                                                 PolicyOptimizer):
        self.global_vars["timestep"] = self.optimizer.num_steps_sampled
        self.optimizer.local_evaluator.set_global_vars(self.global_vars)
        for ev in self.optimizer.remote_evaluators:
            ev.set_global_vars.remote(self.global_vars)

    if (self.config.get("observation_filter", "NoFilter") != "NoFilter"
            and hasattr(self, "local_evaluator")):
        FilterManager.synchronize(
            self.local_evaluator.filters,
            self.remote_evaluators,
            update_remote=self.config["synchronize_filters"])

    return Trainable.train(self)
def train(self):
    """Overrides super.train to synchronize global vars."""

    if self._has_policy_optimizer():
        self.global_vars["timestep"] = self.optimizer.num_steps_sampled
        self.optimizer.local_evaluator.set_global_vars(self.global_vars)
        for ev in self.optimizer.remote_evaluators:
            ev.set_global_vars.remote(self.global_vars)
        logger.debug("updated global vars: {}".format(self.global_vars))

    result = None
    for _ in range(1 + MAX_WORKER_FAILURE_RETRIES):
        try:
            result = Trainable.train(self)
        except RayError as e:
            if self.config["ignore_worker_failures"]:
                logger.exception(
                    "Error in train call, attempting to recover")
                self._try_recover()
            else:
                logger.info(
                    "Worker crashed during call to train(). To attempt to "
                    "continue training without the failed worker, set "
                    "`'ignore_worker_failures': True`.")
                raise e
        else:
            break

    if result is None:
        raise RuntimeError("Failed to recover from worker crash")

    if (self.config.get("observation_filter", "NoFilter") != "NoFilter"
            and hasattr(self, "local_evaluator")):
        FilterManager.synchronize(
            self.local_evaluator.filters,
            self.remote_evaluators,
            update_remote=self.config["synchronize_filters"])
        logger.debug("synchronized filters: {}".format(
            self.local_evaluator.filters))

    if self._has_policy_optimizer():
        result["num_healthy_workers"] = len(self.optimizer.remote_evaluators)

    return result
def testSynchronize(self):
    """Synchronize applies filter buffer onto own filter."""
    filt1 = MeanStdFilter(())
    for i in range(10):
        filt1(i)
    self.assertEqual(filt1.rs.n, 10)
    filt1.clear_buffer()
    self.assertEqual(filt1.buffer.n, 0)

    RemoteEvaluator = ray.remote(_MockEvaluator)
    remote_e = RemoteEvaluator.remote(sample_count=10)
    remote_e.sample.remote()

    FilterManager.synchronize({
        "obs_filter": filt1,
        "rew_filter": filt1.copy()
    }, [remote_e])

    filters = ray.get(remote_e.get_filters.remote())
    obs_f = filters["obs_filter"]
    self.assertEqual(filt1.rs.n, 20)
    self.assertEqual(filt1.buffer.n, 0)
    self.assertEqual(obs_f.rs.n, filt1.rs.n)
    self.assertEqual(obs_f.buffer.n, filt1.buffer.n)
def _train(self):
    num_pairs = math.ceil(self.candidates_per_iteration / 2)
    candidates: list = [((0, 0), None, None)] * self.num_evals_per_iteration
    for _ in range(num_pairs):
        offset = randint(0, sys.maxsize)
        candidates.append(((offset, 1), None, None))
        candidates.append(((offset, -1), None, None))
    num_candidates = len(candidates)
    dispatch_index: int = 0

    async def manage_worker(worker):
        nonlocal candidates, dispatch_index
        while dispatch_index < num_candidates:
            index = dispatch_index
            dispatch_index += 1
            candidate = candidates[index][0]
            score, length = await worker.evaluate.remote(candidate)
            candidates[index] = (candidate, score, length)

    async def run_workers():
        tasks = [
            asyncio.create_task(manage_worker(worker))
            for worker in self._workers
        ]
        for task in tasks:
            await task

    print('run workers begin')
    loop = asyncio.new_event_loop()
    loop.run_until_complete(run_workers())
    print('run workers complete')

    # synchronize on evaluations completed
    theta_evals = candidates[0:self.num_evals_per_iteration]
    candidate_evals = candidates[self.num_evals_per_iteration:]

    # update all workers
    update_completions = [
        worker.update.remote(candidate_evals) for worker in self._workers
    ]
    info = self.common.model_keeper.update(candidate_evals)
    self.common.policy.set_flat_weights(self.common.model_keeper.theta)

    # synchronize on updates completed
    for completion in update_completions:
        ray.get(completion)
    print('update workers complete')

    episodes_this_iteration = len(candidate_evals)
    self.episodes_so_far += episodes_this_iteration

    # Now sync the filters
    FilterManager.synchronize(
        {DEFAULT_POLICY_ID: self.common.policy.observation_filter},
        self._workers)

    def extract_columns(evaluations):
        rewards = np.fromiter(
            (evaluation[1] for evaluation in evaluations), dtype=np.float32)
        lengths = np.fromiter(
            (evaluation[2] for evaluation in evaluations), dtype=np.int64)
        return rewards, lengths

    theta_rewards, theta_lengths = extract_columns(theta_evals)
    candidate_rewards, candidate_lengths = extract_columns(candidate_evals)

    def impute(func, a, min_size=1):
        return func(a) if a.size >= min_size else math.nan

    def accumulate_distribution_stats(name, a):
        nonlocal info
        info[name + 'mean'] = impute(np.mean, a)
        info[name + 'stdev'] = impute(np.std, a, 2)
        info[name + 'min'] = impute(np.min, a)
        info[name + 'max'] = impute(np.max, a)

    accumulate_distribution_stats('candidate_reward_', candidate_rewards)
    accumulate_distribution_stats('candidate_length_', candidate_lengths)
    accumulate_distribution_stats('best_reward_', theta_rewards)
    accumulate_distribution_stats('best_length_', theta_lengths)
    info['episodes_this_iter'] = episodes_this_iteration
    info['episodes_so_far'] = self.episodes_so_far

    result = {
        'episode_reward_mean': info['best_reward_mean'],
        'episode_len_mean': info['best_length_mean'],
        'timesteps_this_iter': np.sum(candidate_lengths),
        'episode_reward_max': info['best_reward_max'],
        'episode_reward_min': info['best_reward_min'],
        'episodes_this_iter': episodes_this_iteration,
        'episodes_total': self.episodes_so_far,
        'info': info,
    }
    return result
def _train(self): config = self.config theta = self.theta_dict[self.curr_parent] #print('theta shape is {}'.format(np.array(theta).shape)) self.policy.set_weights(theta) assert theta.dtype == np.float32 # Put the current policy weights in the object store. theta_id = ray.put(theta) # Use the actors to do rollouts, note that we pass in the ID of the # policy weights. results, num_episodes, num_timesteps, self.policymax, self.rewmax = self._collect_results( theta_id, config["episodes_per_batch"], config["train_batch_size"]) all_noise_indices = [] all_training_returns = [] all_training_lengths = [] all_eval_returns = [] all_eval_lengths = [] all_training_acc_returns = [] all_eval_acc_returns = [] all_policy_weight = [] all_novelty = [] all_rew_chgs = [] all_entro_chgs = [] all_distances = [] # Loop over the results. for result in results: all_eval_returns += result.eval_returns all_eval_lengths += result.eval_lengths all_eval_acc_returns += result.eval_acc_returns all_noise_indices += result.noise_indices all_training_returns += result.noisy_returns all_training_lengths += result.noisy_lengths all_training_acc_returns += result.noisy_acc_returns all_policy_weight += result.policy_weights all_novelty += result.novelty assert len(all_eval_returns) == len(all_eval_lengths) assert (len(all_noise_indices) == len(all_training_returns) == len(all_training_lengths)) self.episodes_so_far += num_episodes # Assemble the results. eval_returns = np.array(all_eval_returns) eval_lengths = np.array(all_eval_lengths) noise_indices = np.array(all_noise_indices) noisy_returns = np.array(all_training_returns) noisy_lengths = np.array(all_training_lengths) novelty_entropy = np.array(all_novelty) eval_acc_returns = np.array(all_eval_acc_returns) noisy_acc_returns = np.array(all_training_acc_returns) # Process the returns. # Compute and take a step. #print('enter1') population = self.population returns_n2 = self.returns_n2 ret = self.ret population.extend(all_policy_weight) #returns_n2.extend(all_training_returns) returns_n2.extend(all_training_acc_returns) ret.extend(all_training_returns) population.extend(self.policymax) returns_n2.extend(self.maxrew) ret.extend(self.maxrew2) population2 = np.array(population) returns2_n2 = np.array(returns_n2) ret2 = np.array(ret) returns2_n2, indices = np.unique(returns2_n2, return_index=True) #population2=population2[indices] ret2, ind = np.unique(ret2, return_index=True) population2 = population2[ind] #print('enter2') #print('population shape is {}'.format(population2.shape)) print('returns_n2 is {}'.format(returns_n2)) print('ret2 is {}'.format(ret2)) print('returns2_n2 shape is {}'.format(returns2_n2.shape)) if population2.shape[0] >= config["population_size"]: if len(returns2_n2.tolist()) >= config["population_size"]: idx = np.argpartition( returns2_n2, (-config["population_size"], -1))[-1:-config["population_size"] - 1:-1] #population2 = population2[idx] returns2_n2 = returns2_n2[idx] if len(ret2.tolist()) >= config["population_size"]: idx2 = np.argpartition( ret2, (-config["population_size"], -1))[-1:-config["population_size"] - 1:-1] ret2 = ret2[idx2] population2 = population2[idx2] #print('enter3') theta = population2[0][0] #print('shape of theta is {}'.format(theta.shape)) self.population = population2.tolist() self.returns_n2 = returns2_n2.tolist() self.ret = ret2.tolist() print("returns_n2 is {}".format(self.returns_n2)) print("ret2 is {}".format(ret2)) g = -1000 update_ratio = -1000 # Set the new weights in the local copy of the policy. 
self.policy.set_weights(theta) self.theta_dict[self.curr_parent] = self.policy.get_weights() # Store the rewards if len(all_eval_returns) > 0: if self.curr_parent == 0: self.reward_list1.append(np.mean(eval_returns)) if self.curr_parent == 1: self.reward_list2.append(np.mean(eval_returns)) if self.curr_parent == 2: self.reward_list3.append(np.mean(eval_returns)) # Now sync the filters FilterManager.synchronize({"default": self.policy.get_filter()}, self.workers) info = { "weights_norm": np.square(theta).sum(), "grad_norm": np.square(g).sum(), "update_ratio": update_ratio, "episodes_this_iter": noisy_lengths.size, "episodes_so_far": self.episodes_so_far, } #self.iteration if self.curr_parent == 0: self.reward_mean1 = np.mean( self.reward_list1[-self.report_length:]) # if len(self.reward_list1)>=self.report_length: # reward_max1=np.max(self.reward_list1[-self.report_length:]) # reward_min1=np.min(self.reward_list1[-self.report_length:]) if self.curr_parent == 1: self.reward_mean2 = np.mean( self.reward_list2[-self.report_length:]) # if len(self.reward_list2)>=self.report_length: # reward_max2=np.max(self.reward_list2[-self.report_length:]) # reward_min2=np.min(self.reward_list2[-self.report_length:]) if self.curr_parent == 2: self.reward_mean3 = np.mean( self.reward_list3[-self.report_length:]) # if len(self.reward_list3)>=self.report_length: # reward_max3=np.max(self.reward_list3[-self.report_length:]) # reward_min3=np.min(self.reward_list3[-self.report_length:]) # reward_mean_noise=np.mean(all_training_returns) # reward_max_noise=np.max(all_training_returns) # reward_min_noise=np.min(all_training_returns) result = dict( #episode_reward_min1=reward_min1, episode_reward_mean1=self.reward_mean1, #episode_reward_max1=reward_max1, #episode_reward_min2=reward_min2, episode_reward_mean2=self.reward_mean2, #episode_reward_max2=reward_max2, #episode_reward_min3=reward_min3, episode_reward_mean3=self.reward_mean3, #episode_reward_max3=reward_max3, # noise_reward_min=reward_min_noise, # noise_reward_mean=reward_mean_noise, # noise_reward_max=reward_max_noise, episode_len_mean=eval_lengths.mean(), timesteps_this_iter=noisy_lengths.sum(), info=info) self.curr_parent = (self.curr_parent + 1) % config["pop_size"] return result
def _train(self):
    self.optimizer.step()
    FilterManager.synchronize(self.local_evaluator.filters,
                              self.remote_evaluators)
    return collect_metrics(self.local_evaluator, self.remote_evaluators)
def train(self): agents = self.remote_evaluators config = self.config model = self.local_evaluator if (config["num_workers"] * config["min_steps_per_task"] > config["timesteps_per_batch"]): print( "WARNING: num_workers * min_steps_per_task > " "timesteps_per_batch. This means that the output of some " "tasks will be wasted. Consider decreasing " "min_steps_per_task or increasing timesteps_per_batch.") while self.global_step < self.config['num_batches']: iter_start = time.time() weights = ray.put(model.get_weights()) [a.set_weights.remote(weights) for a in agents] samples = collect_samples(agents, config, self.local_evaluator) def standardized(value): # Divide by the maximum of value.std() and 1e-4 # to guard against the case where all values are equal return (value - value.mean()) / max(1e-4, value.std()) samples.data["advantages"] = standardized(samples["advantages"]) rollouts_end = time.time() print("Computing policy (iterations=" + str(config["num_sgd_iter"]) + ", stepsize=" + str(config["sgd_stepsize"]) + "):") samples.shuffle() shuffle_end = time.time() # tuples_per_device = model.load_data( # samples, self.iteration == 0 and config["full_trace_data_load"]) tuples_per_device = model.load_data( samples, config["full_trace_data_load"]) load_end = time.time() rollouts_time = rollouts_end - iter_start shuffle_time = shuffle_end - rollouts_end load_time = load_end - shuffle_end sgd_time = 0 for i in range(config["num_sgd_iter"]): sgd_start = time.time() batch_index = 0 num_batches = ( int(tuples_per_device) // int(model.per_device_batch_size)) permutation = np.random.permutation(num_batches) while batch_index < num_batches: model.run_sgd_minibatch(permutation[batch_index] * model.per_device_batch_size, self.kl_coeff, False, self.file_writer) batch_index += 1 sgd_end = time.time() sgd_time += sgd_end - sgd_start self.global_step += 1 # if kl > 2.0 * config["kl_target"]: # self.kl_coeff *= 1.5 # elif kl < 0.5 * config["kl_target"]: # self.kl_coeff *= 0.5 FilterManager.synchronize( self.local_evaluator.filters, self.remote_evaluators) info = { # "kl_divergence": kl, # "kl_coefficient": self.kl_coeff, "rollouts_time": rollouts_time, "shuffle_time": shuffle_time, "load_time": load_time, "sgd_time": sgd_time, "sample_throughput": len(samples["observations"]) / sgd_time } print(info) if self.global_step % self.config['batches_per_save'] == 0: self._save() if self.global_step % self.config['batches_per_evaluate'] == 0: pl, vl, ent, kl, e_pl, e_vl, e_ent, e_kl, rew, leng = \ self.local_evaluator.get_evaluate_metrics() stats = tf.Summary(value=[ tf.Summary.Value(tag="reward", simple_value=rew), tf.Summary.Value(tag="episode_length", simple_value=leng), tf.Summary.Value(tag="policy_loss", simple_value=pl), tf.Summary.Value(tag="value_loss", simple_value=vl), tf.Summary.Value(tag="entropy", simple_value=ent), tf.Summary.Value(tag="kl", simple_value=kl), tf.Summary.Value(tag="e_policy_loss", simple_value=e_pl), tf.Summary.Value(tag="e_value_loss", simple_value=e_vl), tf.Summary.Value(tag="e_entropy", simple_value=e_ent), tf.Summary.Value(tag="e_kl", simple_value=e_kl), ],) self.file_writer.add_summary(stats, self.global_step)
def _train(self): config = self.config # Here the iteration starts # Create the random environments configurations for each iteration logger.info("Creating random environment configurations") # ceil(config["num_rollouts"]/config["num_workers"]*2)*config["num_workers"] # is the exact number of environments created per iteration as the iteration is incremently # increase by 2 (one for positive and one for negative perturbation) for each worker # For each positive and negative perturbation, the same environment # But if we do this it will be corrolated with number of rollouts and we will have less number of rollouts with different perturbations random_env_config_id = create_random_env_configs.remote(self.extra_config["num_randomized_envs"], self.domain_randomization_config) self.random_env_config = SharedRandomEnvConfigsTable(ray.get(random_env_config_id)) for worker in self.workers: worker.setRandomEnvConfig.remote(random_env_config_id) theta = self.policy.get_weights() assert theta.dtype == np.float32 # Put the current policy weights in the object store. theta_id = ray.put(theta) # Use the actors to do rollouts, note that we pass in the ID of the # policy weights. results, num_episodes, num_timesteps = self._collect_results( theta_id, config["num_rollouts"]) all_noise_indices = [] all_training_returns = [] all_training_lengths = [] all_eval_returns = [] all_eval_lengths = [] # Loop over the results. for result in results: all_eval_returns += result.eval_returns all_eval_lengths += result.eval_lengths all_noise_indices += result.noise_indices all_training_returns += result.noisy_returns all_training_lengths += result.noisy_lengths assert len(all_eval_returns) == len(all_eval_lengths) assert (len(all_noise_indices) == len(all_training_returns) == len(all_training_lengths)) self.episodes_so_far += num_episodes # Assemble the results. eval_returns = np.array(all_eval_returns) eval_lengths = np.array(all_eval_lengths) noise_indices = np.array(all_noise_indices) noisy_returns = np.array(all_training_returns) noisy_lengths = np.array(all_training_lengths) # keep only the best returns # select top performing directions if rollouts_used < num_rollouts max_rewards = np.max(noisy_returns, axis=1) if self.rollouts_used > self.num_rollouts: self.rollouts_used = self.num_rollouts percentile = 100 * (1 - (self.rollouts_used / self.num_rollouts)) idx = np.arange(max_rewards.size)[ max_rewards >= np.percentile(max_rewards, percentile)] noise_idx = noise_indices[idx] noisy_returns = noisy_returns[idx, :] # Compute and take a step. It means take a step in changing the theta not take an action g, count = utils.batched_weighted_sum( noisy_returns[:, 0] - noisy_returns[:, 1], (self.noise.get(index, self.policy.num_params) for index in noise_idx), batch_size=min(500, noisy_returns[:, 0].size)) g /= noise_idx.size # scale the returns by their standard deviation if not np.isclose(np.std(noisy_returns), 0.0): g /= np.std(noisy_returns) assert (g.shape == (self.policy.num_params, ) and g.dtype == np.float32) # Compute the new weights theta. theta, update_ratio = self.optimizer.update(-g) # Set the new weights in the local copy of the policy. 
self.policy.set_weights(theta) # update the reward list if len(all_eval_returns) > 0: self.reward_list.append(eval_returns.mean()) # Now sync the filters FilterManager.synchronize({ DEFAULT_POLICY_ID: self.policy.get_filter() }, self.workers) info = { "weights_norm": np.square(theta).sum(), "weights_std": np.std(theta), "grad_norm": np.square(g).sum(), "update_ratio": update_ratio, "episodes_this_iter": noisy_lengths.size, "episodes_so_far": self.episodes_so_far, } result = dict( episode_reward_mean=np.mean( self.reward_list[-self.report_length:]), episode_len_mean=eval_lengths.mean(), timesteps_this_iter=noisy_lengths.sum(), info=info) return result
def _train(self):
    self.optimizer.step()
    FilterManager.synchronize(self.local_evaluator.filters,
                              self.remote_evaluators)
    res = self._fetch_metrics_from_remote_evaluators()
    return res
def _train(self): agents = self.remote_evaluators config = self.config model = self.local_evaluator if (config["num_workers"] * config["min_steps_per_task"] > config["timesteps_per_batch"]): print( "WARNING: num_workers * min_steps_per_task > " "timesteps_per_batch. This means that the output of some " "tasks will be wasted. Consider decreasing " "min_steps_per_task or increasing timesteps_per_batch.") print("===> iteration", self.iteration) iter_start = time.time() weights = ray.put(model.get_weights()) [a.set_weights.remote(weights) for a in agents] samples = collect_samples(agents, config, self.local_evaluator) def standardized(value): # Divide by the maximum of value.std() and 1e-4 # to guard against the case where all values are equal return (value - value.mean()) / max(1e-4, value.std()) samples.data["advantages"] = standardized(samples["advantages"]) rollouts_end = time.time() print("Computing policy (iterations=" + str(config["num_sgd_iter"]) + ", stepsize=" + str(config["sgd_stepsize"]) + "):") names = [ "iter", "total loss", "policy loss", "vf loss", "kl", "entropy"] print(("{:>15}" * len(names)).format(*names)) samples.shuffle() shuffle_end = time.time() tuples_per_device = model.load_data( samples, self.iteration == 0 and config["full_trace_data_load"]) load_end = time.time() rollouts_time = rollouts_end - iter_start shuffle_time = shuffle_end - rollouts_end load_time = load_end - shuffle_end sgd_time = 0 for i in range(config["num_sgd_iter"]): sgd_start = time.time() batch_index = 0 num_batches = ( int(tuples_per_device) // int(model.per_device_batch_size)) loss, policy_loss, vf_loss, kl, entropy = [], [], [], [], [] permutation = np.random.permutation(num_batches) # Prepare to drop into the debugger if self.iteration == config["tf_debug_iteration"]: model.sess = tf_debug.LocalCLIDebugWrapperSession(model.sess) while batch_index < num_batches: full_trace = ( i == 0 and self.iteration == 0 and batch_index == config["full_trace_nth_sgd_batch"]) batch_loss, batch_policy_loss, batch_vf_loss, batch_kl, \ batch_entropy = model.run_sgd_minibatch( permutation[batch_index] * model.per_device_batch_size, self.kl_coeff, full_trace, self.file_writer) loss.append(batch_loss) policy_loss.append(batch_policy_loss) vf_loss.append(batch_vf_loss) kl.append(batch_kl) entropy.append(batch_entropy) batch_index += 1 loss = np.mean(loss) policy_loss = np.mean(policy_loss) vf_loss = np.mean(vf_loss) kl = np.mean(kl) entropy = np.mean(entropy) sgd_end = time.time() print( "{:>15}{:15.5e}{:15.5e}{:15.5e}{:15.5e}{:15.5e}".format( i, loss, policy_loss, vf_loss, kl, entropy)) values = [] if i == config["num_sgd_iter"] - 1: metric_prefix = "ppo/sgd/final_iter/" values.append(tf.Summary.Value( tag=metric_prefix + "kl_coeff", simple_value=self.kl_coeff)) values.extend([ tf.Summary.Value( tag=metric_prefix + "mean_entropy", simple_value=entropy), tf.Summary.Value( tag=metric_prefix + "mean_loss", simple_value=loss), tf.Summary.Value( tag=metric_prefix + "mean_kl", simple_value=kl)]) if self.file_writer: sgd_stats = tf.Summary(value=values) self.file_writer.add_summary(sgd_stats, self.global_step) self.global_step += 1 sgd_time += sgd_end - sgd_start if kl > 2.0 * config["kl_target"]: self.kl_coeff *= 1.5 elif kl < 0.5 * config["kl_target"]: self.kl_coeff *= 0.5 info = { "kl_divergence": kl, "kl_coefficient": self.kl_coeff, "rollouts_time": rollouts_time, "shuffle_time": shuffle_time, "load_time": load_time, "sgd_time": sgd_time, "sample_throughput": len(samples["observations"]) / sgd_time } 
FilterManager.synchronize( self.local_evaluator.filters, self.remote_evaluators) res = self._fetch_metrics_from_remote_evaluators() res = res._replace(info=info) return res
def _train(self): agents = self.remote_evaluators config = self.config model = self.local_evaluator print("===> iteration", self.iteration) iter_start = time.time() weights = ray.put(model.get_weights()) [a.set_weights.remote(weights) for a in agents] samples = collect_samples(agents, config, self.local_evaluator) def standardized(value): # Divide by the maximum of value.std() and 1e-4 # to guard against the case where all values are equal return (value - value.mean()) / max(1e-4, value.std()) samples.data["advantages"] = standardized(samples["advantages"]) rollouts_end = time.time() print("Computing policy (iterations=" + str(config["num_sgd_iter"]) + ", stepsize=" + str(config["sgd_stepsize"]) + "):") names = [ "iter", "total loss", "policy loss", "vf loss", "kl", "entropy" ] print(("{:>15}" * len(names)).format(*names)) samples.shuffle() shuffle_end = time.time() tuples_per_device = model.load_data( samples, self.iteration == 0 and config["full_trace_data_load"]) load_end = time.time() rollouts_time = rollouts_end - iter_start shuffle_time = shuffle_end - rollouts_end load_time = load_end - shuffle_end sgd_time = 0 for i in range(config["num_sgd_iter"]): sgd_start = time.time() batch_index = 0 num_batches = (int(tuples_per_device) // int(model.per_device_batch_size)) loss, policy_loss, vf_loss, kl, entropy = [], [], [], [], [] permutation = np.random.permutation(num_batches) # Prepare to drop into the debugger if self.iteration == config["tf_debug_iteration"]: model.sess = tf_debug.LocalCLIDebugWrapperSession(model.sess) while batch_index < num_batches: full_trace = (i == 0 and self.iteration == 0 and batch_index == config["full_trace_nth_sgd_batch"]) batch_loss, batch_policy_loss, batch_vf_loss, batch_kl, \ batch_entropy = model.run_sgd_minibatch( permutation[batch_index] * model.per_device_batch_size, self.kl_coeff, full_trace, self.file_writer) loss.append(batch_loss) policy_loss.append(batch_policy_loss) vf_loss.append(batch_vf_loss) kl.append(batch_kl) entropy.append(batch_entropy) batch_index += 1 loss = np.mean(loss) policy_loss = np.mean(policy_loss) vf_loss = np.mean(vf_loss) kl = np.mean(kl) entropy = np.mean(entropy) sgd_end = time.time() print("{:>15}{:15.5e}{:15.5e}{:15.5e}{:15.5e}{:15.5e}".format( i, loss, policy_loss, vf_loss, kl, entropy)) values = [] if i == config["num_sgd_iter"] - 1: metric_prefix = "ppo/sgd/final_iter/" values.append( tf.Summary.Value(tag=metric_prefix + "kl_coeff", simple_value=self.kl_coeff)) values.extend([ tf.Summary.Value(tag=metric_prefix + "mean_entropy", simple_value=entropy), tf.Summary.Value(tag=metric_prefix + "mean_loss", simple_value=loss), tf.Summary.Value(tag=metric_prefix + "mean_kl", simple_value=kl) ]) if self.file_writer: sgd_stats = tf.Summary(value=values) self.file_writer.add_summary(sgd_stats, self.global_step) self.global_step += 1 sgd_time += sgd_end - sgd_start if kl > 2.0 * config["kl_target"]: self.kl_coeff *= 1.5 elif kl < 0.5 * config["kl_target"]: self.kl_coeff *= 0.5 info = { "kl_divergence": kl, "kl_coefficient": self.kl_coeff, "rollouts_time": rollouts_time, "shuffle_time": shuffle_time, "load_time": load_time, "sgd_time": sgd_time, "sample_throughput": len(samples["observations"]) / sgd_time } FilterManager.synchronize(self.local_evaluator.filters, self.remote_evaluators) res = self._fetch_metrics_from_remote_evaluators() res = res._replace(info=info) return res
def _train(self): config = self.config theta = self.policy.get_weights() assert theta.dtype == np.float32 # Put the current policy weights in the object store. theta_id = ray.put(theta) # Use the actors to do rollouts, note that we pass in the ID of the # policy weights. results, num_episodes, num_timesteps = self._collect_results( theta_id, config["num_rollouts"]) all_noise_indices = [] all_training_returns = [] all_training_lengths = [] all_eval_returns = [] all_eval_lengths = [] # Loop over the results. for result in results: all_eval_returns += result.eval_returns all_eval_lengths += result.eval_lengths all_noise_indices += result.noise_indices all_training_returns += result.noisy_returns all_training_lengths += result.noisy_lengths assert len(all_eval_returns) == len(all_eval_lengths) assert (len(all_noise_indices) == len(all_training_returns) == len(all_training_lengths)) self.episodes_so_far += num_episodes # Assemble the results. eval_returns = np.array(all_eval_returns) eval_lengths = np.array(all_eval_lengths) noise_indices = np.array(all_noise_indices) noisy_returns = np.array(all_training_returns) noisy_lengths = np.array(all_training_lengths) # keep only the best returns # select top performing directions if rollouts_used < num_rollouts max_rewards = np.max(noisy_returns, axis=1) if self.rollouts_used > self.num_rollouts: self.rollouts_used = self.num_rollouts percentile = 100 * (1 - (self.rollouts_used / self.num_rollouts)) idx = np.arange(max_rewards.size)[ max_rewards >= np.percentile(max_rewards, percentile)] noise_idx = noise_indices[idx] noisy_returns = noisy_returns[idx, :] # Compute and take a step. g, count = utils.batched_weighted_sum( noisy_returns[:, 0] - noisy_returns[:, 1], (self.noise.get(index, self.policy.num_params) for index in noise_idx), batch_size=min(500, noisy_returns[:, 0].size)) g /= noise_idx.size # scale the returns by their standard deviation if not np.isclose(np.std(noisy_returns), 0.0): g /= np.std(noisy_returns) assert (g.shape == (self.policy.num_params, ) and g.dtype == np.float32) # Compute the new weights theta. theta, update_ratio = self.optimizer.update(-g) # Set the new weights in the local copy of the policy. self.policy.set_weights(theta) # update the reward list if len(all_eval_returns) > 0: self.reward_list.append(eval_returns.mean()) # Now sync the filters FilterManager.synchronize({ "default": self.policy.get_filter() }, self.workers) info = { "weights_norm": np.square(theta).sum(), "weights_std": np.std(theta), "grad_norm": np.square(g).sum(), "update_ratio": update_ratio, "episodes_this_iter": noisy_lengths.size, "episodes_so_far": self.episodes_so_far, } result = dict( episode_reward_mean=np.mean( self.reward_list[-self.report_length:]), episode_len_mean=eval_lengths.mean(), timesteps_this_iter=noisy_lengths.sum(), info=info) return result
def _train(self): config = self.config theta = self.policy.get_weights() assert theta.dtype == np.float32 # Put the current policy weights in the object store. theta_id = ray.put(theta) # Use the actors to do rollouts, note that we pass in the ID of the # policy weights. results, num_episodes, num_timesteps = self._collect_results( theta_id, config["episodes_per_batch"], config["train_batch_size"]) all_noise_indices = [] all_training_returns = [] all_training_lengths = [] all_eval_returns = [] all_eval_lengths = [] # Loop over the results. for result in results: all_eval_returns += result.eval_returns all_eval_lengths += result.eval_lengths all_noise_indices += result.noise_indices all_training_returns += result.noisy_returns all_training_lengths += result.noisy_lengths assert len(all_eval_returns) == len(all_eval_lengths) assert (len(all_noise_indices) == len(all_training_returns) == len(all_training_lengths)) self.episodes_so_far += num_episodes # Assemble the results. eval_returns = np.array(all_eval_returns) eval_lengths = np.array(all_eval_lengths) noise_indices = np.array(all_noise_indices) noisy_returns = np.array(all_training_returns) noisy_lengths = np.array(all_training_lengths) # Process the returns. if config["return_proc_mode"] == "centered_rank": proc_noisy_returns = utils.compute_centered_ranks(noisy_returns) else: raise NotImplementedError(config["return_proc_mode"]) # Compute and take a step. g, count = utils.batched_weighted_sum( proc_noisy_returns[:, 0] - proc_noisy_returns[:, 1], (self.noise.get(index, self.policy.num_params) for index in noise_indices), batch_size=500) g /= noisy_returns.size assert (g.shape == (self.policy.num_params, ) and g.dtype == np.float32 and count == len(noise_indices)) # Compute the new weights theta. theta, update_ratio = self.optimizer.update(-g + config["l2_coeff"] * theta) # Set the new weights in the local copy of the policy. self.policy.set_weights(theta) # Store the rewards if len(all_eval_returns) > 0: self.reward_list.append(np.mean(eval_returns)) # Now sync the filters FilterManager.synchronize({ "default": self.policy.get_filter() }, self.workers) info = { "weights_norm": np.square(theta).sum(), "grad_norm": np.square(g).sum(), "update_ratio": update_ratio, "episodes_this_iter": noisy_lengths.size, "episodes_so_far": self.episodes_so_far, } reward_mean = np.mean(self.reward_list[-self.report_length:]) result = dict( episode_reward_mean=reward_mean, episode_len_mean=eval_lengths.mean(), timesteps_this_iter=noisy_lengths.sum(), info=info) return result
def _train(self): optimizer: CoordinatedOptimizer = self.common.optimizer candidates = optimizer.ask() best_message = optimizer.best() for i in range(self.num_evals_per_iteration): candidates.append(best_message) num_candidates = len(candidates) dispatch_index: int = num_candidates - 1 async def manage_worker(worker): nonlocal candidates, dispatch_index, result while dispatch_index >= 0: index = dispatch_index dispatch_index -= 1 candidate = candidates[index] score, length = await worker.evaluate.remote(candidate) candidates[index] = (score, candidate, length) async def run_workers(): workers = self._workers num_workers = len(workers) tasks = [ asyncio.create_task(manage_worker(workers[i % num_workers])) for i in range(self.request_interleaving * num_workers) ] for task in tasks: # synchronize on evaluations completed await task loop = asyncio.new_event_loop() loop.run_until_complete(run_workers()) theta_evals = candidates[-self.num_evals_per_iteration:] candidate_evals = candidates[0:-self.num_evals_per_iteration] # update all workers update_completions = [ worker.tell.remote(candidate_evals) for worker in self._workers ] optimizer.tell_messages(candidate_evals) self.common.policy.set_flat_weights(optimizer.best_candidate()) # synchronize on updates completed for completion in update_completions: ray.get(completion) episodes_this_iteration = len(candidate_evals) self.episodes_so_far += episodes_this_iteration # Now sync the filters FilterManager.synchronize( {DEFAULT_POLICY_ID: self.common.policy.observation_filter}, self._workers) # FilterManager.synchronize({ # DEFAULT_POLICY_ID: self.common.policy.get_filter() # }, self._workers) info = optimizer.status() def extract_columns(evaluations): rewards = np.fromiter( (evaluation[0] for evaluation in evaluations), dtype=np.float32) lengths = np.fromiter( (evaluation[2] for evaluation in evaluations), dtype=np.int) return rewards, lengths theta_rewards, theta_lengths = extract_columns(theta_evals) candidate_rewards, candidate_lengths = extract_columns(candidate_evals) def impute(func, a, min_size=1): return func(a) if a.size >= min_size else math.nan def accumulate_distribution_stats(name, a): nonlocal info info[name + 'mean'] = impute(np.mean, a) info[name + 'stdev'] = impute(np.std, a, 2) info[name + 'min'] = impute(np.min, a) info[name + 'max'] = impute(np.max, a) accumulate_distribution_stats('candidate_reward_', candidate_rewards) accumulate_distribution_stats('candidate_length_', candidate_lengths) accumulate_distribution_stats('best_reward_', theta_rewards) accumulate_distribution_stats('best_length_', theta_lengths) info['episodes_this_iter'] = episodes_this_iteration info['episodes_so_far'] = self.episodes_so_far result = { 'episode_reward_mean': info['best_reward_mean'], 'episode_len_mean': info['best_length_mean'], 'timesteps_this_iter': np.sum(candidate_lengths), 'episode_reward_max': info['best_reward_max'], 'episode_reward_min': info['best_reward_min'], 'episodes_this_iter': episodes_this_iteration, 'episodes_total': self.episodes_so_far, 'info': info, } return result
def step(self):
    config = self.config

    theta = self.policy.get_flat_weights()
    assert theta.dtype == np.float32
    assert len(theta.shape) == 1

    # Put the current policy weights in the object store.
    theta_id = ray.put(theta)
    # Use the actors to do rollouts, note that we pass in the ID of the
    # policy weights.
    results, num_episodes, num_timesteps = self._collect_results(
        theta_id, config["num_rollouts"])

    all_noise_indices = []
    all_training_returns = []
    all_training_lengths = []
    all_eval_returns = []
    all_eval_lengths = []

    # Loop over the results.
    for result in results:
        all_eval_returns += result.eval_returns
        all_eval_lengths += result.eval_lengths
        all_noise_indices += result.noise_indices
        all_training_returns += result.noisy_returns
        all_training_lengths += result.noisy_lengths

    assert len(all_eval_returns) == len(all_eval_lengths)
    assert (len(all_noise_indices) == len(all_training_returns) ==
            len(all_training_lengths))

    self.episodes_so_far += num_episodes

    # Assemble the results.
    eval_returns = np.array(all_eval_returns)
    eval_lengths = np.array(all_eval_lengths)
    noise_indices = np.array(all_noise_indices)
    noisy_returns = np.array(all_training_returns)
    noisy_lengths = np.array(all_training_lengths)

    # Keep only the best returns:
    # select top performing directions if rollouts_used < num_rollouts.
    max_rewards = np.max(noisy_returns, axis=1)
    if self.rollouts_used > self.num_rollouts:
        self.rollouts_used = self.num_rollouts
    percentile = 100 * (1 - (self.rollouts_used / self.num_rollouts))
    idx = np.arange(max_rewards.size)[
        max_rewards >= np.percentile(max_rewards, percentile)]
    noise_idx = noise_indices[idx]
    noisy_returns = noisy_returns[idx, :]

    # Compute and take a step.
    g, count = utils.batched_weighted_sum(
        noisy_returns[:, 0] - noisy_returns[:, 1],
        (self.noise.get(index, self.policy.num_params)
         for index in noise_idx),
        batch_size=min(500, noisy_returns[:, 0].size))
    g /= noise_idx.size
    # Scale the returns by their standard deviation.
    if not np.isclose(np.std(noisy_returns), 0.0):
        g /= np.std(noisy_returns)
    assert (g.shape == (self.policy.num_params, )
            and g.dtype == np.float32)

    # Compute the new weights theta.
    theta, update_ratio = self.optimizer.update(-g)
    # Set the new weights in the local copy of the policy.
    self.policy.set_flat_weights(theta)
    # Update the reward list.
    if len(all_eval_returns) > 0:
        self.reward_list.append(eval_returns.mean())

    # Now sync the filters.
    FilterManager.synchronize(
        {DEFAULT_POLICY_ID: self.policy.observation_filter}, self.workers)

    info = {
        "weights_norm": np.square(theta).sum(),
        "weights_std": np.std(theta),
        "grad_norm": np.square(g).sum(),
        "update_ratio": update_ratio,
        "episodes_this_iter": noisy_lengths.size,
        "episodes_so_far": self.episodes_so_far,
    }

    result = dict(
        episode_reward_mean=np.mean(self.reward_list[-self.report_length:]),
        episode_len_mean=eval_lengths.mean(),
        timesteps_this_iter=noisy_lengths.sum(),
        info=info)

    return result
def step_attempt(self):
    config = self.config

    theta = self.policy.get_flat_weights()
    assert theta.dtype == np.float32
    assert len(theta.shape) == 1

    # Put the current policy weights in the object store.
    theta_id = ray.put(theta)
    # Use the actors to do rollouts, note that we pass in the ID of the
    # policy weights.
    results, num_episodes, num_timesteps = self._collect_results(
        theta_id, config["episodes_per_batch"], config["train_batch_size"])

    all_noise_indices = []
    all_training_returns = []
    all_training_lengths = []
    all_eval_returns = []
    all_eval_lengths = []

    # Loop over the results.
    for result in results:
        all_eval_returns += result.eval_returns
        all_eval_lengths += result.eval_lengths
        all_noise_indices += result.noise_indices
        all_training_returns += result.noisy_returns
        all_training_lengths += result.noisy_lengths

    assert len(all_eval_returns) == len(all_eval_lengths)
    assert (len(all_noise_indices) == len(all_training_returns) ==
            len(all_training_lengths))

    self.episodes_so_far += num_episodes

    # Assemble the results.
    eval_returns = np.array(all_eval_returns)
    eval_lengths = np.array(all_eval_lengths)
    noise_indices = np.array(all_noise_indices)
    noisy_returns = np.array(all_training_returns)
    noisy_lengths = np.array(all_training_lengths)

    # Process the returns.
    if config["return_proc_mode"] == "centered_rank":
        proc_noisy_returns = utils.compute_centered_ranks(noisy_returns)
    else:
        raise NotImplementedError(config["return_proc_mode"])

    # Compute and take a step.
    g, count = utils.batched_weighted_sum(
        proc_noisy_returns[:, 0] - proc_noisy_returns[:, 1],
        (self.noise.get(index, self.policy.num_params)
         for index in noise_indices),
        batch_size=500)
    g /= noisy_returns.size
    assert (g.shape == (self.policy.num_params, )
            and g.dtype == np.float32
            and count == len(noise_indices))

    # Compute the new weights theta.
    theta, update_ratio = self.optimizer.update(
        -g + config["l2_coeff"] * theta)
    # Set the new weights in the local copy of the policy.
    self.policy.set_flat_weights(theta)
    # Store the rewards.
    if len(all_eval_returns) > 0:
        self.reward_list.append(np.mean(eval_returns))

    # Now sync the filters.
    FilterManager.synchronize(
        {DEFAULT_POLICY_ID: self.policy.observation_filter}, self._workers)

    info = {
        "weights_norm": np.square(theta).sum(),
        "grad_norm": np.square(g).sum(),
        "update_ratio": update_ratio,
        "episodes_this_iter": noisy_lengths.size,
        "episodes_so_far": self.episodes_so_far,
    }

    reward_mean = np.mean(self.reward_list[-self.report_length:])
    result = dict(
        episode_reward_mean=reward_mean,
        episode_len_mean=eval_lengths.mean(),
        timesteps_this_iter=noisy_lengths.sum(),
        info=info)

    return result
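# The ES and ARS snippets above all build their update from the same
# estimator: a return-weighted sum of the sampled perturbations,
# g = sum_i (R_i_pos - R_i_neg) * eps_i, computed in batches by
# utils.batched_weighted_sum and then normalized. Below is a minimal sketch of
# such a batched weighted sum, assuming the perturbations are available as an
# indexable array; the name weighted_perturbation_sum is illustrative, not
# RLlib's utility function.

import numpy as np


def weighted_perturbation_sum(return_deltas, perturbations, batch_size=500):
    """Accumulate sum_i delta_i * eps_i in batches; returns (g, count)."""
    deltas = np.asarray(return_deltas, dtype=np.float32)
    g = 0.0
    count = 0
    for start in range(0, len(deltas), batch_size):
        batch_deltas = deltas[start:start + batch_size]
        batch_eps = np.asarray(
            perturbations[start:start + batch_size], dtype=np.float32)
        # (b,) dot (b, num_params) -> (num_params,)
        g = g + batch_deltas.dot(batch_eps)
        count += len(batch_deltas)
    return g, count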