def maybe_download(dataset_root):
    """Download and extract the CARLA imitation-learning dataset if it is missing.

    :param dataset_root: directory under which the "AgentHuman" dataset folder is
        expected; if it is falsy or the folder is absent, the dataset is fetched
    :return: None
    """
    if not dataset_root or not os.path.exists(os.path.join(dataset_root, "AgentHuman")):
        screen.log_title("Downloading the CARLA dataset. This might take a while.")

        google_drive_download_id = "1hloAeyamYn-H6MfV1dRtY1gJPhkR55sY"
        filename_to_save = "datasets/CORL2017ImitationLearningData.tar.gz"

        # make sure the directory the archive is saved into exists, otherwise wget fails
        os.makedirs(os.path.dirname(filename_to_save), exist_ok=True)

        # the nested wget works around Google Drive's virus-scan confirmation page
        # for large files by extracting the confirmation token with sed
        download_command = 'wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=' \
                           '$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies ' \
                           '--no-check-certificate \"https://docs.google.com/uc?export=download&id={}\" -O- | ' \
                           'sed -rn \'s/.*confirm=([0-9A-Za-z_]+).*/\\1\\n/p\')&id={}" -O {} && rm -rf /tmp/cookies.txt'\
            .format(google_drive_download_id, google_drive_download_id, filename_to_save)

        # start downloading and wait for it to finish
        start_shell_command_and_wait(download_command)

        screen.log_title("Unzipping the dataset")
        unzip_command = 'tar -xzf {} --checkpoint=.10000'.format(filename_to_save)
        if dataset_root is not None:
            unzip_command += " -C {}".format(dataset_root)
            # exist_ok=True replaces the racy exists()+makedirs() pattern
            os.makedirs(dataset_root, exist_ok=True)
        start_shell_command_and_wait(unzip_command)
def restore_checkpoint(self):
    """Restore the graph's networks and all level managers from the configured checkpoint directory.

    Logs a warning (and skips the saver restore) when no checkpoint is found.
    :return: None
    """
    self.verify_graph_was_created()

    # TODO: find better way to load checkpoints that were saved with a global network into the online network
    if self.task_parameters.checkpoint_restore_dir:
        if self.task_parameters.framework_type == Frameworks.tensorflow and\
                'checkpoint' in os.listdir(self.task_parameters.checkpoint_restore_dir):
            # TODO-fixme checkpointing
            # MonitoredTrainingSession manages save/restore checkpoints autonomously. Doing so,
            # it creates it own names for the saved checkpoints, which do not match the "{}_Step-{}.ckpt" filename
            # pattern. The names used are maintained in a CheckpointState protobuf file named 'checkpoint'. Using
            # Coach's '.coach_checkpoint' protobuf file, results in an error when trying to restore the model, as
            # the checkpoint names defined do not match the actual checkpoint names.
            checkpoint = self._get_checkpoint_state_tf()
        else:
            checkpoint = get_checkpoint_state(self.task_parameters.checkpoint_restore_dir)

        if checkpoint is None:
            screen.warning("No checkpoint to restore in: {}".format(
                self.task_parameters.checkpoint_restore_dir))
        else:
            screen.log_title("Loading checkpoint: {}".format(checkpoint.model_checkpoint_path))
            self.checkpoint_saver.restore(self.sess, checkpoint.model_checkpoint_path)

        # plain loop instead of a side-effect-only list comprehension
        for manager in self.level_managers:
            manager.restore_checkpoint(self.task_parameters.checkpoint_restore_dir)
def improve(self):
    """
    The main loop of the run.
    Defined in the following steps:
    1. Heatup
    2. Repeat:
        2.1. Repeat:
            2.1.1. Act
            2.1.2. Train
            2.1.3. Possibly save checkpoint
        2.2. Evaluate
    :return: None
    """
    self.verify_graph_was_created()

    # initialize the network parameters from the global network
    self.sync()

    # heatup
    self.heatup(self.heatup_steps)

    # improve
    if self.task_parameters.task_index is None:
        screen.log_title("Starting to improve {}".format(self.name))
    else:
        screen.log_title("Starting to improve {} task index {}".format(
            self.name, self.task_parameters.task_index))

    training_steps_target = self.total_steps_counters[RunPhase.TRAIN] + self.improve_steps
    while self.total_steps_counters[RunPhase.TRAIN] < training_steps_target:
        self.train_and_act(self.steps_between_evaluation_periods)
        if self.evaluate(self.evaluation_steps):
            break
def improve_reward_model(self):
    """Train a regression model that estimates the MDP's rewards, via the top-level agent.

    :return: None
    """
    screen.log_title("Training a regression model for estimating MDP rewards")
    agent = self.level_managers[0].agents['agent']
    agent.improve_reward_model(epochs=self.reward_model_num_epochs)
def run_trace_based_test(preset_name, num_env_steps, level=None):
    """Launch a trace test of a preset as a background coach process.

    :param preset_name: the preset to run
    :param num_env_steps: number of environment steps to improve for
    :param level: optional environment level name
    :return: tuple of (experiment path, log file name, Popen handle)
    """
    level_suffix = '_' + level if level else ''
    test_name = '__test_trace_{}{}'.format(preset_name, level_suffix).replace(':', '_')
    test_path = os.path.join('./experiments', test_name)
    if path.exists(test_path):
        shutil.rmtree(test_path)

    # run the experiment in a separate thread
    screen.log_title("Running test {}{}".format(preset_name, ' - ' + level if level else ''))
    log_file_name = 'trace_test_log_{preset_name}.txt'.format(preset_name=test_name[13:])

    # short run: a single evaluation step, small heatup, and evaluation every num_env_steps
    custom_param = '\"improve_steps=EnvironmentSteps({n});' \
                   'steps_between_evaluation_periods=EnvironmentSteps({n});' \
                   'evaluation_steps=EnvironmentSteps(1);' \
                   'heatup_steps=EnvironmentSteps(1024)\"'.format(n=num_env_steps)

    cmd = (
        'python3 rl_coach/coach.py '
        '-p {preset_name} '
        '-e {test_name} '
        '--seed 42 '
        '-c '
        '--no_summary '
        '-cp {custom_param} '
        '{level} '
        '&> {log_file_name} '
    ).format(
        preset_name=preset_name,
        test_name=test_name,
        log_file_name=log_file_name,
        level='-lvl ' + level if level else '',
        custom_param=custom_param,
    )

    p = subprocess.Popen(cmd, shell=True, executable="/bin/bash", preexec_fn=os.setsid)
    return test_path, log_file_name, p
def reset_evaluation_state(self, val: RunPhase) -> None:
    """
    Initialize the evaluation accumulators when entering an evaluation (TEST) phase, and dump the
    accumulated evaluation signals to the logger when exiting one. Entering or exiting is determined
    by comparing the new phase `val` against the current phase in `self.phase`.

    :param val: The new phase to change to
    :return: None
    """
    starting_evaluation = (val == RunPhase.TEST)
    ending_evaluation = (self.phase == RunPhase.TEST)

    if starting_evaluation:
        # zero all per-evaluation accumulators
        self.accumulated_rewards_across_evaluation_episodes = 0
        self.accumulated_shaped_rewards_across_evaluation_episodes = 0
        self.num_successes_across_evaluation_episodes = 0
        self.num_evaluation_episodes_completed = 0

        if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
            screen.log_title("{}: Starting evaluation phase".format(self.name))
    elif ending_evaluation:
        # we write to the next episode, because it could be that the current episode was already written
        # to disk and then we won't write it again
        self.agent_logger.set_current_time(self.current_episode + 1)
        # NOTE(review): assumes at least one evaluation episode completed, otherwise these
        # divisions raise ZeroDivisionError — confirm callers guarantee this
        self.agent_logger.create_signal_value(
            'Evaluation Reward',
            self.accumulated_rewards_across_evaluation_episodes /
            self.num_evaluation_episodes_completed)
        self.agent_logger.create_signal_value(
            'Shaped Evaluation Reward',
            self.accumulated_shaped_rewards_across_evaluation_episodes /
            self.num_evaluation_episodes_completed)
        success_rate = self.num_successes_across_evaluation_episodes / self.num_evaluation_episodes_completed
        self.agent_logger.create_signal_value("Success Rate", success_rate)
        if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
            screen.log_title(
                "{}: Finished evaluation phase. Success rate = {}".format(
                    self.name, np.round(success_rate, 2)))
def update_kl_coefficient(self):
    """Adapt the KL penalty coefficient of the actor's policy head based on the last measured KL divergence."""
    # John Schulman takes the mean kl divergence only over the last epoch which is strange but we will follow
    # his implementation for now because we know it works well
    screen.log_title("KL = {}".format(self.total_kl_divergence_during_training_process))

    actor_network = self.networks['actor'].online_network
    policy_head = actor_network.output_heads[0]

    kl_target = self.ap.algorithm.target_kl_divergence
    kl_measured = self.total_kl_divergence_during_training_process
    current_coefficient = actor_network.get_variable_value(policy_head.kl_coefficient)

    updated_coefficient = current_coefficient
    if kl_measured > 1.3 * kl_target:
        # kl too high => increase regularization
        updated_coefficient *= 1.5
    elif kl_measured < 0.7 * kl_target:
        # kl too low => decrease regularization
        updated_coefficient /= 1.5

    # update the kl coefficient variable only when it actually changed
    if current_coefficient != updated_coefficient:
        actor_network.set_variable_value(policy_head.assign_kl_coefficient,
                                         updated_coefficient,
                                         policy_head.kl_coefficient_ph)
        screen.log_title("KL penalty coefficient change = {} -> {}".format(
            current_coefficient, updated_coefficient))
def create_graph(self, task_parameters: TaskParameters):
    """Build the full graph: level managers, environments, session and logger.

    :param task_parameters: the parameters of the running task
    :return: None
    """
    self.task_parameters = task_parameters

    is_distributed = isinstance(task_parameters, DistributedTaskParameters)
    if is_distributed:
        screen.log_title("Creating graph - name: {} task id: {} type: {}".format(
            self.__class__.__name__, task_parameters.task_index, task_parameters.job_type))
    else:
        screen.log_title("Creating graph - name: {}".format(self.__class__.__name__))

    # "hide" the gpu if necessary
    if task_parameters.use_cpu:
        set_cpu()

    # create a target server for the worker and a device
    if is_distributed:
        task_parameters.worker_target, task_parameters.device = \
            self.create_worker_or_parameters_server(task_parameters=task_parameters)

    # create the graph modules
    self.level_managers, self.environments = self._create_graph(task_parameters)

    # set self as the parent of all the level managers
    self.top_level_manager = self.level_managers[0]
    for level_manager in self.level_managers:
        level_manager.parent_graph_manager = self

    # create a session (it needs to be created after all the graph ops were created)
    self.sess = None
    self.create_session(task_parameters=task_parameters)

    self._phase = self.phase = RunPhase.UNDEFINED

    self.setup_logger()
def learn_from_batch(self, batch):
    """Train the network on a batch and propagate the gradients into the DND memory.

    :param batch: a batch of transitions to learn from
    :return: tuple of (total loss, per-head losses, unclipped gradients); zeros/empty
        until the DND has enough entries for a k-NN lookup
    """
    # guard clause: nothing to learn until the DND can answer k-NN queries
    if not self.networks['main'].online_network.output_heads[0].DND.has_enough_entries(
            self.ap.algorithm.number_of_knn):
        return 0, [], 0

    if not self.training_started:
        self.training_started = True
        screen.log_title("Finished collecting initial entries in DND. Starting to train network...")

    network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()

    TD_targets = self.networks['main'].online_network.predict(batch.states(network_keys))

    # only update the action that we have actually done in this transition
    for i in range(self.ap.network_wrappers['main'].batch_size):
        TD_targets[i, batch.actions()[i]] = batch.total_returns()[i]

    # set the gradients to fetch for the DND update
    fetches = []
    head = self.networks['main'].online_network.output_heads[0]
    if self.ap.algorithm.propagate_updates_to_DND:
        fetches = [head.dnd_embeddings_grad, head.dnd_values_grad, head.dnd_indices]

    # train the neural network
    result = self.networks['main'].train_and_sync_networks(
        batch.states(network_keys), TD_targets, fetches)
    total_loss, losses, unclipped_grads = result[:3]

    # update the DND keys and values using the extracted gradients
    if self.ap.algorithm.propagate_updates_to_DND:
        embedding_gradients = np.swapaxes(result[-1][0], 0, 1)
        value_gradients = np.swapaxes(result[-1][1], 0, 1)
        indices = np.swapaxes(result[-1][2], 0, 1)
        head.DND.update_keys_and_values(batch.actions(), embedding_gradients,
                                        value_gradients, indices)

    return total_loss, losses, unclipped_grads
def _create_graph(
        self, task_parameters: TaskParameters
) -> Tuple[List[MultiAgentLevelManager], List[Environment]]:
    """Instantiate the environment and all agents, and wrap them in a single level manager.

    :param task_parameters: the parameters of the running task
    :return: a tuple of (list of level managers, list of environments)
    """
    # environment loading
    self.env_params.seed = task_parameters.seed
    self.env_params.experiment_path = task_parameters.experiment_path
    env = short_dynamic_import(self.env_params.path)(
        **self.env_params.__dict__,
        visualization_parameters=self.visualization_parameters)

    # agent loading
    agents = OrderedDict()
    for agent_params in self.agents_params:
        agent_params.task_parameters = copy.copy(task_parameters)
        agent = short_dynamic_import(agent_params.path)(agent_params)
        agents[agent_params.name] = agent
        screen.log_title("Created agent: {}".format(agent_params.name))

        # rollout workers stream their experience through the memory backend
        is_rollout_worker = hasattr(self, 'memory_backend_params') and \
            self.memory_backend_params.run_type == str(RunType.ROLLOUT_WORKER)
        if is_rollout_worker:
            agent.memory.memory_backend = deepracer_memory.DeepRacerRolloutBackEnd(
                self.memory_backend_params,
                agent_params.algorithm.num_consecutive_playing_steps,
                agent_params.name)

    # set level manager
    level_manager = MultiAgentLevelManager(agents=agents,
                                           environment=env,
                                           name="main_level",
                                           done_condition=self.done_condition)
    return [level_manager], [env]
def open_dashboard(experiment_path):
    """Open the Coach dashboard for the given experiment in a background process.

    :param experiment_path: the experiment directory to visualize
    :return: None
    """
    dashboard_script = 'python {}/dashboard.py'.format(get_base_dir())
    cmd = "{} --experiment_dir {}".format(dashboard_script, experiment_path)
    screen.log_title("Opening dashboard - experiment path: {}".format(experiment_path))
    # stdout/stderr are inherited on purpose so dashboard errors stay visible
    # (removed the commented-out DEVNULL variant that was left behind)
    subprocess.Popen(cmd, shell=True, executable="/bin/bash")
def restore_checkpoint(self):
    """Restore online-network variables from a TensorFlow checkpoint directory.

    Variables saved under the 'global/' scope are remapped to the 'online/' scope
    before being assigned.
    :raises ValueError: if the directory contains no checkpoint state
    :return: None
    """
    self.verify_graph_was_created()

    # TODO: find better way to load checkpoints that were saved with a global network into the online network
    if hasattr(self.task_parameters, 'checkpoint_restore_dir'
               ) and self.task_parameters.checkpoint_restore_dir:
        import tensorflow as tf
        checkpoint_dir = self.task_parameters.checkpoint_restore_dir
        checkpoint = tf.train.get_checkpoint_state(checkpoint_dir)
        if checkpoint is None:
            # fail fast with a clear message instead of an AttributeError below
            raise ValueError("No checkpoint to restore in: {}".format(checkpoint_dir))
        screen.log_title("Loading checkpoint: {}".format(checkpoint.model_checkpoint_path))

        variables = {}
        for var_name, _ in tf.contrib.framework.list_variables(checkpoint_dir):
            # Load the variable
            var = tf.contrib.framework.load_variable(checkpoint_dir, var_name)

            # Rename from the global network scope to the online one
            new_name = var_name.replace('global/', 'online/')
            variables[new_name] = var

        for v in self.variables_to_restore:
            self.sess.run(v.assign(variables[v.name.split(':')[0]]))
def heatup(self, steps: PlayingStepsType) -> None:
    """
    Perform heatup for several steps, which means taking random actions and storing the results in memory
    :param steps: the number of steps as a tuple of steps time and steps count
    :return: None
    """
    self.verify_graph_was_created()

    remaining = copy.copy(steps)
    if remaining.num_steps <= 0:
        return

    self.phase = RunPhase.HEATUP
    screen.log_title("{}: Starting heatup".format(self.name))
    self.heatup_start_time = time.time()

    # reset all the levels before starting to heatup
    self.reset_internal_state(force_environment_reset=True)

    # act on the environment until the step budget is exhausted
    while remaining.num_steps > 0:
        steps_done, _ = self.act(remaining,
                                 continue_until_game_over=True,
                                 return_on_game_over=True)
        remaining.num_steps -= steps_done

    # training phase
    self.phase = RunPhase.UNDEFINED
def save_replay_buffer_and_exit(self):
    """Pickle the agent's replay buffer to the experiment directory and terminate the process.

    The memory's `tp` reference is cleared first — presumably so task parameters are
    not pickled along with the buffer (TODO confirm).
    :raises SystemExit: always, after the buffer is saved
    """
    # local import: the file's top-level import block is outside this view
    import sys

    replay_buffer_path = os.path.join(self.agent_logger.experiments_path, 'replay_buffer.p')
    self.memory.tp = None
    self.memory.save(replay_buffer_path)
    screen.log_title("Replay buffer was stored in {}".format(replay_buffer_path))
    # sys.exit() instead of the bare exit() builtin, which is injected by the
    # site module for interactive use and is not guaranteed in all environments
    sys.exit()
def init_environment_dependent_modules(self):
    """Initialize modules that depend on the environment and print the human-control key bindings.

    :return: None
    """
    super().init_environment_dependent_modules()
    self.env = self.parent_level_manager._real_environment
    screen.log_title("Human Control Mode")
    available_keys = self.env.get_available_keys()
    if available_keys:
        screen.log("Use keyboard keys to move. Press escape to quit. Available keys:")
        screen.log("")
        # reuse the list fetched above instead of querying the environment a second time
        for action, key in available_keys:
            screen.log("\t- {}: {}".format(action, key))
    screen.separator()
def restore_checkpoint(self):
    """Restore the graph from a checkpoint directory or a single checkpoint file.

    When given a directory, the latest checkpoint is located (via TF's own state file
    or Coach's reader) and the next checkpoint id is set. When given a file, only the
    tensorflow framework is supported.
    :raises ValueError: if no checkpoint exists in the directory, or a file path is
        used with a non-tensorflow framework
    :return: None
    """
    self.verify_graph_was_created()

    # TODO: find better way to load checkpoints that were saved with a global network into the online network
    if self.task_parameters.checkpoint_restore_path:
        if os.path.isdir(self.task_parameters.checkpoint_restore_path):
            # a checkpoint dir
            if self.task_parameters.framework_type == Frameworks.tensorflow and\
                    'checkpoint' in os.listdir(self.task_parameters.checkpoint_restore_path):
                # TODO-fixme checkpointing
                # MonitoredTrainingSession manages save/restore checkpoints autonomously. Doing so,
                # it creates it own names for the saved checkpoints, which do not match the "{}_Step-{}.ckpt"
                # filename pattern. The names used are maintained in a CheckpointState protobuf file named
                # 'checkpoint'. Using Coach's '.coach_checkpoint' protobuf file, results in an error when trying to
                # restore the model, as the checkpoint names defined do not match the actual checkpoint names.
                checkpoint = self._get_checkpoint_state_tf(
                    self.task_parameters.checkpoint_restore_path)
            else:
                checkpoint = get_checkpoint_state(
                    self.task_parameters.checkpoint_restore_path)

            if checkpoint is None:
                raise ValueError("No checkpoint to restore in: {}".format(
                    self.task_parameters.checkpoint_restore_path))
            model_checkpoint_path = checkpoint.model_checkpoint_path
            checkpoint_restore_dir = self.task_parameters.checkpoint_restore_path

            # Set the last checkpoint ID - only in the case of the path being a dir
            chkpt_state_reader = CheckpointStateReader(
                self.task_parameters.checkpoint_restore_path,
                checkpoint_state_optional=False)
            self.checkpoint_id = chkpt_state_reader.get_latest().num + 1
        else:
            # a checkpoint file
            if self.task_parameters.framework_type == Frameworks.tensorflow:
                model_checkpoint_path = self.task_parameters.checkpoint_restore_path
                checkpoint_restore_dir = os.path.dirname(model_checkpoint_path)
            else:
                raise ValueError(
                    "Currently restoring a checkpoint using the --checkpoint_restore_file argument is"
                    " only supported when with tensorflow.")

        screen.log_title("Loading checkpoint: {}".format(model_checkpoint_path))
        self.checkpoint_saver.restore(self.sess, model_checkpoint_path)

        # plain loop instead of a side-effect-only list comprehension
        for manager in self.level_managers:
            manager.restore_checkpoint(checkpoint_restore_dir)
def create_graph(self,
                 task_parameters=TaskParameters(),
                 stop_physics=None,
                 start_physics=None,
                 empty_service_call=None):
    """Build the full graph, optionally toggling the simulator physics around module creation.

    :param task_parameters: the parameters of the running task
    :param stop_physics: optional callable to stop the physics simulation
    :param start_physics: optional callable to start the physics simulation
    :param empty_service_call: optional callable producing the service request object
    :return: self, to allow chaining
    """
    self.graph_creation_time = time.time()
    self.task_parameters = task_parameters

    is_distributed = isinstance(task_parameters, DistributedTaskParameters)
    if is_distributed:
        screen.log_title(
            "Creating graph - name: {} task id: {} type: {}".format(
                self.__class__.__name__, task_parameters.task_index,
                task_parameters.job_type))
    else:
        screen.log_title("Creating graph - name: {}".format(self.__class__.__name__))

    # "hide" the gpu if necessary
    if task_parameters.use_cpu:
        set_cpu()

    # create a target server for the worker and a device
    if is_distributed:
        task_parameters.worker_target, task_parameters.device = \
            self.create_worker_or_parameters_server(task_parameters=task_parameters)

    # If necessary start the physics and then stop it after agent creation
    if start_physics and empty_service_call:
        start_physics(empty_service_call())

    # create the graph modules
    self.level_managers, self.environments = self._create_graph(task_parameters)

    if stop_physics and empty_service_call:
        stop_physics(empty_service_call())

    # set self as the parent of all the level managers
    self.top_level_manager = self.level_managers[0]
    for level_manager in self.level_managers:
        level_manager.parent_graph_manager = self

    # create a session (it needs to be created after all the graph ops were created)
    self.sess = {agent_params.name: None for agent_params in self.agents_params}
    self.create_session(task_parameters=task_parameters)

    self._phase = self.phase = RunPhase.UNDEFINED

    self.setup_logger()

    return self
def _restore_checkpoint_tf(self, checkpoint_dir: str):
    """Restore online-network variables from a TensorFlow checkpoint directory.

    Variables saved under the 'global/' scope are remapped to the 'online/' scope
    before being assigned.
    :param checkpoint_dir: directory containing the TF checkpoint state
    :raises ValueError: if the directory contains no checkpoint state
    :return: None
    """
    import tensorflow as tf
    checkpoint = tf.train.get_checkpoint_state(checkpoint_dir)
    if checkpoint is None:
        # fail fast with a clear message instead of an AttributeError below
        raise ValueError("No checkpoint to restore in: {}".format(checkpoint_dir))
    screen.log_title("Loading checkpoint: {}".format(checkpoint.model_checkpoint_path))

    variables = {}
    for var_name, _ in tf.contrib.framework.list_variables(checkpoint_dir):
        # Load the variable
        var = tf.contrib.framework.load_variable(checkpoint_dir, var_name)

        # Rename from the global network scope to the online one
        new_name = var_name.replace('global/', 'online/')
        variables[new_name] = var

    for v in self.variables_to_restore:
        self.sess.run(v.assign(variables[v.name.split(':')[0]]))
def heatup(self, steps: PlayingStepsType) -> None:
    """
    Perform heatup for several steps, which means taking random actions and storing the results in memory
    :param steps: the number of steps as a tuple of steps time and steps count
    :return: None
    """
    self.verify_graph_was_created()

    if steps.num_steps <= 0:
        return

    with self.phase_context(RunPhase.HEATUP):
        screen.log_title("{}: Starting heatup".format(self.name))

        # reset all the levels before starting to heatup
        self.reset_internal_state(force_environment_reset=True)

        # act for at least `steps`, though don't interrupt an episode
        step_target = self.current_step_counter + steps
        while self.current_step_counter < step_target:
            self.act(EnvironmentEpisodes(1))
def initialize_ope_models_and_stats(self):
    """
    Improve a reward model of the MDP, to be used for some of the off-policy evaluation (OPE) methods.
    e.g. 'direct method' and 'doubly robust'.
    """
    agent = self.level_managers[0].agents['agent']

    # prepare dataset to be consumed in the expected formats for OPE
    agent.memory.prepare_evaluation_dataset()

    screen.log_title("Training a regression model for estimating MDP rewards")
    agent.improve_reward_model(epochs=self.reward_model_num_epochs)

    screen.log_title("Collecting static statistics for OPE")
    main_network_wrapper = agent.ap.network_wrappers['main']
    agent.ope_manager.gather_static_shared_stats(
        evaluation_dataset_as_transitions=agent.memory.evaluation_dataset_as_transitions,
        batch_size=main_network_wrapper.batch_size,
        reward_model=agent.networks['reward_model'].online_network,
        network_keys=list(main_network_wrapper.input_embedders_parameters.keys()))
def reset_evaluation_state(self, val: RunPhase) -> None:
    """
    Perform accumulators initialization when entering an evaluation phase, and signal dumping when exiting an
    evaluation phase. Entering or exiting the evaluation phase is determined according to the new phase given
    by val, and by the current phase set in self.phase.

    :param val: The new phase to change to
    :return: None
    """
    entering_evaluation = (val == RunPhase.TEST)
    leaving_evaluation = (self.phase == RunPhase.TEST)

    if entering_evaluation:
        # zero all the per-evaluation accumulators
        self.accumulated_rewards_across_evaluation_episodes = 0
        self.accumulated_shaped_rewards_across_evaluation_episodes = 0
        self.num_successes_across_evaluation_episodes = 0
        self.num_evaluation_episodes_completed = 0

        if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
            screen.log_title("{}: Starting evaluation phase".format(self.name))
    elif leaving_evaluation:
        # we write to the next episode, because it could be that the current episode was already written
        # to disk and then we won't write it again
        self.agent_logger.set_current_time(self.current_episode + 1)

        episodes_completed = self.num_evaluation_episodes_completed
        evaluation_reward = self.accumulated_rewards_across_evaluation_episodes / episodes_completed
        shaped_evaluation_reward = \
            self.accumulated_shaped_rewards_across_evaluation_episodes / episodes_completed
        success_rate = self.num_successes_across_evaluation_episodes / episodes_completed

        self.agent_logger.create_signal_value('Evaluation Reward', evaluation_reward)
        self.agent_logger.create_signal_value('Shaped Evaluation Reward', shaped_evaluation_reward)
        self.agent_logger.create_signal_value("Success Rate", success_rate)

        if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
            screen.log_title("{}: Finished evaluation phase. Success rate = {}, Avg Total Reward = {}"
                             .format(self.name, np.round(success_rate, 2),
                                     np.round(evaluation_reward, 2)))
def perform_reward_based_tests(args, preset_validation_params, preset_name):
    """Run a preset as a subprocess and check that its evaluation reward reaches a threshold in time.

    :param args: parsed CLI arguments (time_limit, verbose, no_progress_bar, ...)
    :param preset_validation_params: validation parameters of the preset (thresholds, worker count, level)
    :param preset_name: the preset to run
    :return: True if the averaged evaluation reward reached the minimum threshold in time
    """
    win_size = 10

    test_name = '__test_reward'
    test_path = os.path.join('./experiments', test_name)
    if path.exists(test_path):
        shutil.rmtree(test_path)

    # run the experiment in a separate thread
    screen.log_title("Running test {}".format(preset_name))
    log_file_name = 'test_log_{preset_name}.txt'.format(preset_name=preset_name)
    cmd = ('python3 rl_coach/coach.py '
           '-p {preset_name} '
           '-e {test_name} '
           '-n {num_workers} '
           '--seed 0 '
           '-c '
           '{level} '
           '&> {log_file_name} ').format(
               preset_name=preset_name,
               test_name=test_name,
               num_workers=preset_validation_params.num_workers,
               log_file_name=log_file_name,
               level='-lvl ' + preset_validation_params.reward_test_level
                     if preset_validation_params.reward_test_level else '')

    p = subprocess.Popen(cmd, shell=True, executable="/bin/bash", preexec_fn=os.setsid)

    start_time = time.time()

    reward_str = 'Evaluation Reward'
    if preset_validation_params.num_workers > 1:
        filename_pattern = 'worker_0*.csv'
    else:
        filename_pattern = '*.csv'

    test_passed = False

    # get the csv with the results
    csv_paths = read_csv_paths(test_path, filename_pattern)

    if csv_paths:
        csv_path = csv_paths[0]

        # verify results
        csv = None
        time.sleep(1)
        averaged_rewards = [0]

        last_num_episodes = 0

        if not args.no_progress_bar:
            print_progress(averaged_rewards, last_num_episodes,
                           preset_validation_params, start_time, args)

        while csv is None or (csv['Episode #'].values[-1] <
                              preset_validation_params.max_episodes_to_achieve_reward
                              and time.time() - start_time < args.time_limit):
            try:
                csv = pd.read_csv(csv_path)
            except Exception:
                # sometimes the csv is being written at the same time we are
                # trying to read it. no problem -> try again
                # (narrowed from a bare except, which also swallowed KeyboardInterrupt)
                continue

            if reward_str not in csv.keys():
                continue

            rewards = csv[reward_str].values
            rewards = rewards[~np.isnan(rewards)]

            if len(rewards) >= 1:
                averaged_rewards = np.convolve(
                    rewards,
                    np.ones(min(len(rewards), win_size)) / win_size,
                    mode='valid')
            else:
                time.sleep(1)
                continue

            if not args.no_progress_bar:
                print_progress(averaged_rewards, last_num_episodes,
                               preset_validation_params, start_time, args)

            if csv['Episode #'].shape[0] - last_num_episodes <= 0:
                continue

            last_num_episodes = csv['Episode #'].values[-1]

            # check if reward is enough
            if np.any(averaged_rewards >= preset_validation_params.min_reward_threshold):
                test_passed = True
                break
            time.sleep(1)

    # kill test and print result
    os.killpg(os.getpgid(p.pid), signal.SIGTERM)
    screen.log('')
    if test_passed:
        screen.success("Passed successfully")
    else:
        if time.time() - start_time > args.time_limit:
            screen.error("Failed due to exceeding time limit", crash=False)
            if args.verbose:
                screen.error("command exitcode: {}".format(p.returncode), crash=False)
                screen.error(open(log_file_name).read(), crash=False)
        elif csv_paths:
            screen.error("Failed due to insufficient reward", crash=False)
            if args.verbose:
                screen.error("command exitcode: {}".format(p.returncode), crash=False)
                screen.error(open(log_file_name).read(), crash=False)
            screen.error(
                "preset_validation_params.max_episodes_to_achieve_reward: {}".format(
                    preset_validation_params.max_episodes_to_achieve_reward),
                crash=False)
            screen.error(
                "preset_validation_params.min_reward_threshold: {}".format(
                    preset_validation_params.min_reward_threshold),
                crash=False)
            screen.error("averaged_rewards: {}".format(averaged_rewards), crash=False)
            screen.error("episode number: {}".format(csv['Episode #'].values[-1]),
                         crash=False)
        else:
            screen.error("csv file never found", crash=False)
            if args.verbose:
                screen.error("command exitcode: {}".format(p.returncode), crash=False)
                screen.error(open(log_file_name).read(), crash=False)

    shutil.rmtree(test_path)
    os.remove(log_file_name)
    return test_passed
def _validate(graph_manager, task_parameters, transitions, s3_bucket, s3_prefix,
              aws_region):
    """Validate the last (and, when present, the best) checkpoint by emulating one act step.

    :param graph_manager: graph manager whose data store holds the checkpoints
    :param task_parameters: task parameters used to create the graph
    :param transitions: transitions to replay through emulate_act_on_trainer
    :param s3_bucket: unused here; kept for interface compatibility with callers
    :param s3_prefix: unused here; kept for interface compatibility with callers
    :param aws_region: unused here; kept for interface compatibility with callers
    :return: None
    """
    checkpoint = graph_manager.data_store.params.checkpoint_dict['agent']
    # (removed the unused local that copied task_parameters.checkpoint_restore_path)
    graph_manager.data_store.wait_for_checkpoints()

    # validate last checkpoint
    last_model_checkpoint_name = \
        checkpoint.deepracer_checkpoint_json.get_deepracer_last_checkpoint()
    if checkpoint.rl_coach_checkpoint.update(
            model_checkpoint_name=last_model_checkpoint_name,
            s3_kms_extra_args=utils.get_s3_kms_extra_args()):
        screen.log_title(" Validating Last Checkpoint: {}".format(
            last_model_checkpoint_name))
        # load the last rl coach checkpoint from store
        graph_manager.data_store.load_from_store()
        graph_manager.create_graph(task_parameters)
        graph_manager.phase = RunPhase.TEST
        screen.log_title(" Start emulate_act_on_trainer on Last Checkpoint")
        graph_manager.emulate_act_on_trainer(EnvironmentSteps(1),
                                             transitions=transitions)
        screen.log_title(" emulate_act_on_trainer on Last Checkpoint completed!")

        # validate best checkpoint: Best checkpoint might not exist.
        best_model_checkpoint_name = \
            checkpoint.deepracer_checkpoint_json.get_deepracer_best_checkpoint()
        if checkpoint.rl_coach_checkpoint.update(
                model_checkpoint_name=best_model_checkpoint_name,
                s3_kms_extra_args=utils.get_s3_kms_extra_args()):
            screen.log_title(" Validating Best Checkpoint: {}".format(
                best_model_checkpoint_name))
            # load the best rl coach checkpoint from store
            graph_manager.data_store.load_from_store()
            graph_manager.restore_checkpoint()
            screen.log_title(" Start emulate_act_on_trainer on Best Checkpoint")
            graph_manager.emulate_act_on_trainer(EnvironmentSteps(1),
                                                 transitions=transitions)
            screen.log_title(" emulate_act_on_trainer on Best Checkpoint completed!")
        else:
            screen.log_title(" No Best Checkpoint to validate.")
    else:
        screen.log_title(" Validating Last Checkpoint")
        # load the last rl coach checkpoint from store
        graph_manager.data_store.load_from_store()
        graph_manager.create_graph(task_parameters)
        graph_manager.phase = RunPhase.TEST
        screen.log_title(" Start emulate_act_on_trainer on Last Checkpoint ")
        graph_manager.emulate_act_on_trainer(EnvironmentSteps(1),
                                             transitions=transitions)
        screen.log_title(" Start emulate_act_on_trainer on Last Checkpoint completed!")
    screen.log_title(" Validation completed!")
def validate(s3_bucket, s3_prefix, aws_region):
    """Download a DeepRacer model from S3 and validate it end to end.

    :param s3_bucket: S3 bucket containing the model
    :param s3_prefix: S3 prefix of the model
    :param aws_region: AWS region of the bucket
    :return: None
    """
    screen.set_use_colors(False)
    screen.log_title(" S3 bucket: {} \n S3 prefix: {}".format(s3_bucket, s3_prefix))

    # download model metadata
    model_metadata = ModelMetadata(bucket=s3_bucket,
                                   s3_key=get_s3_key(s3_prefix, MODEL_METADATA_S3_POSTFIX),
                                   region_name=aws_region,
                                   local_path=MODEL_METADATA_LOCAL_PATH)

    # Create model local path (exist_ok so a leftover directory doesn't crash the worker)
    os.makedirs(LOCAL_MODEL_DIR, exist_ok=True)

    try:
        # Handle backward compatibility
        model_metadata_info = model_metadata.get_model_metadata_info()
        observation_list = model_metadata_info[ModelMetadataKeys.SENSOR.value]
        version = model_metadata_info[ModelMetadataKeys.VERSION.value]
    except Exception as ex:
        log_and_exit("Failed to parse model_metadata file: {}".format(ex),
                     SIMAPP_VALIDATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_400)

    # Below get_transition_data function must called before create_training_agent function
    # to avoid 500 in case unsupported Sensor is received.
    # create_training_agent will exit with 500 if unsupported sensor is received,
    # and get_transition_data function below will exit with 400 if unsupported sensor is received.
    # We want to return 400 in model validation case if unsupported sensor is received.
    # Thus, call this get_transition_data function before create_traning_agent function!
    transitions = get_transition_data(observation_list)

    # BUGFIX: use the aws_region parameter instead of the module-level `args`
    # namespace, so the function behaves correctly regardless of CLI parsing
    checkpoint = Checkpoint(bucket=s3_bucket,
                            s3_prefix=s3_prefix,
                            region_name=aws_region,
                            agent_name='agent',
                            checkpoint_dir=LOCAL_MODEL_DIR)
    # make coach checkpoint compatible
    if version < SIMAPP_VERSION_2 and not checkpoint.rl_coach_checkpoint.is_compatible():
        checkpoint.rl_coach_checkpoint.make_compatible(checkpoint.syncfile_ready)
    # add checkpoint into checkpoint_dict
    checkpoint_dict = {'agent': checkpoint}

    agent_config = {
        'model_metadata': model_metadata,
        ConfigParams.CAR_CTRL_CONFIG.value: {
            ConfigParams.LINK_NAME_LIST.value: [],
            ConfigParams.VELOCITY_LIST.value: {},
            ConfigParams.STEERING_LIST.value: {},
            ConfigParams.CHANGE_START.value: None,
            ConfigParams.ALT_DIR.value: None,
            ConfigParams.MODEL_METADATA.value: model_metadata,
            ConfigParams.REWARD.value: None,
            ConfigParams.AGENT_NAME.value: 'racecar'
        }
    }

    agent_list = list()
    agent_list.append(create_training_agent(agent_config))

    sm_hyperparams_dict = {}
    graph_manager, _ = get_graph_manager(hp_dict=sm_hyperparams_dict,
                                         agent_list=agent_list,
                                         run_phase_subject=None)

    ds_params_instance = S3BotoDataStoreParameters(checkpoint_dict=checkpoint_dict)

    graph_manager.data_store = S3BotoDataStore(ds_params_instance,
                                               graph_manager,
                                               ignore_lock=True)

    task_parameters = TaskParameters()
    task_parameters.checkpoint_restore_path = LOCAL_MODEL_DIR
    _validate(graph_manager=graph_manager,
              task_parameters=task_parameters,
              transitions=transitions,
              s3_bucket=s3_bucket,
              s3_prefix=s3_prefix,
              aws_region=aws_region)
def __init__(self, agent_parameters: AgentParameters, parent: Union['LevelManager', 'CompositeAgent'] = None):
    """
    Construct the agent: wire up memory (possibly shared across distributed
    workers), device placement, input/output filters, logging signals, and
    all per-episode bookkeeping counters.

    :param agent_parameters: A Preset class instance with all the running parameters
    :param parent: the hierarchy node that owns this agent, if any
    """
    super().__init__()
    self.ap = agent_parameters
    self.task_id = self.ap.task_parameters.task_index
    # task 0 is the chief worker; it owns shared-resource creation below
    self.is_chief = self.task_id == 0
    # shared memory is only meaningful in distributed runs
    self.shared_memory = type(agent_parameters.task_parameters) == DistributedTaskParameters \
        and self.ap.memory.shared_memory
    if self.shared_memory:
        self.shared_memory_scratchpad = self.ap.task_parameters.shared_memory_scratchpad
    self.name = agent_parameters.name
    self.parent = parent
    self.parent_level_manager = None
    self.full_name_id = agent_parameters.full_name_id = self.name

    if type(agent_parameters.task_parameters) == DistributedTaskParameters:
        screen.log_title(
            "Creating agent - name: {} task id: {} (may take up to 30 seconds due to "
            "tensorflow wake up time)".format(self.full_name_id, self.task_id))
    else:
        screen.log_title("Creating agent - name: {}".format(
            self.full_name_id))
    self.imitation = False
    self.agent_logger = Logger()
    self.agent_episode_logger = EpisodeLogger()

    # get the memory
    # - distributed training + shared memory:
    #   * is chief? -> create the memory and add it to the scratchpad
    #   * not chief? -> wait for the chief to create the memory and then fetch it
    # - non distributed training / not shared memory:
    #   * create memory
    memory_name = self.ap.memory.path.split(':')[1]
    self.memory_lookup_name = self.full_name_id + '.' + memory_name
    if self.shared_memory and not self.is_chief:
        # non-chief workers fetch the memory the chief published
        self.memory = self.shared_memory_scratchpad.get(
            self.memory_lookup_name)
    else:
        # modules
        if agent_parameters.memory.load_memory_from_file_path:
            # e.g. a pre-recorded replay buffer for imitation learning
            screen.log_title(
                "Loading replay buffer from pickle. Pickle path: {}".
                format(agent_parameters.memory.load_memory_from_file_path))
            self.memory = read_pickle(
                agent_parameters.memory.load_memory_from_file_path)
        else:
            self.memory = dynamic_import_and_instantiate_module_from_params(
                self.ap.memory)

        if self.shared_memory and self.is_chief:
            # publish the freshly created memory for the other workers
            self.shared_memory_scratchpad.add(self.memory_lookup_name,
                                              self.memory)

    # set devices
    if type(agent_parameters.task_parameters) == DistributedTaskParameters:
        self.has_global = True
        self.replicated_device = agent_parameters.task_parameters.device
        self.worker_device = "/job:worker/task:{}".format(self.task_id)
    else:
        self.has_global = False
        self.replicated_device = None
        self.worker_device = ""
    if agent_parameters.task_parameters.use_cpu:
        self.worker_device += "/cpu:0"
    else:
        self.worker_device += "/device:GPU:0"

    # filters
    self.input_filter = self.ap.input_filter
    self.output_filter = self.ap.output_filter
    self.pre_network_filter = self.ap.pre_network_filter
    # filters run on the replicated (parameter-server) device when one exists
    device = self.replicated_device if self.replicated_device else self.worker_device
    self.input_filter.set_device(device)
    self.output_filter.set_device(device)
    self.pre_network_filter.set_device(device)

    # initialize all internal variables
    self._phase = RunPhase.HEATUP
    self.total_shaped_reward_in_current_episode = 0
    self.total_reward_in_current_episode = 0
    self.total_steps_counter = 0
    self.running_reward = None
    self.training_iteration = 0
    self.last_target_network_update_step = 0
    self.last_training_phase_step = 0
    self.current_episode = self.ap.current_episode = 0
    self.curr_state = {}
    self.current_hrl_goal = None
    self.current_episode_steps_counter = 0
    self.episode_running_info = {}
    self.last_episode_evaluation_ran = 0
    self.running_observations = []
    self.agent_logger.set_current_time(self.current_episode)
    self.exploration_policy = None
    self.networks = {}
    self.last_action_info = None
    self.running_observation_stats = None
    self.running_reward_stats = None
    self.accumulated_rewards_across_evaluation_episodes = 0
    self.accumulated_shaped_rewards_across_evaluation_episodes = 0
    self.num_successes_across_evaluation_episodes = 0
    self.num_evaluation_episodes_completed = 0
    self.current_episode_buffer = Episode(
        discount=self.ap.algorithm.discount)
    # TODO: add agents observation rendering for debugging purposes (not the same as the environment rendering)

    # environment parameters
    self.spaces = None
    self.in_action_space = self.ap.algorithm.in_action_space

    # signals
    self.episode_signals = []
    self.step_signals = []
    self.loss = self.register_signal('Loss')
    self.curr_learning_rate = self.register_signal('Learning Rate')
    self.unclipped_grads = self.register_signal('Grads (unclipped)')
    self.reward = self.register_signal('Reward',
                                       dump_one_value_per_episode=False,
                                       dump_one_value_per_step=True)
    self.shaped_reward = self.register_signal(
        'Shaped Reward',
        dump_one_value_per_episode=False,
        dump_one_value_per_step=True)
    if isinstance(self.in_action_space, GoalsSpace):
        # hierarchical RL: track how far the agent is from its goal
        self.distance_from_goal = self.register_signal(
            'Distance From Goal', dump_one_value_per_step=True)

    # use seed
    if self.ap.task_parameters.seed is not None:
        random.seed(self.ap.task_parameters.seed)
        np.random.seed(self.ap.task_parameters.seed)
def _validate(graph_manager, task_parameters, transitions, s3_bucket, s3_prefix, aws_region): checkpoint_dir = task_parameters.checkpoint_restore_path wait_for_checkpoint(checkpoint_dir, graph_manager.data_store) if utils.do_model_selection(s3_bucket=s3_bucket, s3_prefix=s3_prefix, region=aws_region, checkpoint_type=LAST_CHECKPOINT): screen.log_title(" Validating Last Checkpoint: {}".format( utils.get_last_checkpoint(s3_bucket, s3_prefix, aws_region))) graph_manager.create_graph(task_parameters) graph_manager.phase = RunPhase.TEST screen.log_title(" Start emulate_act_on_trainer on Last Checkpoint") graph_manager.emulate_act_on_trainer(EnvironmentSteps(1), transitions=transitions) screen.log_title( " emulate_act_on_trainer on Last Checkpoint completed!") # Best checkpoint might not exist. if utils.do_model_selection(s3_bucket=s3_bucket, s3_prefix=s3_prefix, region=aws_region, checkpoint_type=BEST_CHECKPOINT): screen.log_title(" Validating Best Checkpoint: {}".format( utils.get_best_checkpoint(s3_bucket, s3_prefix, aws_region))) graph_manager.data_store.load_from_store() graph_manager.restore_checkpoint() screen.log_title( " Start emulate_act_on_trainer on Best Checkpoint") graph_manager.emulate_act_on_trainer(EnvironmentSteps(1), transitions=transitions) screen.log_title( " emulate_act_on_trainer on Best Checkpoint completed!") else: screen.log_title(" No Best Checkpoint to validate.") else: screen.log_title(" Validating Last Checkpoint") graph_manager.create_graph(task_parameters) graph_manager.phase = RunPhase.TEST screen.log_title(" Start emulate_act_on_trainer on Last Checkpoint ") graph_manager.emulate_act_on_trainer(EnvironmentSteps(1), transitions=transitions) screen.log_title( " Start emulate_act_on_trainer on Last Checkpoint completed!") screen.log_title(" Validation completed!")
def display_all_presets_and_exit(self): # list available presets screen.log_title("Available Presets:") for preset in sorted(list_all_presets()): print(preset) sys.exit(0)
def handle_distributed_coach_orchestrator(args):
    """
    Deploy a distributed Coach run on Kubernetes: one trainer and N rollout
    workers, sharing a memory backend and a data store for checkpoints.

    Builds the trainer/rollout command lines from this process's own argv,
    deploys both via the Kubernetes orchestrator, streams trainer logs until
    completion or Ctrl-C, then tears everything down.

    :param args: parsed command-line arguments (image, num_workers,
                 memory_backend, data_store, s3_* / nfs settings, etc.)
    :return: exit code — the trainer's exit code on success, 1 on any
             setup/deploy failure
    """
    from rl_coach.orchestrators.kubernetes_orchestrator import KubernetesParameters, Kubernetes, \
        RunTypeParameters

    # checkpoints are written to this fixed path inside the containers
    ckpt_inside_container = "/checkpoint"
    arg_list = sys.argv[1:]
    # strip the --distributed_coach_run_type flag and its value from argv so
    # it can be re-set per role below (two pops at the same index: flag, value)
    try:
        i = arg_list.index('--distributed_coach_run_type')
        arg_list.pop(i)
        arg_list.pop(i)
    except ValueError:
        pass

    trainer_command = [
        'python3', 'rl_coach/coach.py', '--distributed_coach_run_type',
        str(RunType.TRAINER)
    ] + arg_list
    rollout_command = [
        'python3', 'rl_coach/coach.py', '--distributed_coach_run_type',
        str(RunType.ROLLOUT_WORKER)
    ] + arg_list

    # both roles must share one experiment name so their artifacts line up
    if '--experiment_name' not in rollout_command:
        rollout_command = rollout_command + [
            '--experiment_name', args.experiment_name
        ]

    if '--experiment_name' not in trainer_command:
        trainer_command = trainer_command + [
            '--experiment_name', args.experiment_name
        ]

    memory_backend_params = None
    if args.memory_backend == "redispubsub":
        memory_backend_params = RedisPubSubMemoryBackendParameters()

    ds_params_instance = None
    if args.data_store == "s3":
        ds_params = DataStoreParameters("s3", "", "")
        ds_params_instance = S3DataStoreParameters(
            ds_params=ds_params,
            end_point=args.s3_end_point,
            bucket_name=args.s3_bucket_name,
            creds_file=args.s3_creds_file,
            checkpoint_dir=ckpt_inside_container,
            expt_dir=args.experiment_path)
    elif args.data_store == "nfs":
        ds_params = DataStoreParameters("nfs", "kubernetes", "")
        ds_params_instance = NFSDataStoreParameters(ds_params)

    worker_run_type_params = RunTypeParameters(args.image,
                                               rollout_command,
                                               run_type=str(
                                                   RunType.ROLLOUT_WORKER),
                                               num_replicas=args.num_workers)
    trainer_run_type_params = RunTypeParameters(args.image,
                                                trainer_command,
                                                run_type=str(RunType.TRAINER))

    orchestration_params = KubernetesParameters(
        [worker_run_type_params, trainer_run_type_params],
        kubeconfig='~/.kube/config',
        memory_backend_parameters=memory_backend_params,
        data_store_params=ds_params_instance)
    orchestrator = Kubernetes(orchestration_params)
    if not orchestrator.setup():
        print("Could not setup.")
        return 1

    # deploy the trainer first, then the rollout workers; bail out with
    # exit code 1 on any failure
    if orchestrator.deploy_trainer():
        print("Successfully deployed trainer.")
    else:
        print("Could not deploy trainer.")
        return 1

    if orchestrator.deploy_worker():
        print("Successfully deployed rollout worker(s).")
    else:
        print("Could not deploy rollout worker(s).")
        return 1

    if args.dump_worker_logs:
        screen.log_title("Dumping rollout worker logs in: {}".format(
            args.experiment_path))
        orchestrator.worker_logs(path=args.experiment_path)

    # block on trainer logs until the trainer finishes or the user interrupts;
    # on Ctrl-C we still undeploy, returning the default exit code of 1
    exit_code = 1
    try:
        exit_code = orchestrator.trainer_logs()
    except KeyboardInterrupt:
        pass

    orchestrator.undeploy()

    return exit_code
def perform_trace_based_tests(args, preset_name, num_env_steps, level=None):
    """
    Run a preset for a fixed number of environment steps and compare its CSV
    output against a stored reference trace.

    If no reference trace exists, the run's CSV becomes the new trace.
    Otherwise the new CSV is diffed against the stored one; with
    ``args.overwrite`` a failing trace is replaced instead of reported.

    :param args: parsed CLI args (uses ``verbose`` and ``overwrite``)
    :param preset_name: name of the preset to run
    :param num_env_steps: number of environment steps to improve for
    :param level: optional environment level to pass via ``-lvl``
    :return: True if the trace matched (or a new trace was created)
    """
    test_name = '__test_trace'
    test_path = os.path.join('./experiments', test_name)
    if path.exists(test_path):
        shutil.rmtree(test_path)

    # run the experiment in a separate thread
    screen.log_title("Running test {}{}".format(preset_name,
                                                ' - ' + level if level else ''))
    log_file_name = 'test_log_{preset_name}.txt'.format(
        preset_name=preset_name)
    # fixed seed + single worker (-c) so the run is reproducible; stdout and
    # stderr are captured into the log file for post-mortem on failure
    cmd = ('python3 rl_coach/coach.py '
           '-p {preset_name} '
           '-e {test_name} '
           '--seed 42 '
           '-c '
           '--no_summary '
           '-cp {custom_param} '
           '{level} '
           '&> {log_file_name} ').format(
               preset_name=preset_name,
               test_name=test_name,
               log_file_name=log_file_name,
               level='-lvl ' + level if level else '',
               custom_param='\"improve_steps=EnvironmentSteps({n});'
               'steps_between_evaluation_periods=EnvironmentSteps({n});'
               'evaluation_steps=EnvironmentSteps(1);'
               'heatup_steps=EnvironmentSteps(1024)\"'.format(n=num_env_steps))

    # os.setsid puts the child in its own process group
    p = subprocess.Popen(cmd,
                         shell=True,
                         executable="/bin/bash",
                         preexec_fn=os.setsid)
    p.wait()

    filename_pattern = '*.csv'

    # get the csv with the results
    csv_paths = read_csv_paths(test_path, filename_pattern)

    test_passed = False
    if not csv_paths:
        screen.error("csv file never found", crash=False)
        if args.verbose:
            screen.error("command exitcode: {}".format(p.returncode),
                         crash=False)
            screen.error(open(log_file_name).read(), crash=False)
    else:
        # NOTE(review): the ternary binds to the whole concatenation, i.e.
        # `(preset_name + '_' + level.replace(...)) if level else preset_name`
        # — presumably intended, but worth confirming with parentheses.
        trace_path = os.path.join(
            './rl_coach', 'traces',
            preset_name + '_' +
            level.replace(':', '_') if level else preset_name, '')
        if not os.path.exists(trace_path):
            # first run for this preset: record the cleaned CSV as the trace
            screen.log(
                'No trace found, creating new trace in: {}'.format(trace_path))
            os.makedirs(os.path.dirname(trace_path))
            df = pd.read_csv(csv_paths[0])
            df = clean_df(df)
            df.to_csv(os.path.join(trace_path, 'trace.csv'), index=False)
            screen.success("Successfully created new trace.")
            test_passed = True
        else:
            test_df = pd.read_csv(csv_paths[0])
            test_df = clean_df(test_df)
            new_trace_csv_path = os.path.join(trace_path, 'trace_new.csv')
            # round-trip the new CSV through disk so both frames were parsed
            # the same way before comparison
            test_df.to_csv(new_trace_csv_path, index=False)
            test_df = pd.read_csv(new_trace_csv_path)
            trace_csv_path = glob.glob(path.join(trace_path, 'trace.csv'))
            trace_csv_path = trace_csv_path[0]
            trace_df = pd.read_csv(trace_csv_path)
            test_passed = test_df.equals(trace_df)
            if test_passed:
                screen.success("Passed successfully.")
                os.remove(new_trace_csv_path)
                test_passed = True
            else:
                screen.error("Trace test failed.", crash=False)
                if args.overwrite:
                    os.remove(trace_csv_path)
                    os.rename(new_trace_csv_path, trace_csv_path)
                    screen.error("Overwriting old trace.", crash=False)
                else:
                    # print a ready-to-run diff command for the developer
                    screen.error("bcompare {} {}".format(
                        trace_csv_path, new_trace_csv_path),
                                 crash=False)

    shutil.rmtree(test_path)
    os.remove(log_file_name)
    return test_passed
agent_params.exploration.noise_percentage_schedule = ConstantSchedule(0) agent_params.exploration.evaluation_noise_percentage = 0 # no playing during the training phase agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(0) # use the following command line to download and extract the CARLA dataset: # python rl_coach/utilities/carla_dataset_to_replay_buffer.py agent_params.memory.load_memory_from_file_path = "./datasets/carla_train_set_replay_buffer.p" agent_params.memory.state_key_with_the_class_index = 'high_level_command' agent_params.memory.num_classes = 4 # download dataset if it doesn't exist if not os.path.exists(agent_params.memory.load_memory_from_file_path): screen.log_title( "The CARLA dataset is not present in the following path: {}".format( agent_params.memory.load_memory_from_file_path)) result = screen.ask_yes_no("Do you want to download it now?") if result: create_dataset(None, "./datasets/carla_train_set_replay_buffer.p") else: screen.error( "Please update the path to the CARLA dataset in the CARLA_CIL preset", crash=True) ############### # Environment # ############### env_params = CarlaEnvironmentParameters() env_params.cameras = ['CameraRGB'] env_params.camera_height = 600