def __init__(self):
    super().__init__()
    self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(10000)
    self.num_consecutive_playing_steps = EnvironmentSteps(4)
    self.discount = 0.99
    self.supports_parameter_noise = True
def _validate(graph_manager, task_parameters, transitions,
              s3_bucket, s3_prefix, aws_region):
    checkpoint_dir = task_parameters.checkpoint_restore_path
    wait_for_checkpoint(checkpoint_dir, graph_manager.data_store)

    if utils.do_model_selection(s3_bucket=s3_bucket, s3_prefix=s3_prefix,
                                region=aws_region, checkpoint_type=LAST_CHECKPOINT):
        # note: the last/best checkpoint getters were previously swapped in these two log calls
        logger.info("Test Last Checkpoint: %s",
                    utils.get_last_checkpoint(s3_bucket, s3_prefix, aws_region))
        graph_manager.create_graph(task_parameters)
        graph_manager.phase = RunPhase.TEST
        graph_manager.emulate_act_on_trainer(EnvironmentSteps(1), transitions=transitions)

        logger.info("Test Best Checkpoint: %s",
                    utils.get_best_checkpoint(s3_bucket, s3_prefix, aws_region))
        utils.do_model_selection(s3_bucket=s3_bucket, s3_prefix=s3_prefix,
                                 region=aws_region, checkpoint_type=BEST_CHECKPOINT)
        graph_manager.data_store.load_from_store()
        graph_manager.restore_checkpoint()
        graph_manager.emulate_act_on_trainer(EnvironmentSteps(1), transitions=transitions)
    else:
        logger.info("Test Last Checkpoint")
        graph_manager.create_graph(task_parameters)
        graph_manager.phase = RunPhase.TEST
        graph_manager.emulate_act_on_trainer(EnvironmentSteps(1), transitions=transitions)
def tournament_worker(graph_manager, number_of_trials, task_parameters, s3_writers, is_continuous):
    """ Tournament worker function

    Arguments:
        graph_manager {[MultiAgentGraphManager]} -- [Multi-agent graph manager]
        number_of_trials {[int]} -- [Number of trials to run the evaluation for]
        task_parameters {[TaskParameters]} -- [Checkpoint, gpu/cpu, framework etc. information for rl coach]
        s3_writers {[S3Writer]} -- [Writers used to upload all the simtrace and mp4 files to the S3 bucket]
        is_continuous {bool} -- [The termination condition for the car]
    """
    checkpoint_dirs = list()
    agent_names = list()
    subscribe_to_save_mp4_topic, unsubscribe_from_save_mp4_topic = list(), list()
    subscribe_to_save_mp4, unsubscribe_from_save_mp4 = list(), list()
    for agent_param in graph_manager.agents_params:
        _checkpoint_dir = task_parameters.checkpoint_restore_path if len(graph_manager.agents_params) == 1 \
            else os.path.join(task_parameters.checkpoint_restore_path, agent_param.name)
        agent_names.append(agent_param.name)
        checkpoint_dirs.append(_checkpoint_dir)
        racecar_name = 'racecar' if len(agent_param.name.split("_")) == 1 \
            else "racecar_{}".format(agent_param.name.split("_")[1])
        subscribe_to_save_mp4_topic.append("/{}/save_mp4/subscribe_to_save_mp4".format(racecar_name))
        unsubscribe_from_save_mp4_topic.append("/{}/save_mp4/unsubscribe_from_save_mp4".format(racecar_name))
    wait_for_checkpoints(checkpoint_dirs, graph_manager.data_store)
    modify_checkpoint_variables(checkpoint_dirs, agent_names)

    # Make the clients that will allow us to pause and unpause the physics
    rospy.wait_for_service('/gazebo/pause_physics')
    rospy.wait_for_service('/gazebo/unpause_physics')
    pause_physics = ServiceProxyWrapper('/gazebo/pause_physics', Empty)
    unpause_physics = ServiceProxyWrapper('/gazebo/unpause_physics', Empty)

    for mp4_sub, mp4_unsub in zip(subscribe_to_save_mp4_topic, unsubscribe_from_save_mp4_topic):
        rospy.wait_for_service(mp4_sub)
        rospy.wait_for_service(mp4_unsub)
    for mp4_sub, mp4_unsub in zip(subscribe_to_save_mp4_topic, unsubscribe_from_save_mp4_topic):
        subscribe_to_save_mp4.append(ServiceProxyWrapper(mp4_sub, Empty))
        unsubscribe_from_save_mp4.append(ServiceProxyWrapper(mp4_unsub, Empty))

    graph_manager.create_graph(task_parameters=task_parameters, stop_physics=pause_physics,
                               start_physics=unpause_physics, empty_service_call=EmptyRequest)
    unpause_physics(EmptyRequest())

    graph_manager.reset_internal_state(True)

    is_save_mp4_enabled = rospy.get_param('MP4_S3_BUCKET', None)
    if is_save_mp4_enabled:
        for subscribe_mp4 in subscribe_to_save_mp4:
            subscribe_mp4(EmptyRequest())
    if is_continuous:
        graph_manager.evaluate(EnvironmentSteps(1))
    else:
        for _ in range(number_of_trials):
            graph_manager.evaluate(EnvironmentSteps(1))
    if is_save_mp4_enabled:
        for unsubscribe_mp4 in unsubscribe_from_save_mp4:
            unsubscribe_mp4(EmptyRequest())
    for s3_writer in s3_writers:
        s3_writer.upload_to_s3()
    time.sleep(1)
    pause_physics(EmptyRequest())
def coach_adc(model, dataset, arch, data_loader, validate_fn, save_checkpoint_fn):
    # `graph_manager` and `agent_params` are expected to be module-level globals,
    # e.g. imported from an RL Coach preset.
    task_parameters = TaskParameters(framework_type="tensorflow",
                                     experiment_path="./experiments/test")
    extra_params = {'save_checkpoint_secs': None,
                    'render': True}
    task_parameters.__dict__.update(extra_params)

    # Create a dictionary of parameters that Coach will hand over to CNNEnvironment
    # once it creates it.
    if True:
        exploration_noise = 0.5
        #exploration_noise = 0.25
        exploitation_decay = 0.996
        graph_manager.env_params.additional_simulator_parameters = {
            'model': model,
            'dataset': dataset,
            'arch': arch,
            'data_loader': data_loader,
            'validate_fn': validate_fn,
            'save_checkpoint_fn': save_checkpoint_fn,
            #'action_range': (0.10, 0.95),
            'action_range': (0.70, 0.95),
            'onehot_encoding': False,
            'normalize_obs': True,
            'desired_reduction': None,
            'reward_fn': lambda top1, top5, vloss, total_macs: -1 * (1 - top5 / 100) * math.log(total_macs)
            #'reward_fn': lambda top1, total_macs: -1 * (1-top1/100) * math.log(total_macs)
            #'reward_fn': lambda top1, total_macs: -1 * max(1-top1/100, 0.25) * math.log(total_macs)
            #'reward_fn': lambda top1, total_macs: -1 * (1-top1/100) * math.log(total_macs/100000)
            #'reward_fn': lambda top1, total_macs: top1/100 * total_macs/self.dense_model_macs
        }
    else:
        exploration_noise = 0.5
        #exploration_noise = 0.25
        exploitation_decay = 0.996
        graph_manager.env_params.additional_simulator_parameters = {
            'model': model,
            'dataset': dataset,
            'arch': arch,
            'data_loader': data_loader,
            'validate_fn': validate_fn,
            'save_checkpoint_fn': save_checkpoint_fn,
            'action_range': (0.10, 0.95),
            'onehot_encoding': False,
            'normalize_obs': True,
            'desired_reduction': 1.5e8,
            'reward_fn': lambda top1, total_macs: top1 / 100
            #'reward_fn': lambda top1, total_macs: min(top1/100, 0.75)
        }
    #msglogger.debug('Experiment configuration:\n' + json.dumps(graph_manager.env_params.additional_simulator_parameters, indent=2))

    steps_per_episode = 13
    agent_params.exploration.noise_percentage_schedule = PieceWiseSchedule([
        (ConstantSchedule(exploration_noise), EnvironmentSteps(100 * steps_per_episode)),
        (ExponentialSchedule(exploration_noise, 0, exploitation_decay), EnvironmentSteps(300 * steps_per_episode))])
    graph_manager.create_graph(task_parameters)
    graph_manager.improve()
def __init__(self):
    super().__init__()
    self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(1)
    self.rate_for_copying_weights_to_target = 0.001
    self.num_consecutive_playing_steps = EnvironmentSteps(1)
    self.use_target_network_for_evaluation = False
    self.action_penalty = 0
    self.clip_critic_targets = None  # expected to be a tuple of the form (min_clip_value, max_clip_value) or None
    self.use_non_zero_discount_for_terminal_states = False
def __init__(self):
    super().__init__()
    self.architecture_num_q_heads = 10
    self.bootstrapped_data_sharing_probability = 1.0
    self.epsilon_schedule = PieceWiseSchedule([
        (LinearSchedule(1, 0.1, 1000000), EnvironmentSteps(1000000)),
        (LinearSchedule(0.1, 0.01, 4000000), EnvironmentSteps(4000000))
    ])
    self.lamb = 0.1
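# Illustrative sketch: stepping the two-piece epsilon schedule defined above to see
# how it anneals. PieceWiseSchedule exposes current_value and step() (the same API
# that test_piece_wise_schedule exercises later in this collection); the import
# paths below assume rl_coach's usual module layout.
from rl_coach.core_types import EnvironmentSteps
from rl_coach.schedules import LinearSchedule, PieceWiseSchedule

eps = PieceWiseSchedule([
    (LinearSchedule(1, 0.1, 1000000), EnvironmentSteps(1000000)),
    (LinearSchedule(0.1, 0.01, 4000000), EnvironmentSteps(4000000))
])
for step in range(5000000):
    if step % 1000000 == 0:
        # prints 1.0 at step 0, ~0.1 after the first piece, then a slow decay toward 0.01
        print(step, round(eps.current_value, 4))
    eps.step()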
def _validate(graph_manager, task_parameters, transitions,
              s3_bucket, s3_prefix, aws_region):
    checkpoint = graph_manager.data_store.params.checkpoint_dict['agent']
    checkpoint_dir = task_parameters.checkpoint_restore_path
    graph_manager.data_store.wait_for_checkpoints()

    # validate last checkpoint
    last_model_checkpoint_name = checkpoint.deepracer_checkpoint_json.get_deepracer_last_checkpoint()
    if checkpoint.rl_coach_checkpoint.update(
            model_checkpoint_name=last_model_checkpoint_name,
            s3_kms_extra_args=utils.get_s3_kms_extra_args()):
        screen.log_title(" Validating Last Checkpoint: {}".format(last_model_checkpoint_name))
        # load the last rl coach checkpoint from store
        graph_manager.data_store.load_from_store()
        graph_manager.create_graph(task_parameters)
        graph_manager.phase = RunPhase.TEST
        screen.log_title(" Start emulate_act_on_trainer on Last Checkpoint")
        graph_manager.emulate_act_on_trainer(EnvironmentSteps(1), transitions=transitions)
        screen.log_title(" emulate_act_on_trainer on Last Checkpoint completed!")

        # validate best checkpoint: the best checkpoint might not exist.
        best_model_checkpoint_name = checkpoint.deepracer_checkpoint_json.get_deepracer_best_checkpoint()
        if checkpoint.rl_coach_checkpoint.update(
                model_checkpoint_name=best_model_checkpoint_name,
                s3_kms_extra_args=utils.get_s3_kms_extra_args()):
            screen.log_title(" Validating Best Checkpoint: {}".format(best_model_checkpoint_name))
            # load the best rl coach checkpoint from store
            graph_manager.data_store.load_from_store()
            graph_manager.restore_checkpoint()
            screen.log_title(" Start emulate_act_on_trainer on Best Checkpoint")
            graph_manager.emulate_act_on_trainer(EnvironmentSteps(1), transitions=transitions)
            screen.log_title(" emulate_act_on_trainer on Best Checkpoint completed!")
        else:
            screen.log_title(" No Best Checkpoint to validate.")
    else:
        screen.log_title(" Validating Last Checkpoint")
        # load the last rl coach checkpoint from store
        graph_manager.data_store.load_from_store()
        graph_manager.create_graph(task_parameters)
        graph_manager.phase = RunPhase.TEST
        screen.log_title(" Start emulate_act_on_trainer on Last Checkpoint")
        graph_manager.emulate_act_on_trainer(EnvironmentSteps(1), transitions=transitions)
        screen.log_title(" emulate_act_on_trainer on Last Checkpoint completed!")
    screen.log_title(" Validation completed!")
def start_race(self):
    """
    Start the race (evaluation) for the current racer.
    """
    LOG.info("[virtual event manager] Starting race for racer %s",
             self._current_racer.racerAlias)
    # update the color of the current car model if it does not use an f1 or tron type of shell
    if const.F1 not in self._body_shell_type.lower():
        self._model_updater.update_model_color(self._current_car_model_state.model_name,
                                               self._current_racer.carConfig.carColor)
    # send request
    if self._is_save_mp4_enabled:
        self._subscribe_to_save_mp4(VirtualEventVideoEditSrvRequest(
            display_name=self._current_racer.racerAlias,
            racecar_color=self._current_racer.carConfig.carColor))
    # Update CameraManager by adding cameras into the current namespace. By doing so
    # a single follow car camera will follow the current active racecar.
    self._camera_manager.add(self._main_cameras[VIRTUAL_EVENT],
                             self._current_car_model_state.model_name)
    self._camera_manager.add(self._sub_camera, self._current_car_model_state.model_name)
    configure_environment_randomizer()
    # strip index for park position
    self._park_position_idx = get_racecar_idx(self._current_car_model_state.model_name)
    # set the park position in track and do evaluation
    # Before each evaluation episode (a single lap for a non-continuous race and the complete
    # race for a continuous race), a new copy of park_positions needs to be loaded into
    # track_data because a park position is popped from park_positions whenever a racer
    # car needs to be parked.
    # unpause the physics in the current world
    self._model_updater.unpause_physics()
    LOG.info("[virtual event manager] Unpaused physics in current world.")
    if self._prev_model_name is not None and \
            self._prev_model_name != self._current_car_model_state.model_name:
        # disable the links on the prev car
        # we are doing it here because we don't want the car to float around
        # after the link is disabled
        prev_car_model_state = ModelState()
        prev_car_model_state.model_name = self._prev_model_name
        LOG.info("[virtual event manager] Unpaused model for current car.")
    if self._is_continuous:
        self._track_data.park_positions = [self._park_positions[self._park_position_idx]]
        self._current_graph_manager.evaluate(EnvironmentSteps(1))
    else:
        for _ in range(self._number_of_trials):
            self._track_data.park_positions = [self._park_positions[self._park_position_idx]]
            self._current_graph_manager.evaluate(EnvironmentSteps(1))
def coach_adc(model, dataset, arch, optimizer_data, validate_fn, save_checkpoint_fn, train_fn):
    # task_parameters = TaskParameters(framework_type="tensorflow",
    #                                  experiment_path="./experiments/test")
    # extra_params = {'save_checkpoint_secs': None,
    #                 'render': True}
    # task_parameters.__dict__.update(extra_params)
    task_parameters = TaskParameters(experiment_path=logger.get_experiment_path('adc'))
    conv_cnt = count_conv_layer(model)

    # Create a dictionary of parameters that Coach will hand over to CNNEnvironment
    # once it creates it.
    services = distiller.utils.MutableNamedTuple({
        'validate_fn': validate_fn,
        'save_checkpoint_fn': save_checkpoint_fn,
        'train_fn': train_fn})
    app_args = distiller.utils.MutableNamedTuple({
        'dataset': dataset,
        'arch': arch,
        'optimizer_data': optimizer_data})
    if True:
        amc_cfg = distiller.utils.MutableNamedTuple({
            #'action_range': (0.20, 0.95),
            'action_range': (0.20, 0.80),
            'onehot_encoding': False,
            'normalize_obs': True,
            'desired_reduction': None,
            'reward_fn': lambda top1, top5, vloss, total_macs: -1 * (1 - top1 / 100) * math.log(total_macs),
            'conv_cnt': conv_cnt,
            'max_reward': -1000})
    else:
        amc_cfg = distiller.utils.MutableNamedTuple({
            'action_range': (0.10, 0.95),
            'onehot_encoding': False,
            'normalize_obs': True,
            'desired_reduction': 1.5e8,
            'reward_fn': lambda top1, top5, vloss, total_macs: top1 / 100,
            #'reward_fn': lambda top1, total_macs: min(top1/100, 0.75),
            'conv_cnt': conv_cnt,
            'max_reward': -1000})

    # These parameters are passed to the Distiller environment
    graph_manager.env_params.additional_simulator_parameters = {'model': model,
                                                                'app_args': app_args,
                                                                'amc_cfg': amc_cfg,
                                                                'services': services}
    exploration_noise = 0.5
    exploitation_decay = 0.996
    steps_per_episode = conv_cnt
    agent_params.exploration.noise_percentage_schedule = PieceWiseSchedule([
        (ConstantSchedule(exploration_noise), EnvironmentSteps(100 * steps_per_episode)),
        (ExponentialSchedule(exploration_noise, 0, exploitation_decay), EnvironmentSteps(300 * steps_per_episode))])
    graph_manager.create_graph(task_parameters)
    graph_manager.improve()
def fetch_from_worker(self, num_consecutive_playing_steps=None):
    if hasattr(self, 'memory_backend'):
        with self.phase_context(RunPhase.TRAIN):
            for transition in self.memory_backend.fetch(num_consecutive_playing_steps):
                self.emulate_act_on_trainer(EnvironmentSteps(1), transition)
def __init__(self,
             name: str,
             agents: Union['Agent', CompositeAgent, Dict[str, Union['Agent', CompositeAgent]]],
             environment: Union['LevelManager', Environment],
             real_environment: Environment = None,
             steps_limit: EnvironmentSteps = EnvironmentSteps(1),
             should_reset_agent_state_after_time_limit_passes: bool = False,
             spaces_definition: SpacesDefinition = None):
    """
    A level manager controls a single or multiple composite agents and a single environment.
    The environment can be either a real environment or another level manager behaving as an environment.

    :param name: the level's name
    :param agents: a list of agents or composite agents to control
    :param environment: an environment or level manager to control
    :param real_environment: the real environment that is acted upon. If this is None (which it should be
        for the most bottom level), it will be replaced by the environment parameter. For simple RL schemes,
        where there is only a single level of hierarchy, this removes the requirement of defining both the
        environment and the real environment, as they are the same.
    :param steps_limit: the number of time steps to run when stepping the internal components
    :param should_reset_agent_state_after_time_limit_passes: reset the agent after stepping for steps_limit
    :param spaces_definition: external definition of spaces for when we don't have an environment
        (e.g. batch-rl)
    """
    super().__init__()
    if not isinstance(agents, dict):
        # insert the single composite agent into a dictionary for compatibility
        agents = {agents.name: agents}
    if real_environment is None:
        self._real_environment = real_environment = environment
    self.agents = agents
    self.environment = environment
    self.real_environment = real_environment
    self.steps_limit = steps_limit
    self.should_reset_agent_state_after_time_limit_passes = should_reset_agent_state_after_time_limit_passes
    self.full_name_id = self.name = name
    self._phase = RunPhase.HEATUP
    self.reset_required = False

    # set self as the parent for all the composite agents
    for agent in self.agents.values():
        agent.parent = self
        agent.parent_level_manager = self

    # create all agents in all composite_agents - we do it here so agents will have access to their level manager
    for agent in self.agents.values():
        if isinstance(agent, CompositeAgent):
            agent.create_agents()

    if not isinstance(self.steps_limit, EnvironmentSteps):
        raise ValueError("The num consecutive steps for acting must be defined in terms of environment steps")
    self.build(spaces_definition)

    # there are cases where we don't have an environment. e.g. in batch-rl or in imitation learning.
    self.last_env_response = self.real_environment.last_env_response if self.real_environment else None

    self.parent_graph_manager = None
def evaluation_worker(graph_manager, data_store, number_of_trials, task_parameters):
    checkpoint_dir = task_parameters.checkpoint_restore_path
    wait_for_checkpoint(checkpoint_dir, data_store)

    # Make the clients that will allow us to pause and unpause the physics
    rospy.wait_for_service('/gazebo/pause_physics')
    rospy.wait_for_service('/gazebo/unpause_physics')
    pause_physics = ServiceProxyWrapper('/gazebo/pause_physics', Empty)
    unpause_physics = ServiceProxyWrapper('/gazebo/unpause_physics', Empty)
    graph_manager.create_graph(task_parameters=task_parameters, stop_physics=pause_physics,
                               start_physics=unpause_physics, empty_service_call=EmptyRequest)

    # Instantiate cameras
    configure_camera()

    unpause_physics(EmptyRequest())
    graph_manager.reset_internal_state(True)
    for _ in range(number_of_trials):
        graph_manager.evaluate(EnvironmentSteps(1))

    # Close down the job
    utils.cancel_simulation_job(os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'),
                                rospy.get_param('AWS_REGION'))
def _create_graph(self, task_parameters: TaskParameters) -> Tuple[List[LevelManager], List[Environment]]:
    env = short_dynamic_import(self.env_params.path)(**self.env_params.__dict__,
                                                     visualization_parameters=self.visualization_parameters)

    for agent_params in self.agents_params:
        agent_params.task_parameters = task_parameters

    # we need to build the hierarchy in reverse order (from the bottom up) in order for the spaces of each level
    # to be known
    level_managers = []
    current_env = env
    # out_action_space = env.action_space
    for level_idx, agent_params in reversed(list(enumerate(self.agents_params))):
        agent_params.name = "agent_{}".format(level_idx)
        agent_params.is_a_highest_level_agent = level_idx == 0
        agent_params.is_a_lowest_level_agent = level_idx == len(self.agents_params) - 1

        agent = short_dynamic_import(agent_params.path)(agent_params)

        level_manager = LevelManager(
            agents=agent,
            environment=current_env,
            real_environment=env,
            steps_limit=EnvironmentSteps(1) if level_idx == 0
                else self.consecutive_steps_to_run_non_top_levels,
            should_reset_agent_state_after_time_limit_passes=level_idx > 0,
            name="level_{}".format(level_idx)
        )
        current_env = level_manager
        level_managers.insert(0, level_manager)

    return level_managers, [env]
def evaluate(params):
    # file params
    experiment_path = os.path.join(params.output_data_dir)
    logger.experiment_path = os.path.join(experiment_path, 'evaluation')
    params.checkpoint_restore_dir = os.path.join(params.input_data_dir, 'checkpoint')
    checkpoint_file = os.path.join(params.checkpoint_restore_dir, 'checkpoint')
    inplace_change(checkpoint_file, "/opt/ml/output/data/checkpoint", ".")
    # Note that due to a tensorflow issue (https://github.com/tensorflow/tensorflow/issues/9146) we need to replace
    # the absolute path for the evaluation-from-a-checkpointed-model to work

    vis_params = VisualizationParameters()
    vis_params.dump_gifs = True

    task_params = TaskParameters(evaluate_only=True, experiment_path=logger.experiment_path)
    task_params.__dict__ = add_items_to_dict(task_params.__dict__, params.__dict__)

    graph_manager = BasicRLGraphManager(
        agent_params=ClippedPPOAgentParameters(),
        env_params=GymVectorEnvironment(level='TSP_env:TSPEasyEnv'),
        schedule_params=ScheduleParameters(),
        vis_params=vis_params)
    graph_manager = graph_manager.create_graph(task_parameters=task_params)
    graph_manager.evaluate(EnvironmentSteps(5))
def __init__(self):
    super().__init__()
    self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(1)
    self.rate_for_copying_weights_to_target = 0.005
    # evaluate agent using the deterministic policy (i.e. take the mean value)
    self.use_deterministic_for_evaluation = True
def __init__(self):
    # Architecture parameters
    self.use_accumulated_reward_as_measurement = False

    # Agent parameters
    self.num_consecutive_playing_steps = EnvironmentSteps(1)
    self.num_consecutive_training_steps = 1  # TODO: update this to TrainingSteps

    self.heatup_using_network_decisions = False
    self.discount = 0.99
    self.apply_gradients_every_x_episodes = 5
    self.num_steps_between_copying_online_weights_to_target = TrainingSteps(0)
    self.rate_for_copying_weights_to_target = 1.0
    self.load_memory_from_file_path = None
    self.store_transitions_only_when_episodes_are_terminated = False

    # HRL / HER related params
    self.in_action_space = None

    # distributed agents params
    self.share_statistics_between_workers = True

    # intrinsic reward
    self.scale_external_reward_by_intrinsic_reward_value = False

    # n-step returns
    self.n_step = -1  # calculate the total return (no bootstrap, by default)

    # Distributed Coach params
    self.distributed_coach_synchronization_type = None

    # Should the workers wait for a full episode
    self.act_for_full_episodes = False
def __init__(self):
    # Architecture parameters
    self.use_accumulated_reward_as_measurement = False

    # Agent parameters
    self.num_consecutive_playing_steps = EnvironmentSteps(1)
    self.num_consecutive_training_steps = 1  # TODO: update this to TrainingSteps

    self.heatup_using_network_decisions = False
    self.discount = 0.99
    self.apply_gradients_every_x_episodes = 5
    self.num_steps_between_copying_online_weights_to_target = TrainingSteps(0)
    self.rate_for_copying_weights_to_target = 1.0
    self.load_memory_from_file_path = None
    self.collect_new_data = True
    self.store_transitions_only_when_episodes_are_terminated = False

    # HRL / HER related params
    self.in_action_space = None

    # distributed agents params
    self.share_statistics_between_workers = True

    # intrinsic reward
    self.scale_external_reward_by_intrinsic_reward_value = False
def fetch_from_worker(self, num_consecutive_playing_steps=None):
    if hasattr(self, 'memory_backend'):
        for transitions in self.memory_backend.fetch(num_consecutive_playing_steps):
            self.emulate_act_on_trainer(EnvironmentSteps(1), transitions)
            if hasattr(self, 'sample_collector'):
                self.sample_collector.sample(transitions)
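# Usage sketch (hedged): a distributed trainer would typically alternate between
# pulling rollout data via fetch_from_worker and running training steps. The loop
# below is illustrative only; train() and the phase handling follow rl_coach's
# GraphManager API as we understand it, and `trainer_loop_sketch` is a hypothetical
# helper, not part of the original code.
def trainer_loop_sketch(graph_manager):
    while True:
        # pull the latest transitions published by the rollout workers
        graph_manager.fetch_from_worker(
            graph_manager.agent_params.algorithm.num_consecutive_playing_steps)
        # train on whatever was just replayed into the memory
        graph_manager.phase = RunPhase.TRAIN
        graph_manager.train()
        graph_manager.phase = RunPhase.UNDEFINED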
def start_single_threaded(self, task_parameters, graph_manager, args):
    """Override to use custom evaluate_steps, instead of infinite steps. Just evaluate."""
    graph_manager.agent_params.visualization.dump_csv = False  # issues with CSV export in evaluation only
    graph_manager.create_graph(task_parameters)
    graph_manager.evaluate(EnvironmentSteps(args.evaluate_steps))
    graph_manager.close()
def __init__(self):
    super().__init__()
    self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(10000)
    self.apply_gradients_every_x_episodes = 1
    self.num_steps_between_gradient_updates = 5  # this is called t_max in all the papers
    self.targets_horizon = 'N-Step'
def rollout_worker(graph_manager, data_store, num_workers, task_parameters):
    """
    Wait for the first checkpoint, then perform rollouts using the model
    """
    if not data_store:
        raise AttributeError("None type for data_store object")

    checkpoint_dir = task_parameters.checkpoint_restore_path
    wait_for_checkpoint(checkpoint_dir, data_store)
    wait_for_trainer_ready(checkpoint_dir, data_store)

    # Make the clients that will allow us to pause and unpause the physics
    rospy.wait_for_service('/gazebo/pause_physics')
    rospy.wait_for_service('/gazebo/unpause_physics')
    pause_physics = ServiceProxyWrapper('/gazebo/pause_physics', Empty)
    unpause_physics = ServiceProxyWrapper('/gazebo/unpause_physics', Empty)
    graph_manager.create_graph(task_parameters=task_parameters, stop_physics=pause_physics,
                               start_physics=unpause_physics, empty_service_call=EmptyRequest)

    with graph_manager.phase_context(RunPhase.TRAIN):
        chkpt_state_reader = CheckpointStateReader(checkpoint_dir, checkpoint_state_optional=False)
        last_checkpoint = chkpt_state_reader.get_latest().num

        for level in graph_manager.level_managers:
            for agent in level.agents.values():
                agent.memory.memory_backend.set_current_checkpoint(last_checkpoint)

        # this worker should play a fraction of the total playing steps per rollout
        act_steps = 1
        while True:
            exit_if_trainer_done(checkpoint_dir)
            unpause_physics(EmptyRequest())
            graph_manager.reset_internal_state(True)
            graph_manager.act(EnvironmentSteps(num_steps=act_steps),
                              wait_for_full_episodes=graph_manager.agent_params.algorithm.act_for_full_episodes)
            graph_manager.reset_internal_state(True)
            time.sleep(1)
            pause_physics(EmptyRequest())

            new_checkpoint = data_store.get_latest_checkpoint()
            if new_checkpoint and new_checkpoint > last_checkpoint:
                if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type == DistributedCoachSynchronizationType.SYNC:
                    exit_if_trainer_done(checkpoint_dir)
                    data_store.load_from_store(expected_checkpoint_number=last_checkpoint + 1)
                    graph_manager.restore_checkpoint()

                if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type == DistributedCoachSynchronizationType.ASYNC:
                    graph_manager.restore_checkpoint()

                last_checkpoint = new_checkpoint
                for level in graph_manager.level_managers:
                    for agent in level.agents.values():
                        agent.memory.memory_backend.set_current_checkpoint(last_checkpoint)
def __init__(self):
    super().__init__()
    self.v_min = -10.0
    self.v_max = 10.0
    self.atoms = 51
    self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(32000 // 4)  # 32k frames
    self.n_steps = 3
def _validate(graph_manager, task_parameters, transitions,
              s3_bucket, s3_prefix, aws_region):
    checkpoint_dir = task_parameters.checkpoint_restore_path
    wait_for_checkpoint(checkpoint_dir, graph_manager.data_store)

    if utils.do_model_selection(s3_bucket=s3_bucket, s3_prefix=s3_prefix,
                                region=aws_region, checkpoint_type=LAST_CHECKPOINT):
        screen.log_title(" Validating Last Checkpoint: {}".format(
            utils.get_last_checkpoint(s3_bucket, s3_prefix, aws_region)))
        graph_manager.create_graph(task_parameters)
        graph_manager.phase = RunPhase.TEST
        screen.log_title(" Start emulate_act_on_trainer on Last Checkpoint")
        graph_manager.emulate_act_on_trainer(EnvironmentSteps(1), transitions=transitions)
        screen.log_title(" emulate_act_on_trainer on Last Checkpoint completed!")

        # Best checkpoint might not exist.
        if utils.do_model_selection(s3_bucket=s3_bucket, s3_prefix=s3_prefix,
                                    region=aws_region, checkpoint_type=BEST_CHECKPOINT):
            screen.log_title(" Validating Best Checkpoint: {}".format(
                utils.get_best_checkpoint(s3_bucket, s3_prefix, aws_region)))
            graph_manager.data_store.load_from_store()
            graph_manager.restore_checkpoint()
            screen.log_title(" Start emulate_act_on_trainer on Best Checkpoint")
            graph_manager.emulate_act_on_trainer(EnvironmentSteps(1), transitions=transitions)
            screen.log_title(" emulate_act_on_trainer on Best Checkpoint completed!")
        else:
            screen.log_title(" No Best Checkpoint to validate.")
    else:
        screen.log_title(" Validating Last Checkpoint")
        graph_manager.create_graph(task_parameters)
        graph_manager.phase = RunPhase.TEST
        screen.log_title(" Start emulate_act_on_trainer on Last Checkpoint")
        graph_manager.emulate_act_on_trainer(EnvironmentSteps(1), transitions=transitions)
        screen.log_title(" emulate_act_on_trainer on Last Checkpoint completed!")
    screen.log_title(" Validation completed!")
def __init__(self,
             improve_steps=TrainingSteps(10000000000),
             steps_between_evaluation_periods=EnvironmentEpisodes(50),
             evaluation_steps=EnvironmentEpisodes(5)):
    super().__init__()
    self.heatup_steps = EnvironmentSteps(0)
    self.evaluation_steps = evaluation_steps
    self.steps_between_evaluation_periods = steps_between_evaluation_periods
    self.improve_steps = improve_steps
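# Usage sketch (assumption: this is a ScheduleParameters subclass along the lines of
# rl_coach's SimpleSchedule; the class name below is assumed, not confirmed by the
# snippet). The defaults above give a very long run; for a quick smoke test the step
# counters can be overridden at construction time.
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes

smoke_test_schedule = SimpleSchedule(
    improve_steps=TrainingSteps(1000),                        # stop after 1k training steps
    steps_between_evaluation_periods=EnvironmentEpisodes(5),  # evaluate every 5 episodes
    evaluation_steps=EnvironmentEpisodes(1))                  # one episode per evaluation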
def __init__(self):
    super().__init__()
    self.num_predicted_steps_ahead = 6
    self.goal_vector = [1.0, 1.0]
    self.future_measurements_weights = [0.5, 0.5, 1.0]
    self.use_accumulated_reward_as_measurement = False
    self.handling_targets_after_episode_end = HandlingTargetsAfterEpisodeEnd.NAN
    self.scale_measurements_targets = {}
    self.num_consecutive_playing_steps = EnvironmentSteps(8)
def test_piece_wise_schedule():
    # increasing, then constant, then exponentially decaying schedule
    schedule = PieceWiseSchedule(
        [(LinearSchedule(1, 3, 10), EnvironmentSteps(5)),
         (ConstantSchedule(4), EnvironmentSteps(10)),
         (ExponentialSchedule(3, 1, 0.99), EnvironmentSteps(10))]
    )
    target_values = np.append(np.linspace(1, 2, 6), np.ones(11) * 4)
    for i in range(16):
        assert round(schedule.current_value, 4) == round(target_values[i], 4)
        schedule.step()

    current_power = 1
    for i in range(10):
        assert round(schedule.current_value, 4) == round(3 * current_power, 4)
        current_power *= 0.99
        schedule.step()
def start_graph(graph_manager: 'GraphManager', task_parameters: 'TaskParameters'):
    graph_manager.create_graph(task_parameters)

    # let the adventure begin
    if task_parameters.evaluate_only:
        graph_manager.evaluate(EnvironmentSteps(sys.maxsize), keep_networks_in_sync=True)
    else:
        graph_manager.improve()
def get_sac_params(agent_params, agent, params, run_type=str(RunType.ROLLOUT_WORKER)):
    for net_key in ["policy", "v", "q"]:
        agent_params.network_wrappers[net_key].learning_rate = params[HyperParameterKeys.LEARNING_RATE.value]
        agent_params.network_wrappers[net_key].input_embedders_parameters = create_input_embedder(
            agent.network_settings["input_embedders"],
            agent.network_settings["embedder_type"],
            agent.network_settings["activation_function"])
        # DH: use empty middleware_embedder for q net
        if net_key != "q":
            agent_params.network_wrappers[net_key].middleware_parameters = create_middle_embedder(
                agent.network_settings["middleware_embedders"],
                agent.network_settings["embedder_type"],
                agent.network_settings["activation_function"])

    for net_key in ["policy", "q", "v"]:
        agent_params.network_wrappers[net_key].batch_size = params[HyperParameterKeys.BATCH_SIZE.value]
        agent_params.network_wrappers[net_key].optimizer_epsilon = 1e-5
        agent_params.network_wrappers[net_key].adam_optimizer_beta2 = 0.999
        if params[HyperParameterKeys.LOSS_TYPE.value] == LossTypes.HUBER.value:
            agent_params.network_wrappers[net_key].replace_mse_with_huber_loss = True

    agent_params.network_wrappers["policy"].heads_parameters[0].sac_alpha = \
        params[HyperParameterKeys.SAC_ALPHA.value]
    # Rescale action values in the policy head
    agent_params.network_wrappers["policy"].heads_parameters[0].rescale_action_values = True

    agent_params.algorithm.discount = params[HyperParameterKeys.DISCOUNT_FACTOR.value]
    # DH: should set num_steps_between_copying_online_weights_to_target as EnvironmentSteps
    # instead of EnvironmentEpisodes; see should_copy_online_weight... in agent.py
    agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(
        params[HyperParameterKeys.NUM_EPISODES_BETWEEN_TRAINING.value])
    agent_params.algorithm.distributed_coach_synchronization_type = (
        DistributedCoachSynchronizationType.SYNC)
    # tau=1
    agent_params.algorithm.rate_for_copying_weights_to_target = 1
    agent_params.algorithm.use_deterministic_for_evaluation = True

    # DH: ----to address the training worker fetch issue--------------------------
    if run_type == str(RunType.TRAINER):
        agent_params.memory = ExperienceReplayParameters()
    elif run_type == str(RunType.ROLLOUT_WORKER):
        agent_params.memory = DeepRacerMemoryParameters()  # EpisodicExperienceReplayParameters()

    return agent_params
def __init__(self):
    super().__init__()
    self.dnd_size = 500000
    self.l2_norm_added_delta = 0.001
    self.new_value_shift_coefficient = 0.1
    self.number_of_knn = 50
    self.DND_key_error_threshold = 0
    self.num_consecutive_playing_steps = EnvironmentSteps(4)
    self.propagate_updates_to_DND = False
    self.n_step = 100
    self.bootstrap_total_return_from_old_policy = True
def start_graph(graph_manager: 'GraphManager', task_parameters: 'TaskParameters'):
    """
    Runs the graph_manager using the configured task_parameters.
    This stand-alone method is a convenience for multiprocessing.
    """
    graph_manager.create_graph(task_parameters)

    # let the adventure begin
    if task_parameters.evaluate_only:
        graph_manager.evaluate(EnvironmentSteps(sys.maxsize), keep_networks_in_sync=True)
    else:
        graph_manager.improve()
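# Usage sketch: because start_graph is a plain module-level function, it can be
# handed to multiprocessing.Process as a worker target, which is the convenience
# the docstring refers to. `my_preset` is a hypothetical module defining a
# GraphManager; TaskParameters is assumed to come from rl_coach.base_parameters.
import multiprocessing

from rl_coach.base_parameters import TaskParameters
from my_preset import graph_manager  # hypothetical preset defining the graph

if __name__ == '__main__':
    task_parameters = TaskParameters(experiment_path='./experiments/demo')
    worker = multiprocessing.Process(target=start_graph,
                                     args=(graph_manager, task_parameters))
    worker.start()
    worker.join()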