Example #1
 def __init__(self):
     super().__init__()
     self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(
         10000)
     self.num_consecutive_playing_steps = EnvironmentSteps(4)
     self.discount = 0.99
     self.supports_parameter_noise = True
Example #2
def _validate(graph_manager, task_parameters, transitions, s3_bucket,
              s3_prefix, aws_region):
    checkpoint_dir = task_parameters.checkpoint_restore_path
    wait_for_checkpoint(checkpoint_dir, graph_manager.data_store)

    if utils.do_model_selection(s3_bucket=s3_bucket,
                                s3_prefix=s3_prefix,
                                region=aws_region,
                                checkpoint_type=LAST_CHECKPOINT):
        logger.info(
            "Test Last Checkpoint: %s",
            utils.get_last_checkpoint(s3_bucket, s3_prefix, aws_region))
        graph_manager.create_graph(task_parameters)
        graph_manager.phase = RunPhase.TEST
        graph_manager.emulate_act_on_trainer(EnvironmentSteps(1),
                                             transitions=transitions)
        logger.info(
            "Test Best Checkpoint: %s",
            utils.get_best_checkpoint(s3_bucket, s3_prefix, aws_region))
        utils.do_model_selection(s3_bucket=s3_bucket,
                                 s3_prefix=s3_prefix,
                                 region=aws_region,
                                 checkpoint_type=BEST_CHECKPOINT)
        graph_manager.data_store.load_from_store()
        graph_manager.restore_checkpoint()
        graph_manager.emulate_act_on_trainer(EnvironmentSteps(1),
                                             transitions=transitions)
    else:
        logger.info("Test Last Checkpoint")
        graph_manager.create_graph(task_parameters)
        graph_manager.phase = RunPhase.TEST
        graph_manager.emulate_act_on_trainer(EnvironmentSteps(1),
                                             transitions=transitions)
Example #3
def tournament_worker(graph_manager, number_of_trials, task_parameters, s3_writers, is_continuous):
    """ Tournament worker function

    Arguments:
        graph_manager {MultiAgentGraphManager} -- Graph manager of the multi-agent graph
        number_of_trials {int} -- Number of trials to run the evaluation for
        task_parameters {TaskParameters} -- Checkpoint, gpu/cpu, framework, and other rl_coach task information
        s3_writers {list(S3Writer)} -- Writers that upload the simtrace and mp4 files to the S3 bucket
        is_continuous {bool} -- Whether the race is continuous (otherwise it runs for number_of_trials)
    """
    checkpoint_dirs = list()
    agent_names = list()
    subscribe_to_save_mp4_topic, unsubscribe_from_save_mp4_topic = list(), list()
    subscribe_to_save_mp4, unsubscribe_from_save_mp4 = list(), list()
    for agent_param in graph_manager.agents_params:
        _checkpoint_dir = task_parameters.checkpoint_restore_path if len(graph_manager.agents_params) == 1 \
            else os.path.join(task_parameters.checkpoint_restore_path, agent_param.name)
        agent_names.append(agent_param.name)
        checkpoint_dirs.append(_checkpoint_dir)
        racecar_name = 'racecar' if len(agent_param.name.split("_")) == 1 \
            else "racecar_{}".format(agent_param.name.split("_")[1])
        subscribe_to_save_mp4_topic.append("/{}/save_mp4/subscribe_to_save_mp4".format(racecar_name))
        unsubscribe_from_save_mp4_topic.append("/{}/save_mp4/unsubscribe_from_save_mp4".format(racecar_name))
    wait_for_checkpoints(checkpoint_dirs, graph_manager.data_store)
    modify_checkpoint_variables(checkpoint_dirs, agent_names)

    # Make the clients that will allow us to pause and unpause the physics
    rospy.wait_for_service('/gazebo/pause_physics')
    rospy.wait_for_service('/gazebo/unpause_physics')
    pause_physics = ServiceProxyWrapper('/gazebo/pause_physics', Empty)
    unpause_physics = ServiceProxyWrapper('/gazebo/unpause_physics', Empty)

    for mp4_sub, mp4_unsub in zip(subscribe_to_save_mp4_topic, unsubscribe_from_save_mp4_topic):
        rospy.wait_for_service(mp4_sub)
        rospy.wait_for_service(mp4_unsub)
    for mp4_sub, mp4_unsub in zip(subscribe_to_save_mp4_topic, unsubscribe_from_save_mp4_topic):
        subscribe_to_save_mp4.append(ServiceProxyWrapper(mp4_sub, Empty))
        unsubscribe_from_save_mp4.append(ServiceProxyWrapper(mp4_unsub, Empty))

    graph_manager.create_graph(task_parameters=task_parameters, stop_physics=pause_physics,
                               start_physics=unpause_physics, empty_service_call=EmptyRequest)
    unpause_physics(EmptyRequest())
    graph_manager.reset_internal_state(True)

    is_save_mp4_enabled = rospy.get_param('MP4_S3_BUCKET', None)
    if is_save_mp4_enabled:
        for subscribe_mp4 in subscribe_to_save_mp4:
            subscribe_mp4(EmptyRequest())
    if is_continuous:
        graph_manager.evaluate(EnvironmentSteps(1))
    else:
        for _ in range(number_of_trials):
            graph_manager.evaluate(EnvironmentSteps(1))
    if is_save_mp4_enabled:
        for unsubscribe_mp4 in unsubscribe_from_save_mp4:
            unsubscribe_mp4(EmptyRequest())
    for s3_writer in s3_writers:
        s3_writer.upload_to_s3()
    time.sleep(1)
    pause_physics(EmptyRequest())
Example #4
def coach_adc(model, dataset, arch, data_loader, validate_fn, save_checkpoint_fn):
    task_parameters = TaskParameters(framework_type="tensorflow",
                                     experiment_path="./experiments/test")
    extra_params = {'save_checkpoint_secs': None,
                    'render': True}
    task_parameters.__dict__.update(extra_params)

    # Create a dictionary of parameters that Coach will hand over to CNNEnvironment
    # once it creates it.
    if True:
        exploration_noise = 0.5
        #exploration_noise = 0.25
        exploitation_decay = 0.996
        graph_manager.env_params.additional_simulator_parameters = {
            'model': model,
            'dataset': dataset,
            'arch': arch,
            'data_loader': data_loader,
            'validate_fn': validate_fn,
            'save_checkpoint_fn': save_checkpoint_fn,
            #'action_range': (0.10, 0.95),
            'action_range': (0.70, 0.95),
            'onehot_encoding': False,
            'normalize_obs': True,
            'desired_reduction': None,
            'reward_fn': lambda top1, top5, vloss, total_macs: -1 * (1-top5/100) * math.log(total_macs)
            #'reward_fn': lambda top1, total_macs: -1 * (1-top1/100) * math.log(total_macs)
            #'reward_fn': lambda top1, total_macs: -1 * max(1-top1/100, 0.25) * math.log(total_macs)
            #'reward_fn': lambda top1, total_macs: -1 * (1-top1/100) * math.log(total_macs/100000)
            #'reward_fn': lambda top1, total_macs:  top1/100 * total_macs/self.dense_model_macs
        }
    else:
        exploration_noise = 0.5
        #exploration_noise = 0.25
        exploitation_decay = 0.996
        graph_manager.env_params.additional_simulator_parameters = {
            'model': model,
            'dataset': dataset,
            'arch': arch,
            'data_loader': data_loader,
            'validate_fn': validate_fn,
            'save_checkpoint_fn': save_checkpoint_fn,
            'action_range': (0.10, 0.95),
            'onehot_encoding': False,
            'normalize_obs': True,
            'desired_reduction': 1.5e8,
            'reward_fn': lambda top1, total_macs: top1/100
            #'reward_fn': lambda top1, total_macs: min(top1/100, 0.75)
        }

    #msglogger.debug('Experiment configuration:\n' + json.dumps(graph_manager.env_params.additional_simulator_parameters, indent=2))
    steps_per_episode = 13
    agent_params.exploration.noise_percentage_schedule = PieceWiseSchedule([(ConstantSchedule(exploration_noise),
                                                                             EnvironmentSteps(100*steps_per_episode)),
                                                                            (ExponentialSchedule(exploration_noise, 0, exploitation_decay),
                                                                             EnvironmentSteps(300*steps_per_episode))])
    graph_manager.create_graph(task_parameters)
    graph_manager.improve()
Example #5
 def __init__(self):
     super().__init__()
     self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(1)
     self.rate_for_copying_weights_to_target = 0.001
     self.num_consecutive_playing_steps = EnvironmentSteps(1)
     self.use_target_network_for_evaluation = False
     self.action_penalty = 0
     self.clip_critic_targets = None  # expected to be a tuple of the form (min_clip_value, max_clip_value) or None
     self.use_non_zero_discount_for_terminal_states = False
Example #6
 def __init__(self):
     super().__init__()
     self.architecture_num_q_heads = 10
     self.bootstrapped_data_sharing_probability = 1.0
     self.epsilon_schedule = PieceWiseSchedule([
         (LinearSchedule(1, 0.1, 1000000), EnvironmentSteps(1000000)),
         (LinearSchedule(0.1, 0.01, 4000000), EnvironmentSteps(4000000))
     ])
     self.lamb = 0.1
Example #7
def _validate(graph_manager, task_parameters, transitions, s3_bucket,
              s3_prefix, aws_region):
    checkpoint = graph_manager.data_store.params.checkpoint_dict['agent']
    checkpoint_dir = task_parameters.checkpoint_restore_path
    graph_manager.data_store.wait_for_checkpoints()

    # validate last checkpoint
    last_model_checkpoint_name = checkpoint.deepracer_checkpoint_json.get_deepracer_last_checkpoint(
    )
    if checkpoint.rl_coach_checkpoint.update(
            model_checkpoint_name=last_model_checkpoint_name,
            s3_kms_extra_args=utils.get_s3_kms_extra_args()):
        screen.log_title(" Validating Last Checkpoint: {}".format(
            last_model_checkpoint_name))
        # load the last rl coach checkpoint from store
        graph_manager.data_store.load_from_store()
        graph_manager.create_graph(task_parameters)
        graph_manager.phase = RunPhase.TEST
        screen.log_title(" Start emulate_act_on_trainer on Last Checkpoint")
        graph_manager.emulate_act_on_trainer(EnvironmentSteps(1),
                                             transitions=transitions)
        screen.log_title(
            " emulate_act_on_trainer on Last Checkpoint completed!")
        # validate best checkpoint: Best checkpoint might not exist.
        best_model_checkpoint_name = checkpoint.deepracer_checkpoint_json.get_deepracer_best_checkpoint(
        )
        if checkpoint.rl_coach_checkpoint.update(
                model_checkpoint_name=best_model_checkpoint_name,
                s3_kms_extra_args=utils.get_s3_kms_extra_args()):
            screen.log_title(" Validating Best Checkpoint: {}".format(
                best_model_checkpoint_name))
            # load the best rl coach checkpoint from store
            graph_manager.data_store.load_from_store()
            graph_manager.restore_checkpoint()
            screen.log_title(
                " Start emulate_act_on_trainer on Best Checkpoint")
            graph_manager.emulate_act_on_trainer(EnvironmentSteps(1),
                                                 transitions=transitions)
            screen.log_title(
                " emulate_act_on_trainer on Best Checkpoint completed!")
        else:
            screen.log_title(" No Best Checkpoint to validate.")

    else:
        screen.log_title(" Validating Last Checkpoint")
        # load the last rl coach checkpoint from store
        graph_manager.data_store.load_from_store()
        graph_manager.create_graph(task_parameters)
        graph_manager.phase = RunPhase.TEST
        screen.log_title(" Start emulate_act_on_trainer on Last Checkpoint ")
        graph_manager.emulate_act_on_trainer(EnvironmentSteps(1),
                                             transitions=transitions)
        screen.log_title(
            " Start emulate_act_on_trainer on Last Checkpoint completed!")
    screen.log_title(" Validation completed!")
    def start_race(self):
        """
        Start the race (evaluation) for the current racer.
        """
        LOG.info(
            "[virtual event manager] Starting race for racer %s", self._current_racer.racerAlias
        )
        # update the car color on the current model if it does not use an F1 or Tron type of shell
        if const.F1 not in self._body_shell_type.lower():
            self._model_updater.update_model_color(
                self._current_car_model_state.model_name, self._current_racer.carConfig.carColor
            )
        # send request
        if self._is_save_mp4_enabled:
            self._subscribe_to_save_mp4(
                VirtualEventVideoEditSrvRequest(
                    display_name=self._current_racer.racerAlias,
                    racecar_color=self._current_racer.carConfig.carColor,
                )
            )

        # Update CameraManager by adding cameras into the current namespace. By doing so,
        # a single follow-car camera will follow the currently active racecar.
        self._camera_manager.add(
            self._main_cameras[VIRTUAL_EVENT], self._current_car_model_state.model_name
        )
        self._camera_manager.add(self._sub_camera, self._current_car_model_state.model_name)

        configure_environment_randomizer()
        # strip index for park position
        self._park_position_idx = get_racecar_idx(self._current_car_model_state.model_name)
        # set the park position in track and do evaluation
        # Before each evaluation episode (single lap for non-continuous race and complete race for
        # continuous race), a new copy of park_positions needs to be loaded into track_data because
        # a park position will be popped from park_positions when a racer car needs to be parked.
        # unpause the physics in current world
        self._model_updater.unpause_physics()
        LOG.info("[virtual event manager] Unpaused physics in current world.")
        if (
            self._prev_model_name is not None
            and self._prev_model_name != self._current_car_model_state.model_name
        ):
            # disable the links on the prev car
            # we are doing it here because we don't want the car to float around
            # after the link is disabled
            prev_car_model_state = ModelState()
            prev_car_model_state.model_name = self._prev_model_name
        LOG.info("[virtual event manager] Unpaused model for current car.")
        if self._is_continuous:
            self._track_data.park_positions = [self._park_positions[self._park_position_idx]]
            self._current_graph_manager.evaluate(EnvironmentSteps(1))
        else:
            for _ in range(self._number_of_trials):
                self._track_data.park_positions = [self._park_positions[self._park_position_idx]]
                self._current_graph_manager.evaluate(EnvironmentSteps(1))
Example #9
def coach_adc(model, dataset, arch, optimizer_data, validate_fn, save_checkpoint_fn, train_fn):
    # task_parameters = TaskParameters(framework_type="tensorflow",
    #                                  experiment_path="./experiments/test")
    # extra_params = {'save_checkpoint_secs': None,
    #                 'render': True}
    # task_parameters.__dict__.update(extra_params)
    task_parameters = TaskParameters(experiment_path=logger.get_experiment_path('adc'))
    conv_cnt = count_conv_layer(model)

    # Create a dictionary of parameters that Coach will hand over to CNNEnvironment
    # once it creates it.
    services = distiller.utils.MutableNamedTuple({
                'validate_fn': validate_fn,
                'save_checkpoint_fn': save_checkpoint_fn,
                'train_fn': train_fn})

    app_args = distiller.utils.MutableNamedTuple({
                'dataset': dataset,
                'arch': arch,
                'optimizer_data': optimizer_data})
    if True:
        amc_cfg = distiller.utils.MutableNamedTuple({
                #'action_range': (0.20, 0.95),
                'action_range': (0.20, 0.80),
                'onehot_encoding': False,
                'normalize_obs': True,
                'desired_reduction': None,
                'reward_fn': lambda top1, top5, vloss, total_macs: -1 * (1-top1/100) * math.log(total_macs),
                'conv_cnt': conv_cnt,
                'max_reward': -1000})
    else:
        amc_cfg = distiller.utils.MutableNamedTuple({
                'action_range': (0.10, 0.95),
                'onehot_encoding': False,
                'normalize_obs': True,
                'desired_reduction': 1.5e8,
                'reward_fn': lambda top1, top5, vloss, total_macs: top1/100,
                #'reward_fn': lambda top1, total_macs: min(top1/100, 0.75),
                'conv_cnt': conv_cnt,
                'max_reward': -1000})

    # These parameters are passed to the Distiller environment
    graph_manager.env_params.additional_simulator_parameters = {'model': model,
                                                                'app_args': app_args,
                                                                'amc_cfg': amc_cfg,
                                                                'services': services}
    exploration_noise = 0.5
    exploitation_decay = 0.996
    steps_per_episode = conv_cnt
    agent_params.exploration.noise_percentage_schedule = PieceWiseSchedule([
        (ConstantSchedule(exploration_noise), EnvironmentSteps(100*steps_per_episode)),
        (ExponentialSchedule(exploration_noise, 0, exploitation_decay), EnvironmentSteps(300*steps_per_episode))])
    graph_manager.create_graph(task_parameters)
    graph_manager.improve()
Example #10
 def fetch_from_worker(self, num_consecutive_playing_steps=None):
     if hasattr(self, 'memory_backend'):
         with self.phase_context(RunPhase.TRAIN):
             for transition in self.memory_backend.fetch(
                     num_consecutive_playing_steps):
                 self.emulate_act_on_trainer(EnvironmentSteps(1),
                                             transition)
Example #11
    def __init__(
            self,
            name: str,
            agents: Union['Agent', CompositeAgent,
                          Dict[str, Union['Agent', CompositeAgent]]],
            environment: Union['LevelManager', Environment],
            real_environment: Environment = None,
            steps_limit: EnvironmentSteps = EnvironmentSteps(1),
            should_reset_agent_state_after_time_limit_passes: bool = False,
            spaces_definition: SpacesDefinition = None):
        """
        A level manager controls a single or multiple composite agents and a single environment.
        The environment can be either a real environment or another level manager behaving as an environment.
        :param agents: a list of agents or composite agents to control
        :param environment: an environment or level manager to control
        :param real_environment: the real environment that is acted upon. if this is None (which it should be for
         the bottom-most level), it will be replaced by the environment parameter. For simple RL schemes, where there
         is only a single level of hierarchy, this removes the requirement of defining both the environment and the
         real environment, as they are the same.
        :param steps_limit: the number of time steps to run when stepping the internal components
        :param should_reset_agent_state_after_time_limit_passes: reset the agent after stepping for steps_limit
        :param name: the level's name
        :param spaces_definition: external definition of spaces for when we don't have an environment (e.g. batch-rl)
        """
        super().__init__()

        if not isinstance(agents, dict):
            # insert the single composite agent into a dictionary for compatibility
            agents = {agents.name: agents}
        if real_environment is None:
            self._real_environment = real_environment = environment
        self.agents = agents
        self.environment = environment
        self.real_environment = real_environment
        self.steps_limit = steps_limit
        self.should_reset_agent_state_after_time_limit_passes = should_reset_agent_state_after_time_limit_passes
        self.full_name_id = self.name = name
        self._phase = RunPhase.HEATUP
        self.reset_required = False

        # set self as the parent for all the composite agents
        for agent in self.agents.values():
            agent.parent = self
            agent.parent_level_manager = self

        # create all agents in all composite_agents - we do it here so agents will have access to their level manager
        for agent in self.agents.values():
            if isinstance(agent, CompositeAgent):
                agent.create_agents()

        if not isinstance(self.steps_limit, EnvironmentSteps):
            raise ValueError(
                "The num consecutive steps for acting must be defined in terms of environment steps"
            )
        self.build(spaces_definition)

        # there are cases where we don't have an environment. e.g. in batch-rl or in imitation learning.
        self.last_env_response = self.real_environment.last_env_response if self.real_environment else None

        self.parent_graph_manager = None
Example #12
def evaluation_worker(graph_manager, data_store, number_of_trials,
                      task_parameters):
    checkpoint_dir = task_parameters.checkpoint_restore_path
    wait_for_checkpoint(checkpoint_dir, data_store)
    # Make the clients that will allow us to pause and unpause the physics
    rospy.wait_for_service('/gazebo/pause_physics')
    rospy.wait_for_service('/gazebo/unpause_physics')
    pause_physics = ServiceProxyWrapper('/gazebo/pause_physics', Empty)
    unpause_physics = ServiceProxyWrapper('/gazebo/unpause_physics', Empty)
    graph_manager.create_graph(task_parameters=task_parameters,
                               stop_physics=pause_physics,
                               start_physics=unpause_physics,
                               empty_service_call=EmptyRequest)

    # Instantiate Cameras
    configure_camera()

    unpause_physics(EmptyRequest())
    graph_manager.reset_internal_state(True)
    for _ in range(number_of_trials):
        graph_manager.evaluate(EnvironmentSteps(1))

    # Close down the job
    utils.cancel_simulation_job(
        os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'),
        rospy.get_param('AWS_REGION'))
Example #13
    def _create_graph(self, task_parameters: TaskParameters) -> Tuple[List[LevelManager], List[Environment]]:
        env = short_dynamic_import(self.env_params.path)(**self.env_params.__dict__,
                                                         visualization_parameters=self.visualization_parameters)

        for agent_params in self.agents_params:
            agent_params.task_parameters = task_parameters

        # we need to build the hierarchy in reverse order (from the bottom up) in order for the spaces of each level
        # to be known
        level_managers = []
        current_env = env
        # out_action_space = env.action_space
        for level_idx, agent_params in reversed(list(enumerate(self.agents_params))):
            agent_params.name = "agent_{}".format(level_idx)
            agent_params.is_a_highest_level_agent = level_idx == 0
            agent_params.is_a_lowest_level_agent = level_idx == len(self.agents_params) - 1

            agent = short_dynamic_import(agent_params.path)(agent_params)

            level_manager = LevelManager(
                agents=agent,
                environment=current_env,
                real_environment=env,
                steps_limit=EnvironmentSteps(1) if level_idx == 0
                            else self.consecutive_steps_to_run_non_top_levels,
                should_reset_agent_state_after_time_limit_passes=level_idx > 0,
                name="level_{}".format(level_idx)
            )
            current_env = level_manager
            level_managers.insert(0, level_manager)

        return level_managers, [env]
Example #14
def evaluate(params):
    # file params
    experiment_path = os.path.join(params.output_data_dir)
    logger.experiment_path = os.path.join(experiment_path, 'evaluation')
    params.checkpoint_restore_dir = os.path.join(params.input_data_dir,
                                                 'checkpoint')
    checkpoint_file = os.path.join(params.checkpoint_restore_dir, 'checkpoint')

    inplace_change(checkpoint_file, "/opt/ml/output/data/checkpoint", ".")
    # Note that due to a tensorflow issue (https://github.com/tensorflow/tensorflow/issues/9146) we need to replace
    # the absolute path for the evaluation-from-a-checkpointed-model to work

    vis_params = VisualizationParameters()
    vis_params.dump_gifs = True

    task_params = TaskParameters(evaluate_only=True,
                                 experiment_path=logger.experiment_path)
    task_params.__dict__ = add_items_to_dict(task_params.__dict__,
                                             params.__dict__)

    graph_manager = BasicRLGraphManager(
        agent_params=ClippedPPOAgentParameters(),
        env_params=GymVectorEnvironment(level='TSP_env:TSPEasyEnv'),
        schedule_params=ScheduleParameters(),
        vis_params=vis_params)
    graph_manager = graph_manager.create_graph(task_parameters=task_params)
    graph_manager.evaluate(EnvironmentSteps(5))
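The example above relies on an inplace_change helper that is not defined in the snippet; a minimal sketch of such a text-substitution helper, which rewrites the absolute checkpoint path mentioned in the comment above, might look like this (the helper actually used by the original script may differ):

def inplace_change(filename, old_string, new_string):
    # Minimal sketch: read the file, substitute old_string with new_string, and write it back in place.
    with open(filename) as f:
        contents = f.read()
    if old_string not in contents:
        return
    with open(filename, 'w') as f:
        f.write(contents.replace(old_string, new_string))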
Example #15
 def __init__(self):
     super().__init__()
     self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(
         1)
     self.rate_for_copying_weights_to_target = 0.005
     # evaluate agent using deterministic policy (i.e. take the mean value)
     self.use_deterministic_for_evaluation = True
Example #16
    def __init__(self):
        # Architecture parameters
        self.use_accumulated_reward_as_measurement = False

        # Agent parameters
        self.num_consecutive_playing_steps = EnvironmentSteps(1)
        self.num_consecutive_training_steps = 1  # TODO: update this to TrainingSteps

        self.heatup_using_network_decisions = False
        self.discount = 0.99
        self.apply_gradients_every_x_episodes = 5
        self.num_steps_between_copying_online_weights_to_target = TrainingSteps(
            0)
        self.rate_for_copying_weights_to_target = 1.0
        self.load_memory_from_file_path = None
        self.store_transitions_only_when_episodes_are_terminated = False

        # HRL / HER related params
        self.in_action_space = None

        # distributed agents params
        self.share_statistics_between_workers = True

        # intrinsic reward
        self.scale_external_reward_by_intrinsic_reward_value = False

        # n-step returns
        self.n_step = -1  # calculate the total return (no bootstrap, by default)

        # Distributed Coach params
        self.distributed_coach_synchronization_type = None

        # Should the workers wait for full episode
        self.act_for_full_episodes = False
Example #17
    def __init__(self):
        # Architecture parameters
        self.use_accumulated_reward_as_measurement = False

        # Agent parameters
        self.num_consecutive_playing_steps = EnvironmentSteps(1)
        self.num_consecutive_training_steps = 1  # TODO: update this to TrainingSteps

        self.heatup_using_network_decisions = False
        self.discount = 0.99
        self.apply_gradients_every_x_episodes = 5
        self.num_steps_between_copying_online_weights_to_target = TrainingSteps(
            0)
        self.rate_for_copying_weights_to_target = 1.0
        self.load_memory_from_file_path = None
        self.collect_new_data = True
        self.store_transitions_only_when_episodes_are_terminated = False

        # HRL / HER related params
        self.in_action_space = None

        # distributed agents params
        self.share_statistics_between_workers = True

        # intrinsic reward
        self.scale_external_reward_by_intrinsic_reward_value = False
Example #18
 def fetch_from_worker(self, num_consecutive_playing_steps=None):
     if hasattr(self, 'memory_backend'):
         for transitions in self.memory_backend.fetch(
                 num_consecutive_playing_steps):
             self.emulate_act_on_trainer(EnvironmentSteps(1), transitions)
             if hasattr(self, 'sample_collector'):
                 self.sample_collector.sample(transitions)
Example #19
 def start_single_threaded(self, task_parameters, graph_manager, args):
     """Override to use custom evaluate_steps, instead of infinite steps. Just evaluate.
     """
     graph_manager.agent_params.visualization.dump_csv = False # issues with CSV export in evaluation only
     graph_manager.create_graph(task_parameters)
     graph_manager.evaluate(EnvironmentSteps(args.evaluate_steps))
     graph_manager.close()
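The override above reads args.evaluate_steps; how that flag reaches args is not shown. A possible way to register it on the launcher's argument parser, assuming argparse and a flag name matching the attribute used above:

import argparse

parser = argparse.ArgumentParser()
# hypothetical flag corresponding to the args.evaluate_steps attribute read above
parser.add_argument('--evaluate_steps', type=int, default=1000,
                    help='number of environment steps to run during evaluation')
args = parser.parse_args()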
Example #20
 def __init__(self):
     super().__init__()
     self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(
         10000)
     self.apply_gradients_every_x_episodes = 1
     self.num_steps_between_gradient_updates = 5  # this is called t_max in all the papers
     self.targets_horizon = 'N-Step'
Example #21
def rollout_worker(graph_manager, data_store, num_workers, task_parameters):
    """
    Wait for the first checkpoint, then perform rollouts using the model.
    """
    if not data_store:
        raise AttributeError("None type for data_store object")

    checkpoint_dir = task_parameters.checkpoint_restore_path
    wait_for_checkpoint(checkpoint_dir, data_store)
    wait_for_trainer_ready(checkpoint_dir, data_store)
    # Make the clients that will allow us to pause and unpause the physics
    rospy.wait_for_service('/gazebo/pause_physics')
    rospy.wait_for_service('/gazebo/unpause_physics')
    pause_physics = ServiceProxyWrapper('/gazebo/pause_physics', Empty)
    unpause_physics = ServiceProxyWrapper('/gazebo/unpause_physics', Empty)
    graph_manager.create_graph(task_parameters=task_parameters,
                               stop_physics=pause_physics,
                               start_physics=unpause_physics,
                               empty_service_call=EmptyRequest)

    with graph_manager.phase_context(RunPhase.TRAIN):
        chkpt_state_reader = CheckpointStateReader(
            checkpoint_dir, checkpoint_state_optional=False)
        last_checkpoint = chkpt_state_reader.get_latest().num

        for level in graph_manager.level_managers:
            for agent in level.agents.values():
                agent.memory.memory_backend.set_current_checkpoint(
                    last_checkpoint)

        # this worker should play a fraction of the total playing steps per rollout
        act_steps = 1
        while True:
            exit_if_trainer_done(checkpoint_dir)
            unpause_physics(EmptyRequest())
            graph_manager.reset_internal_state(True)
            graph_manager.act(EnvironmentSteps(num_steps=act_steps),
                              wait_for_full_episodes=graph_manager.
                              agent_params.algorithm.act_for_full_episodes)
            graph_manager.reset_internal_state(True)
            time.sleep(1)
            pause_physics(EmptyRequest())

            new_checkpoint = data_store.get_latest_checkpoint()
            if new_checkpoint and new_checkpoint > last_checkpoint:
                if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type == DistributedCoachSynchronizationType.SYNC:
                    exit_if_trainer_done(checkpoint_dir)
                    data_store.load_from_store(
                        expected_checkpoint_number=last_checkpoint + 1)
                    graph_manager.restore_checkpoint()

                if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type == DistributedCoachSynchronizationType.ASYNC:
                    graph_manager.restore_checkpoint()

                last_checkpoint = new_checkpoint
                for level in graph_manager.level_managers:
                    for agent in level.agents.values():
                        agent.memory.memory_backend.set_current_checkpoint(
                            last_checkpoint)
Example #22
 def __init__(self):
     super().__init__()
     self.v_min = -10.0
     self.v_max = 10.0
     self.atoms = 51
     self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(
         32000 // 4)  # 32k frames
     self.n_steps = 3
Example #23
def _validate(graph_manager, task_parameters, transitions, s3_bucket,
              s3_prefix, aws_region):
    checkpoint_dir = task_parameters.checkpoint_restore_path
    wait_for_checkpoint(checkpoint_dir, graph_manager.data_store)

    if utils.do_model_selection(s3_bucket=s3_bucket,
                                s3_prefix=s3_prefix,
                                region=aws_region,
                                checkpoint_type=LAST_CHECKPOINT):
        screen.log_title(" Validating Last Checkpoint: {}".format(
            utils.get_last_checkpoint(s3_bucket, s3_prefix, aws_region)))
        graph_manager.create_graph(task_parameters)
        graph_manager.phase = RunPhase.TEST
        screen.log_title(" Start emulate_act_on_trainer on Last Checkpoint")
        graph_manager.emulate_act_on_trainer(EnvironmentSteps(1),
                                             transitions=transitions)
        screen.log_title(
            " emulate_act_on_trainer on Last Checkpoint completed!")
        # Best checkpoint might not exist.
        if utils.do_model_selection(s3_bucket=s3_bucket,
                                    s3_prefix=s3_prefix,
                                    region=aws_region,
                                    checkpoint_type=BEST_CHECKPOINT):
            screen.log_title(" Validating Best Checkpoint: {}".format(
                utils.get_best_checkpoint(s3_bucket, s3_prefix, aws_region)))
            graph_manager.data_store.load_from_store()
            graph_manager.restore_checkpoint()
            screen.log_title(
                " Start emulate_act_on_trainer on Best Checkpoint")
            graph_manager.emulate_act_on_trainer(EnvironmentSteps(1),
                                                 transitions=transitions)
            screen.log_title(
                " emulate_act_on_trainer on Best Checkpoint completed!")
        else:
            screen.log_title(" No Best Checkpoint to validate.")

    else:
        screen.log_title(" Validating Last Checkpoint")
        graph_manager.create_graph(task_parameters)
        graph_manager.phase = RunPhase.TEST
        screen.log_title(" Start emulate_act_on_trainer on Last Checkpoint ")
        graph_manager.emulate_act_on_trainer(EnvironmentSteps(1),
                                             transitions=transitions)
        screen.log_title(
            " Start emulate_act_on_trainer on Last Checkpoint completed!")
    screen.log_title(" Validation completed!")
Example #24
 def __init__(self,
              improve_steps=TrainingSteps(10000000000),
              steps_between_evaluation_periods=EnvironmentEpisodes(50),
              evaluation_steps=EnvironmentEpisodes(5)):
     super().__init__()
     self.heatup_steps = EnvironmentSteps(0)
     self.evaluation_steps = evaluation_steps
     self.steps_between_evaluation_periods = steps_between_evaluation_periods
     self.improve_steps = improve_steps
Example #25
 def __init__(self):
     super().__init__()
     self.num_predicted_steps_ahead = 6
     self.goal_vector = [1.0, 1.0]
     self.future_measurements_weights = [0.5, 0.5, 1.0]
     self.use_accumulated_reward_as_measurement = False
     self.handling_targets_after_episode_end = HandlingTargetsAfterEpisodeEnd.NAN
     self.scale_measurements_targets = {}
     self.num_consecutive_playing_steps = EnvironmentSteps(8)
Example #26
def test_piece_wise_schedule():
    # piece-wise schedule: linear, then constant, then exponential segments
    schedule = PieceWiseSchedule(
        [(LinearSchedule(1, 3, 10), EnvironmentSteps(5)),
         (ConstantSchedule(4), EnvironmentSteps(10)),
         (ExponentialSchedule(3, 1, 0.99), EnvironmentSteps(10))
         ]
    )

    target_values = np.append(np.linspace(1, 2, 6), np.ones(11)*4)
    for i in range(16):
        assert round(schedule.current_value, 4) == round(target_values[i], 4)
        schedule.step()

    current_power = 1
    for i in range(10):
        assert round(schedule.current_value, 4) == round(3*current_power, 4)
        current_power *= 0.99
        schedule.step()
Example #27
def start_graph(graph_manager: 'GraphManager',
                task_parameters: 'TaskParameters'):
    graph_manager.create_graph(task_parameters)

    # let the adventure begin
    if task_parameters.evaluate_only:
        graph_manager.evaluate(EnvironmentSteps(sys.maxsize),
                               keep_networks_in_sync=True)
    else:
        graph_manager.improve()
Example #28
def get_sac_params(agent_params,
                   agent,
                   params,
                   run_type=str(RunType.ROLLOUT_WORKER)):
    for net_key in ["policy", "v", "q"]:
        agent_params.network_wrappers[net_key].learning_rate = params[
            HyperParameterKeys.LEARNING_RATE.value]
        agent_params.network_wrappers[
            net_key].input_embedders_parameters = create_input_embedder(
                agent.network_settings["input_embedders"],
                agent.network_settings["embedder_type"],
                agent.network_settings["activation_function"],
            )
        # DH: use empty middleware_embedder for q net
        if net_key != "q":
            agent_params.network_wrappers[
                net_key].middleware_parameters = create_middle_embedder(
                    agent.network_settings["middleware_embedders"],
                    agent.network_settings["embedder_type"],
                    agent.network_settings["activation_function"],
                )

        for net_key in ["policy", "q", "v"]:
            agent_params.network_wrappers[net_key].batch_size = params[
                HyperParameterKeys.BATCH_SIZE.value]
            agent_params.network_wrappers[net_key].optimizer_epsilon = 1e-5
            agent_params.network_wrappers[net_key].adam_optimizer_beta2 = 0.999
            if params[HyperParameterKeys.LOSS_TYPE.
                      value] == LossTypes.HUBER.value:
                agent_params.network_wrappers[
                    net_key].replace_mse_with_huber_loss = True
    agent_params.network_wrappers["policy"].heads_parameters[
        0].sac_alpha = params[HyperParameterKeys.SAC_ALPHA.value]
    # Rescale action values in the policy head
    agent_params.network_wrappers["policy"].heads_parameters[
        0].rescale_action_values = True
    agent_params.algorithm.discount = params[
        HyperParameterKeys.DISCOUNT_FACTOR.value]
    # DH: should set num_steps_between_copying_online_weights_to_target as EnvironmentSteps instead of EnvironmentEpisodes.
    # see agent.py should_copy_online_weight...
    agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(
        params[HyperParameterKeys.NUM_EPISODES_BETWEEN_TRAINING.value])
    agent_params.algorithm.distributed_coach_synchronization_type = (
        DistributedCoachSynchronizationType.SYNC)
    # tau=1
    agent_params.algorithm.rate_for_copying_weights_to_target = 1
    agent_params.algorithm.use_deterministic_for_evaluation = True

    # DH: ----to address the training worker fetch issue--------------------------
    if run_type == str(RunType.TRAINER):
        agent_params.memory = ExperienceReplayParameters()
    elif run_type == str(RunType.ROLLOUT_WORKER):
        agent_params.memory = DeepRacerMemoryParameters(
        )  # EpisodicExperienceReplayParameters()
    return agent_params
Example #29
 def __init__(self):
     super().__init__()
     self.dnd_size = 500000
     self.l2_norm_added_delta = 0.001
     self.new_value_shift_coefficient = 0.1
     self.number_of_knn = 50
     self.DND_key_error_threshold = 0
     self.num_consecutive_playing_steps = EnvironmentSteps(4)
     self.propagate_updates_to_DND = False
     self.n_step = 100
     self.bootstrap_total_return_from_old_policy = True
Example #30
def start_graph(graph_manager: 'GraphManager', task_parameters: 'TaskParameters'):
    """
    Runs the graph_manager using the configured task_parameters.
    This stand-alone method is a convenience for multiprocessing.
    """
    graph_manager.create_graph(task_parameters)

    # let the adventure begin
    if task_parameters.evaluate_only:
        graph_manager.evaluate(EnvironmentSteps(sys.maxsize), keep_networks_in_sync=True)
    else:
        graph_manager.improve()
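The docstring notes that start_graph is a stand-alone convenience for multiprocessing. A minimal sketch of launching it in a separate process, assuming graph_manager and task_parameters have already been constructed as in the examples above:

from multiprocessing import Process

# run the graph in its own process; several such processes could be spawned for parallel workers
p = Process(target=start_graph, args=(graph_manager, task_parameters))
p.start()
p.join()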