Example #1
 def __init__(self):
     super().__init__()
     self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(
         1)
     self.rate_for_copying_weights_to_target = 0.001
     self.num_consecutive_playing_steps = EnvironmentSteps(1)
     self.use_target_network_for_evaluation = False
     self.action_penalty = 0
     self.clip_critic_targets = None  # expected to be a tuple of the form (min_clip_value, max_clip_value) or None
     self.use_non_zero_discount_for_terminal_states = False
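The combination above (a copy every EnvironmentSteps(1) together with a small rate_for_copying_weights_to_target) amounts to a soft, Polyak-style target update. A minimal sketch of that update rule, using hypothetical numpy arrays as stand-ins rather than the rl_coach networks:

import numpy as np

tau = 0.001  # rate_for_copying_weights_to_target
online_weights = np.random.randn(4)   # stand-in for the online network's parameters
target_weights = np.zeros(4)          # stand-in for the target network's parameters

# applied after every single environment step (EnvironmentSteps(1)):
target_weights = (1 - tau) * target_weights + tau * online_weights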
Example #2
def _validate(graph_manager, task_parameters, transitions, s3_bucket,
              s3_prefix, aws_region):
    checkpoint = graph_manager.data_store.params.checkpoint_dict['agent']
    checkpoint_dir = task_parameters.checkpoint_restore_path
    graph_manager.data_store.wait_for_checkpoints()

    # validate last checkpoint
    last_model_checkpoint_name = checkpoint.deepracer_checkpoint_json.get_deepracer_last_checkpoint(
    )
    if checkpoint.rl_coach_checkpoint.update(
            model_checkpoint_name=last_model_checkpoint_name,
            s3_kms_extra_args=utils.get_s3_kms_extra_args()):
        screen.log_title(" Validating Last Checkpoint: {}".format(
            last_model_checkpoint_name))
        # load the last rl coach checkpoint from store
        graph_manager.data_store.load_from_store()
        graph_manager.create_graph(task_parameters)
        graph_manager.phase = RunPhase.TEST
        screen.log_title(" Start emulate_act_on_trainer on Last Checkpoint")
        graph_manager.emulate_act_on_trainer(EnvironmentSteps(1),
                                             transitions=transitions)
        screen.log_title(
            " emulate_act_on_trainer on Last Checkpoint completed!")
        # validate best checkpoint: Best checkpoint might not exist.
        best_model_checkpoint_name = checkpoint.deepracer_checkpoint_json.get_deepracer_best_checkpoint(
        )
        if checkpoint.rl_coach_checkpoint.update(
                model_checkpoint_name=best_model_checkpoint_name,
                s3_kms_extra_args=utils.get_s3_kms_extra_args()):
            screen.log_title(" Validating Best Checkpoint: {}".format(
                best_model_checkpoint_name))
            # load the best rl coach checkpoint from store
            graph_manager.data_store.load_from_store()
            graph_manager.restore_checkpoint()
            screen.log_title(
                " Start emulate_act_on_trainer on Best Checkpoint")
            graph_manager.emulate_act_on_trainer(EnvironmentSteps(1),
                                                 transitions=transitions)
            screen.log_title(
                " emulate_act_on_trainer on Best Checkpoint completed!")
        else:
            screen.log_title(" No Best Checkpoint to validate.")

    else:
        screen.log_title(" Validating Last Checkpoint")
        # load the last rl coach checkpoint from store
        graph_manager.data_store.load_from_store()
        graph_manager.create_graph(task_parameters)
        graph_manager.phase = RunPhase.TEST
        screen.log_title(" Start emulate_act_on_trainer on Last Checkpoint ")
        graph_manager.emulate_act_on_trainer(EnvironmentSteps(1),
                                             transitions=transitions)
        screen.log_title(
            " Start emulate_act_on_trainer on Last Checkpoint completed!")
    screen.log_title(" Validation completed!")
Example #3
    def start_race(self):
        """
        Start the race (evaluation) for the current racer.
        """
        LOG.info(
            "[virtual event manager] Starting race for racer %s", self._current_racer.racerAlias
        )
        # update the car color on the current model if it does not use an F1 or Tron type of body shell
        if const.F1 not in self._body_shell_type.lower():
            self._model_updater.update_model_color(
                self._current_car_model_state.model_name, self._current_racer.carConfig.carColor
            )
        # send request
        if self._is_save_mp4_enabled:
            self._subscribe_to_save_mp4(
                VirtualEventVideoEditSrvRequest(
                    display_name=self._current_racer.racerAlias,
                    racecar_color=self._current_racer.carConfig.carColor,
                )
            )

        # Update CameraManager by adding cameras into the current namespace. By doing so
        # a single follow car camera will follow the current active racecar.
        self._camera_manager.add(
            self._main_cameras[VIRTUAL_EVENT], self._current_car_model_state.model_name
        )
        self._camera_manager.add(self._sub_camera, self._current_car_model_state.model_name)

        configure_environment_randomizer()
        # strip index for park position
        self._park_position_idx = get_racecar_idx(self._current_car_model_state.model_name)
        # set the park position in track and do evaluation
        # Before each evaluation episode (a single lap for a non-continuous race, the complete race for
        # a continuous race), a fresh copy of park_positions needs to be loaded into track_data, because
        # a park position is popped from park_positions whenever a racer's car needs to be parked.
        # unpause the physics in current world
        self._model_updater.unpause_physics()
        LOG.info("[virtual event manager] Unpaused physics in current world.")
        if (
            self._prev_model_name is not None
            and self._prev_model_name != self._current_car_model_state.model_name
        ):
            # disable the links on the prev car
            # we are doing it here because we don't want the car to float around
            # after the link is disabled
            prev_car_model_state = ModelState()
            prev_car_model_state.model_name = self._prev_model_name
        LOG.info("[virtual event manager] Unpaused model for current car.")
        if self._is_continuous:
            self._track_data.park_positions = [self._park_positions[self._park_position_idx]]
            self._current_graph_manager.evaluate(EnvironmentSteps(1))
        else:
            for _ in range(self._number_of_trials):
                self._track_data.park_positions = [self._park_positions[self._park_position_idx]]
                self._current_graph_manager.evaluate(EnvironmentSteps(1))
Example #4
def coach_adc(model, dataset, arch, optimizer_data, validate_fn, save_checkpoint_fn, train_fn):
    # task_parameters = TaskParameters(framework_type="tensorflow",
    #                                  experiment_path="./experiments/test")
    # extra_params = {'save_checkpoint_secs': None,
    #                 'render': True}
    # task_parameters.__dict__.update(extra_params)
    task_parameters = TaskParameters(experiment_path=logger.get_experiment_path('adc'))
    conv_cnt = count_conv_layer(model)

    # Create a dictionary of parameters that Coach will hand over to the CNNEnvironment
    # once it creates it.
    services = distiller.utils.MutableNamedTuple({
                'validate_fn': validate_fn,
                'save_checkpoint_fn': save_checkpoint_fn,
                'train_fn': train_fn})

    app_args = distiller.utils.MutableNamedTuple({
                'dataset': dataset,
                'arch': arch,
                'optimizer_data': optimizer_data})
    if True:
        amc_cfg = distiller.utils.MutableNamedTuple({
                #'action_range': (0.20, 0.95),
                'action_range': (0.20, 0.80),
                'onehot_encoding': False,
                'normalize_obs': True,
                'desired_reduction': None,
                'reward_fn': lambda top1, top5, vloss, total_macs: -1 * (1-top1/100) * math.log(total_macs),
                'conv_cnt': conv_cnt,
                'max_reward': -1000})
    else:
        amc_cfg = distiller.utils.MutableNamedTuple({
                'action_range': (0.10, 0.95),
                'onehot_encoding': False,
                'normalize_obs': True,
                'desired_reduction': 1.5e8,
                'reward_fn': lambda top1, top5, vloss, total_macs: top1/100,
                #'reward_fn': lambda top1, total_macs: min(top1/100, 0.75),
                'conv_cnt': conv_cnt,
                'max_reward': -1000})

    # These parameters are passed to the Distiller environment.
    # (graph_manager and agent_params below are module-level objects defined outside this
    #  snippet, typically imported from the Coach preset.)
    graph_manager.env_params.additional_simulator_parameters = {'model': model,
                                                                'app_args': app_args,
                                                                'amc_cfg': amc_cfg,
                                                                'services': services}
    exploration_noise = 0.5
    exploitation_decay = 0.996
    steps_per_episode = conv_cnt
    agent_params.exploration.noise_percentage_schedule = PieceWiseSchedule([
        (ConstantSchedule(exploration_noise), EnvironmentSteps(100*steps_per_episode)),
        (ExponentialSchedule(exploration_noise, 0, exploitation_decay), EnvironmentSteps(300*steps_per_episode))])
    graph_manager.create_graph(task_parameters)
    graph_manager.improve()
Example #5
    def _create_graph(self, task_parameters: TaskParameters) -> Tuple[List[LevelManager], List[Environment]]:
        env = short_dynamic_import(self.env_params.path)(**self.env_params.__dict__,
                                                         visualization_parameters=self.visualization_parameters)

        for agent_params in self.agents_params:
            agent_params.task_parameters = task_parameters

        # we need to build the hierarchy in reverse order (from the bottom up) in order for the spaces of each level
        # to be known
        level_managers = []
        current_env = env
        # out_action_space = env.action_space
        for level_idx, agent_params in reversed(list(enumerate(self.agents_params))):
            agent_params.name = "agent_{}".format(level_idx)
            agent_params.is_a_highest_level_agent = level_idx == 0
            agent_params.is_a_lowest_level_agent = level_idx == len(self.agents_params) - 1

            agent = short_dynamic_import(agent_params.path)(agent_params)

            level_manager = LevelManager(
                agents=agent,
                environment=current_env,
                real_environment=env,
                steps_limit=EnvironmentSteps(1) if level_idx == 0
                            else self.consecutive_steps_to_run_non_top_levels,
                should_reset_agent_state_after_time_limit_passes=level_idx > 0,
                name="level_{}".format(level_idx)
            )
            current_env = level_manager
            level_managers.insert(0, level_manager)

        return level_managers, [env]
Example #6
 def fetch_from_worker(self, num_consecutive_playing_steps=None):
     if hasattr(self, 'memory_backend'):
         for transitions in self.memory_backend.fetch(
                 num_consecutive_playing_steps):
             self.emulate_act_on_trainer(EnvironmentSteps(1), transitions)
             if hasattr(self, 'sample_collector'):
                 self.sample_collector.sample(transitions)
Example #7
def evaluate(params):
    # file params
    experiment_path = os.path.join(params.output_data_dir)
    logger.experiment_path = os.path.join(experiment_path, 'evaluation')
    params.checkpoint_restore_dir = os.path.join(params.input_data_dir,
                                                 'checkpoint')
    checkpoint_file = os.path.join(params.checkpoint_restore_dir, 'checkpoint')

    inplace_change(checkpoint_file, "/opt/ml/output/data/checkpoint", ".")
    # Note that due to a tensorflow issue (https://github.com/tensorflow/tensorflow/issues/9146) we need to replace
    # the absolute path for the evaluation-from-a-checkpointed-model to work

    vis_params = VisualizationParameters()
    vis_params.dump_gifs = True

    task_params = TaskParameters(evaluate_only=True,
                                 experiment_path=logger.experiment_path)
    task_params.__dict__ = add_items_to_dict(task_params.__dict__,
                                             params.__dict__)

    graph_manager = BasicRLGraphManager(
        agent_params=ClippedPPOAgentParameters(),
        env_params=GymVectorEnvironment(level='TSP_env:TSPEasyEnv'),
        schedule_params=ScheduleParameters(),
        vis_params=vis_params)
    graph_manager = graph_manager.create_graph(task_parameters=task_params)
    graph_manager.evaluate(EnvironmentSteps(5))
Example #8
 def __init__(self):
     super().__init__()
     self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(
         1)
     self.rate_for_copying_weights_to_target = 0.005
     # evaluate agent using deterministic policy (i.e. take the mean value)
     self.use_deterministic_for_evaluation = True
Example #9
 def start_single_threaded(self, task_parameters, graph_manager, args):
     """Override to use custom evaluate_steps, instead of infinite steps. Just evaluate.
     """
     graph_manager.agent_params.visualization.dump_csv = False # issues with CSV export in evaluation only
     graph_manager.create_graph(task_parameters)
     graph_manager.evaluate(EnvironmentSteps(args.evaluate_steps))
     graph_manager.close()
Example #10
 def __init__(self):
     super().__init__()
     self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(
         10000)
     self.apply_gradients_every_x_episodes = 1
     self.num_steps_between_gradient_updates = 5  # this is called t_max in all the papers
     self.targets_horizon = 'N-Step'
Example #11
    def __init__(self):
        # Architecture parameters
        self.use_accumulated_reward_as_measurement = False

        # Agent parameters
        self.num_consecutive_playing_steps = EnvironmentSteps(1)
        self.num_consecutive_training_steps = 1  # TODO: update this to TrainingSteps

        self.heatup_using_network_decisions = False
        self.discount = 0.99
        self.apply_gradients_every_x_episodes = 5
        self.num_steps_between_copying_online_weights_to_target = TrainingSteps(
            0)
        self.rate_for_copying_weights_to_target = 1.0
        self.load_memory_from_file_path = None
        self.collect_new_data = True
        self.store_transitions_only_when_episodes_are_terminated = False

        # HRL / HER related params
        self.in_action_space = None

        # distributed agents params
        self.share_statistics_between_workers = True

        # intrinsic reward
        self.scale_external_reward_by_intrinsic_reward_value = False
Example #12
 def fetch_from_worker(self, num_consecutive_playing_steps=None):
     if hasattr(self, 'memory_backend'):
         with self.phase_context(RunPhase.TRAIN):
             for transition in self.memory_backend.fetch(
                     num_consecutive_playing_steps):
                 self.emulate_act_on_trainer(EnvironmentSteps(1),
                                             transition)
Example #13
    def __init__(self):
        # Architecture parameters
        self.use_accumulated_reward_as_measurement = False

        # Agent parameters
        self.num_consecutive_playing_steps = EnvironmentSteps(1)
        self.num_consecutive_training_steps = 1  # TODO: update this to TrainingSteps

        self.heatup_using_network_decisions = False
        self.discount = 0.99
        self.apply_gradients_every_x_episodes = 5
        self.num_steps_between_copying_online_weights_to_target = TrainingSteps(
            0)
        self.rate_for_copying_weights_to_target = 1.0
        self.load_memory_from_file_path = None
        self.store_transitions_only_when_episodes_are_terminated = False

        # HRL / HER related params
        self.in_action_space = None

        # distributed agents params
        self.share_statistics_between_workers = True

        # intrinsic reward
        self.scale_external_reward_by_intrinsic_reward_value = False

        # n-step returns
        self.n_step = -1  # calculate the total return (no bootstrap, by default)

        # Distributed Coach params
        self.distributed_coach_synchronization_type = None

        # Should the workers wait for full episode
        self.act_for_full_episodes = False
Example #14
    def __init__(
            self,
            name: str,
            agents: Union['Agent', CompositeAgent,
                          Dict[str, Union['Agent', CompositeAgent]]],
            environment: Union['LevelManager', Environment],
            real_environment: Environment = None,
            steps_limit: EnvironmentSteps = EnvironmentSteps(1),
            should_reset_agent_state_after_time_limit_passes: bool = False,
            spaces_definition: SpacesDefinition = None):
        """
        A level manager controls a single or multiple composite agents and a single environment.
        The environment can be either a real environment or another level manager behaving as an environment.
        :param agents: a list of agents or composite agents to control
        :param environment: an environment or level manager to control
        :param real_environment: the real environment that is acted upon. If this is None (which it should be for
         the bottom-most level), it will be replaced by the environment parameter. For simple RL schemes, where there
         is only a single level of hierarchy, this removes the requirement of defining both the environment and the
         real environment, as they are the same.
        :param steps_limit: the number of time steps to run when stepping the internal components
        :param should_reset_agent_state_after_time_limit_passes: reset the agent after stepping for steps_limit
        :param name: the level's name
        :param spaces_definition: external definition of spaces for when we don't have an environment (e.g. batch-rl)
        """
        super().__init__()

        if not isinstance(agents, dict):
            # insert the single composite agent to a dictionary for compatibility
            agents = {agents.name: agents}
        if real_environment is None:
            self._real_environment = real_environment = environment
        self.agents = agents
        self.environment = environment
        self.real_environment = real_environment
        self.steps_limit = steps_limit
        self.should_reset_agent_state_after_time_limit_passes = should_reset_agent_state_after_time_limit_passes
        self.full_name_id = self.name = name
        self._phase = RunPhase.HEATUP
        self.reset_required = False

        # set self as the parent for all the composite agents
        for agent in self.agents.values():
            agent.parent = self
            agent.parent_level_manager = self

        # create all agents in all composite_agents - we do it here so agents will have access to their level manager
        for agent in self.agents.values():
            if isinstance(agent, CompositeAgent):
                agent.create_agents()

        if not isinstance(self.steps_limit, EnvironmentSteps):
            raise ValueError(
                "The num consecutive steps for acting must be defined in terms of environment steps"
            )
        self.build(spaces_definition)

        # there are cases where we don't have an environment. e.g. in batch-rl or in imitation learning.
        self.last_env_response = self.real_environment.last_env_response if self.real_environment else None

        self.parent_graph_manager = None
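As the check near the end of the constructor shows, steps_limit must be expressed in EnvironmentSteps. A minimal sketch of that unit distinction in isolation, assuming the rl_coach package is installed:

from rl_coach.core_types import EnvironmentSteps, TrainingSteps

for candidate in (EnvironmentSteps(1), TrainingSteps(1)):
    if isinstance(candidate, EnvironmentSteps):
        print(type(candidate).__name__, "is a valid steps_limit")
    else:
        print(type(candidate).__name__, "would trigger the ValueError above")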
Example #15
def evaluation_worker(graph_manager, data_store, number_of_trials,
                      task_parameters):
    checkpoint_dir = task_parameters.checkpoint_restore_path
    wait_for_checkpoint(checkpoint_dir, data_store)
    # Make the clients that will allow us to pause and unpause the physics
    rospy.wait_for_service('/gazebo/pause_physics')
    rospy.wait_for_service('/gazebo/unpause_physics')
    pause_physics = ServiceProxyWrapper('/gazebo/pause_physics', Empty)
    unpause_physics = ServiceProxyWrapper('/gazebo/unpause_physics', Empty)
    graph_manager.create_graph(task_parameters=task_parameters,
                               stop_physics=pause_physics,
                               start_physics=unpause_physics,
                               empty_service_call=EmptyRequest)

    # Instantiate Cameras
    configure_camera()

    unpause_physics(EmptyRequest())
    graph_manager.reset_internal_state(True)
    for _ in range(number_of_trials):
        graph_manager.evaluate(EnvironmentSteps(1))

    # Close down the simulation job
    utils.cancel_simulation_job(
        os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'),
        rospy.get_param('AWS_REGION'))
Example #16
def rollout_worker(graph_manager, data_store, num_workers, task_parameters):
    """
    wait for first checkpoint then perform rollouts using the model
    """
    if not data_store:
        raise AttributeError("None type for data_store object")

    checkpoint_dir = task_parameters.checkpoint_restore_path
    wait_for_checkpoint(checkpoint_dir, data_store)
    wait_for_trainer_ready(checkpoint_dir, data_store)
    # Make the clients that will allow us to pause and unpause the physics
    rospy.wait_for_service('/gazebo/pause_physics')
    rospy.wait_for_service('/gazebo/unpause_physics')
    pause_physics = ServiceProxyWrapper('/gazebo/pause_physics', Empty)
    unpause_physics = ServiceProxyWrapper('/gazebo/unpause_physics', Empty)
    graph_manager.create_graph(task_parameters=task_parameters,
                               stop_physics=pause_physics,
                               start_physics=unpause_physics,
                               empty_service_call=EmptyRequest)

    with graph_manager.phase_context(RunPhase.TRAIN):
        chkpt_state_reader = CheckpointStateReader(
            checkpoint_dir, checkpoint_state_optional=False)
        last_checkpoint = chkpt_state_reader.get_latest().num

        for level in graph_manager.level_managers:
            for agent in level.agents.values():
                agent.memory.memory_backend.set_current_checkpoint(
                    last_checkpoint)

        # this worker should play a fraction of the total playing steps per rollout
        act_steps = 1
        while True:
            exit_if_trainer_done(checkpoint_dir)
            unpause_physics(EmptyRequest())
            graph_manager.reset_internal_state(True)
            graph_manager.act(EnvironmentSteps(num_steps=act_steps),
                              wait_for_full_episodes=graph_manager.
                              agent_params.algorithm.act_for_full_episodes)
            graph_manager.reset_internal_state(True)
            time.sleep(1)
            pause_physics(EmptyRequest())

            new_checkpoint = data_store.get_latest_checkpoint()
            if new_checkpoint and new_checkpoint > last_checkpoint:
                if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type == DistributedCoachSynchronizationType.SYNC:
                    exit_if_trainer_done(checkpoint_dir)
                    data_store.load_from_store(
                        expected_checkpoint_number=last_checkpoint + 1)
                    graph_manager.restore_checkpoint()

                if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type == DistributedCoachSynchronizationType.ASYNC:
                    graph_manager.restore_checkpoint()

                last_checkpoint = new_checkpoint
                for level in graph_manager.level_managers:
                    for agent in level.agents.values():
                        agent.memory.memory_backend.set_current_checkpoint(
                            last_checkpoint)
Example #17
 def __init__(self):
     super().__init__()
     self.v_min = -10.0
     self.v_max = 10.0
     self.atoms = 51
     self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(
         32000 // 4)  # 32k frames
     self.n_steps = 3
Example #18
def _validate(graph_manager, task_parameters, transitions, s3_bucket,
              s3_prefix, aws_region):
    checkpoint_dir = task_parameters.checkpoint_restore_path
    wait_for_checkpoint(checkpoint_dir, graph_manager.data_store)

    if utils.do_model_selection(s3_bucket=s3_bucket,
                                s3_prefix=s3_prefix,
                                region=aws_region,
                                checkpoint_type=LAST_CHECKPOINT):
        screen.log_title(" Validating Last Checkpoint: {}".format(
            utils.get_last_checkpoint(s3_bucket, s3_prefix, aws_region)))
        graph_manager.create_graph(task_parameters)
        graph_manager.phase = RunPhase.TEST
        screen.log_title(" Start emulate_act_on_trainer on Last Checkpoint")
        graph_manager.emulate_act_on_trainer(EnvironmentSteps(1),
                                             transitions=transitions)
        screen.log_title(
            " emulate_act_on_trainer on Last Checkpoint completed!")
        # Best checkpoint might not exist.
        if utils.do_model_selection(s3_bucket=s3_bucket,
                                    s3_prefix=s3_prefix,
                                    region=aws_region,
                                    checkpoint_type=BEST_CHECKPOINT):
            screen.log_title(" Validating Best Checkpoint: {}".format(
                utils.get_best_checkpoint(s3_bucket, s3_prefix, aws_region)))
            graph_manager.data_store.load_from_store()
            graph_manager.restore_checkpoint()
            screen.log_title(
                " Start emulate_act_on_trainer on Best Checkpoint")
            graph_manager.emulate_act_on_trainer(EnvironmentSteps(1),
                                                 transitions=transitions)
            screen.log_title(
                " emulate_act_on_trainer on Best Checkpoint completed!")
        else:
            screen.log_title(" No Best Checkpoint to validate.")

    else:
        screen.log_title(" Validating Last Checkpoint")
        graph_manager.create_graph(task_parameters)
        graph_manager.phase = RunPhase.TEST
        screen.log_title(" Start emulate_act_on_trainer on Last Checkpoint ")
        graph_manager.emulate_act_on_trainer(EnvironmentSteps(1),
                                             transitions=transitions)
        screen.log_title(
            " Start emulate_act_on_trainer on Last Checkpoint completed!")
    screen.log_title(" Validation completed!")
Example #19
 def __init__(self,
              improve_steps=TrainingSteps(10000000000),
              steps_between_evaluation_periods=EnvironmentEpisodes(50),
              evaluation_steps=EnvironmentEpisodes(5)):
     super().__init__()
     self.heatup_steps = EnvironmentSteps(0)
     self.evaluation_steps = evaluation_steps
     self.steps_between_evaluation_periods = steps_between_evaluation_periods
     self.improve_steps = improve_steps
Example #20
 def __init__(self):
     super().__init__()
     self.num_predicted_steps_ahead = 6
     self.goal_vector = [1.0, 1.0]
     self.future_measurements_weights = [0.5, 0.5, 1.0]
     self.use_accumulated_reward_as_measurement = False
     self.handling_targets_after_episode_end = HandlingTargetsAfterEpisodeEnd.NAN
     self.scale_measurements_targets = {}
     self.num_consecutive_playing_steps = EnvironmentSteps(8)
Example #21
def test_piece_wise_schedule():
    # decreasing schedule
    schedule = PieceWiseSchedule(
        [(LinearSchedule(1, 3, 10), EnvironmentSteps(5)),
         (ConstantSchedule(4), EnvironmentSteps(10)),
         (ExponentialSchedule(3, 1, 0.99), EnvironmentSteps(10))
         ]
    )

    target_values = np.append(np.linspace(1, 2, 6), np.ones(11)*4)
    for i in range(16):
        assert round(schedule.current_value, 4) == round(target_values[i], 4)
        schedule.step()

    current_power = 1
    for i in range(10):
        assert round(schedule.current_value, 4) == round(3*current_power, 4)
        current_power *= 0.99
        schedule.step()
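The scheduling classes exercised by this test are the same ones used for the exploration noise in example #4. As a quick reference, a minimal sketch (assuming the rl_coach package is installed) of stepping a PieceWiseSchedule by hand, where each segment stays active for its EnvironmentSteps budget:

from rl_coach.core_types import EnvironmentSteps
from rl_coach.schedules import ConstantSchedule, LinearSchedule, PieceWiseSchedule

noise_schedule = PieceWiseSchedule([
    (LinearSchedule(1.0, 0.1, 10), EnvironmentSteps(10)),  # anneal 1.0 -> 0.1 over 10 steps
    (ConstantSchedule(0.1), EnvironmentSteps(5)),           # then hold at 0.1
])

for step in range(15):
    print(step, round(noise_schedule.current_value, 3))
    noise_schedule.step()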
Example #22
def start_graph(graph_manager: 'GraphManager',
                task_parameters: 'TaskParameters'):
    graph_manager.create_graph(task_parameters)

    # let the adventure begin
    if task_parameters.evaluate_only:
        graph_manager.evaluate(EnvironmentSteps(sys.maxsize),
                               keep_networks_in_sync=True)
    else:
        graph_manager.improve()
Example #23
def get_sac_params(agent_params,
                   agent,
                   params,
                   run_type=str(RunType.ROLLOUT_WORKER)):
    for net_key in ["policy", "v", "q"]:
        agent_params.network_wrappers[net_key].learning_rate = params[
            HyperParameterKeys.LEARNING_RATE.value]
        agent_params.network_wrappers[
            net_key].input_embedders_parameters = create_input_embedder(
                agent.network_settings["input_embedders"],
                agent.network_settings["embedder_type"],
                agent.network_settings["activation_function"],
            )
        # DH: use empty middleware_embedder for q net
        if net_key != "q":
            agent_params.network_wrappers[
                net_key].middleware_parameters = create_middle_embedder(
                    agent.network_settings["middleware_embedders"],
                    agent.network_settings["embedder_type"],
                    agent.network_settings["activation_function"],
                )

        for net_key in ["policy", "q", "v"]:
            agent_params.network_wrappers[net_key].batch_size = params[
                HyperParameterKeys.BATCH_SIZE.value]
            agent_params.network_wrappers[net_key].optimizer_epsilon = 1e-5
            agent_params.network_wrappers[net_key].adam_optimizer_beta2 = 0.999
            if params[HyperParameterKeys.LOSS_TYPE.
                      value] == LossTypes.HUBER.value:
                agent_params.network_wrappers[
                    net_key].replace_mse_with_huber_loss = True
    agent_params.network_wrappers["policy"].heads_parameters[
        0].sac_alpha = params[HyperParameterKeys.SAC_ALPHA.value]
    # Rescale action values in the policy head
    agent_params.network_wrappers["policy"].heads_parameters[
        0].rescale_action_values = True
    agent_params.algorithm.discount = params[
        HyperParameterKeys.DISCOUNT_FACTOR.value]
    # DH: should set num_steps_between_copying_online_weights_to_target as EnvironmentSteps instead of EnvironmentEpisodes.
    # see agent.py should_copy_online_weight...
    agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(
        params[HyperParameterKeys.NUM_EPISODES_BETWEEN_TRAINING.value])
    agent_params.algorithm.distributed_coach_synchronization_type = (
        DistributedCoachSynchronizationType.SYNC)
    # tau=1
    agent_params.algorithm.rate_for_copying_weights_to_target = 1
    agent_params.algorithm.use_deterministic_for_evaluation = True

    # DH: ----to address the training worker fetch issue--------------------------
    if run_type == str(RunType.TRAINER):
        agent_params.memory = ExperienceReplayParameters()
    elif run_type == str(RunType.ROLLOUT_WORKER):
        agent_params.memory = DeepRacerMemoryParameters(
        )  # EpisodicExperienceReplayParameters()
    return agent_params
Example #24
 def __init__(self):
     super().__init__()
     self.dnd_size = 500000
     self.l2_norm_added_delta = 0.001
     self.new_value_shift_coefficient = 0.1
     self.number_of_knn = 50
     self.DND_key_error_threshold = 0
     self.num_consecutive_playing_steps = EnvironmentSteps(4)
     self.propagate_updates_to_DND = False
     self.n_step = 100
     self.bootstrap_total_return_from_old_policy = True
Example #25
def start_graph(graph_manager: 'GraphManager', task_parameters: 'TaskParameters'):
    """
    Runs the graph_manager using the configured task_parameters.
    This stand-alone method is a convenience for multiprocessing.
    """
    graph_manager.create_graph(task_parameters)

    # let the adventure begin
    if task_parameters.evaluate_only:
        graph_manager.evaluate(EnvironmentSteps(sys.maxsize), keep_networks_in_sync=True)
    else:
        graph_manager.improve()
Example #26
def rollout_worker(graph_manager, data_store, num_workers, task_parameters):
    """
    wait for first checkpoint then perform rollouts using the model
    """
    checkpoint_dir = task_parameters.checkpoint_restore_path
    wait_for_checkpoint(checkpoint_dir, data_store)

    graph_manager.create_graph(task_parameters)
    with graph_manager.phase_context(RunPhase.TRAIN):

        chkpt_state_reader = CheckpointStateReader(
            checkpoint_dir, checkpoint_state_optional=False)
        last_checkpoint = 0

        # this worker should play a fraction of the total playing steps per rollout
        act_steps = math.ceil(
            (graph_manager.agent_params.algorithm.
             num_consecutive_playing_steps.num_steps) / num_workers)

        for i in range(int(graph_manager.improve_steps.num_steps / act_steps)):

            if should_stop(checkpoint_dir):
                break

            if type(graph_manager.agent_params.algorithm.
                    num_consecutive_playing_steps) == EnvironmentSteps:
                graph_manager.act(EnvironmentSteps(num_steps=act_steps),
                                  wait_for_full_episodes=graph_manager.
                                  agent_params.algorithm.act_for_full_episodes)
            elif type(graph_manager.agent_params.algorithm.
                      num_consecutive_playing_steps) == EnvironmentEpisodes:
                graph_manager.act(EnvironmentEpisodes(num_steps=act_steps))

            new_checkpoint = chkpt_state_reader.get_latest()
            if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type == DistributedCoachSynchronizationType.SYNC:
                while new_checkpoint is None or new_checkpoint.num < last_checkpoint + 1:
                    if should_stop(checkpoint_dir):
                        break
                    if data_store:
                        data_store.load_from_store()
                    new_checkpoint = chkpt_state_reader.get_latest()

                graph_manager.restore_checkpoint()

            if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type == DistributedCoachSynchronizationType.ASYNC:
                if new_checkpoint is not None and new_checkpoint.num > last_checkpoint:
                    graph_manager.restore_checkpoint()

            if new_checkpoint is not None:
                last_checkpoint = new_checkpoint.num
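The act_steps computation above splits each rollout phase across the workers: every worker plays ceil(num_consecutive_playing_steps / num_workers) steps, so together they cover at least the full phase. A small arithmetic check with illustrative numbers:

import math

num_consecutive_playing_steps = 20   # e.g. EnvironmentSteps(20) per rollout phase
num_workers = 3
act_steps = math.ceil(num_consecutive_playing_steps / num_workers)
assert act_steps == 7                # 3 workers x 7 steps = 21 >= 20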
Example #27
 def __init__(self):
     super().__init__()
     self.num_episodes_in_experience_replay = 1000000
     self.policy_gradient_rescaler = PolicyGradientRescaler.GAE
     self.gae_lambda = 0.95
     self.use_kl_regularization = False
     self.clip_likelihood_ratio_using_epsilon = 0.2
     self.estimate_state_value_using_gae = True
     self.beta_entropy = 0.01  # should be 0 for mujoco
     self.num_consecutive_playing_steps = EnvironmentSteps(2048)
     self.optimization_epochs = 10
     self.normalization_stats = None
     self.clipping_decay_schedule = ConstantSchedule(1)
     self.act_for_full_episodes = True
Example #28
 def __init__(self):
     super().__init__()
     self.policy_gradient_rescaler = PolicyGradientRescaler.GAE
     self.gae_lambda = 0.96
     self.target_kl_divergence = 0.01
     self.initial_kl_coefficient = 1.0
     self.high_kl_penalty_coefficient = 1000
     self.clip_likelihood_ratio_using_epsilon = None
     self.value_targets_mix_fraction = 0.1
     self.estimate_state_value_using_gae = True
     self.use_kl_regularization = True
     self.beta_entropy = 0.01
     self.num_consecutive_playing_steps = EnvironmentSteps(5000)
     self.act_for_full_episodes = True
Example #29
def rollout_worker(graph_manager, checkpoint_dir, data_store, num_workers):
    """
    wait for first checkpoint then perform rollouts using the model
    """
    wait_for_checkpoint(checkpoint_dir)

    task_parameters = TaskParameters()
    task_parameters.__dict__['checkpoint_restore_dir'] = checkpoint_dir

    graph_manager.create_graph(task_parameters)
    with graph_manager.phase_context(RunPhase.TRAIN):

        last_checkpoint = 0

        act_steps = math.ceil(
            (graph_manager.agent_params.algorithm.
             num_consecutive_playing_steps.num_steps) / num_workers)

        for i in range(int(graph_manager.improve_steps.num_steps / act_steps)):

            if should_stop(checkpoint_dir):
                break

            if type(graph_manager.agent_params.algorithm.
                    num_consecutive_playing_steps) == EnvironmentSteps:
                graph_manager.act(EnvironmentSteps(num_steps=act_steps),
                                  wait_for_full_episodes=graph_manager.
                                  agent_params.algorithm.act_for_full_episodes)
            elif type(graph_manager.agent_params.algorithm.
                      num_consecutive_playing_steps) == EnvironmentEpisodes:
                graph_manager.act(EnvironmentEpisodes(num_steps=act_steps))

            new_checkpoint = get_latest_checkpoint(checkpoint_dir)

            if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type == DistributedCoachSynchronizationType.SYNC:
                while new_checkpoint < last_checkpoint + 1:
                    if should_stop(checkpoint_dir):
                        break
                    if data_store:
                        data_store.load_from_store()
                    new_checkpoint = get_latest_checkpoint(checkpoint_dir)

                graph_manager.restore_checkpoint()

            if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type == DistributedCoachSynchronizationType.ASYNC:
                if new_checkpoint > last_checkpoint:
                    graph_manager.restore_checkpoint()

            last_checkpoint = new_checkpoint
Example #30
File: batch_rl.py Project: guyk1971/coach
def set_schedule_params(n_epochs, dataset_size):
    schedule_params = ScheduleParameters()

    # n_epochs epochs of training (each epoch runs training over the entire dataset)
    schedule_params.improve_steps = TrainingSteps(n_epochs)

    # we evaluate the model every epoch
    schedule_params.steps_between_evaluation_periods = TrainingSteps(1)

    # only relevant when we have an environment
    schedule_params.evaluation_steps = EnvironmentEpisodes(10)
    # to make it purely random, we set the entire dataset to be created during heatup
    # (does that mean purely random? or is it using an uninitialized network?)
    schedule_params.heatup_steps = EnvironmentSteps(dataset_size)
    return schedule_params
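A usage sketch for the helper above, with illustrative numbers that are not taken from the original project (100 epochs over a dataset of 50,000 transitions, all data collection front-loaded into heatup):

schedule_params = set_schedule_params(n_epochs=100, dataset_size=50000)
print(schedule_params.improve_steps.num_steps)    # 100 training epochs
print(schedule_params.heatup_steps.num_steps)     # 50000 environment steps of heatup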