Example No. 1
 def __init__(self):
     super().__init__(algorithm=SoftActorCriticAlgorithmParameters(),
                      exploration=AdditiveNoiseParameters(),
                      memory=ExperienceReplayParameters(),   # SAC doesn't use episode-related data
                      # network wrappers:
                      networks=OrderedDict([("policy", SACPolicyNetworkParameters()),
                                            ("q", SACCriticNetworkParameters()),
                                            ("v", SACValueNetworkParameters())]))
Example No. 2
 def __init__(self):
     super().__init__()
     self.epsilon_schedule = LinearSchedule(0.5, 0.01, 50000)
     self.evaluation_epsilon = 0.05
     self.continuous_exploration_policy_parameters = AdditiveNoiseParameters()
     self.continuous_exploration_policy_parameters.noise_schedule = LinearSchedule(0.1, 0.1, 50000)
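
The LinearSchedule(initial_value, final_value, decay_steps) objects used throughout these examples decay a value linearly over a fixed number of steps and then hold the final value; LinearSchedule(0.1, 0.1, 50000), as used for the noise schedule above, therefore keeps the noise level constant at 0.1. A minimal plain-Python sketch of that behaviour (illustrative only, not the rl_coach Schedule API):

def linear_schedule_value(step, initial=0.5, final=0.01, decay_steps=50000):
    # decays linearly from `initial` to `final` over `decay_steps` steps,
    # then stays at `final` (mirrors LinearSchedule(0.5, 0.01, 50000) above)
    fraction = min(step / decay_steps, 1.0)
    return initial + fraction * (final - initial)

assert abs(linear_schedule_value(0) - 0.5) < 1e-9
assert abs(linear_schedule_value(25000) - 0.255) < 1e-9
assert abs(linear_schedule_value(100000) - 0.01) < 1e-9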
Example No. 3
 def __init__(self):
     super().__init__(algorithm=PolicyGradientAlgorithmParameters(),
                      exploration={
                          DiscreteActionSpace: CategoricalParameters(),
                          BoxActionSpace: AdditiveNoiseParameters()
                      },
                      memory=SingleEpisodeBufferParameters(),
                      networks={"main": PolicyGradientNetworkParameters()})
Example No. 4
 def __init__(self):
     super().__init__(algorithm=ClippedPPOAlgorithmParameters(),
                      exploration={
                          DiscreteActionSpace: CategoricalParameters(),
                          BoxActionSpace: AdditiveNoiseParameters()
                      },
                      memory=EpisodicExperienceReplayParameters(),
                      networks={"main": ClippedPPONetworkParameters()})
Example No. 5
 def __init__(self):
     super().__init__(algorithm=PPOAlgorithmParameters(),
                      exploration=AdditiveNoiseParameters(),
                      memory=EpisodicExperienceReplayParameters(),
                      networks={
                          "critic": PPOCriticNetworkParameters(),
                          "actor": PPOActorNetworkParameters()
                      })
Example No. 6
 def __init__(self):
     super().__init__(algorithm=PPOAlgorithmParameters(),
                      exploration={
                          DiscreteActionSpace: CategoricalParameters(),
                          BoxActionSpace: AdditiveNoiseParameters()
                      },
                      memory=EpisodicExperienceReplayParameters(),
                      networks={
                          "critic": PPOCriticNetworkParameters(),
                          "actor": PPOActorNetworkParameters()
                      })
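
In the dictionary form above, the exploration policy is selected by the environment's action-space type: categorical sampling for discrete actions and additive noise for continuous (box) actions. A minimal sketch of overriding that mapping from a preset, assuming the standard rl_coach module layout (the import paths and the Clipped PPO preset below are an assumption, not part of the examples above):

from rl_coach.agents.clipped_ppo_agent import ClippedPPOAgentParameters
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
from rl_coach.exploration_policies.categorical import CategoricalParameters
from rl_coach.spaces import BoxActionSpace, DiscreteActionSpace

agent_params = ClippedPPOAgentParameters()
# choose the exploration policy based on the action-space type, as above
agent_params.exploration = {
    DiscreteActionSpace: CategoricalParameters(),
    BoxActionSpace: AdditiveNoiseParameters()
}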
Example No. 7
 def __init__(self, action_space: ActionSpace, epsilon_schedule: Schedule, evaluation_epsilon: float,
              architecture_num_q_heads: int, lamb: int,
              continuous_exploration_policy_parameters: ExplorationParameters = AdditiveNoiseParameters()):
     """
     :param action_space: the action space used by the environment
     :param epsilon_schedule: a schedule for the epsilon values
     :param evaluation_epsilon: the epsilon value to use for evaluation phases
     :param architecture_num_q_heads: the number of q heads to select from
     :param lamb: lambda coefficient for taking the standard deviation into account
     :param continuous_exploration_policy_parameters: the parameters of the continuous exploration policy to use
                                                      if the e-greedy is used for a continuous policy
     """
     super().__init__(action_space, epsilon_schedule, evaluation_epsilon, continuous_exploration_policy_parameters)
     self.num_heads = architecture_num_q_heads
     self.lamb = lamb
     self.std = 0
     self.last_action_values = 0
Example No. 8
    def __init__(self, action_space: ActionSpace, epsilon_schedule: Schedule,
                 evaluation_epsilon: float,
                 continuous_exploration_policy_parameters: ExplorationParameters=AdditiveNoiseParameters()):
        """
        :param action_space: the action space used by the environment
        :param epsilon_schedule: a schedule for the epsilon values
        :param evaluation_epsilon: the epsilon value to use for evaluation phases
        :param continuous_exploration_policy_parameters: the parameters of the continuous exploration policy to use
                                                         if the e-greedy is used for a continuous policy
        """
        super().__init__(action_space)
        self.epsilon_schedule = epsilon_schedule
        self.evaluation_epsilon = evaluation_epsilon

        if isinstance(self.action_space, BoxActionSpace):
            # for continuous e-greedy (see http://www.cs.ubc.ca/~van/papers/2017-TOG-deepLoco/2017-TOG-deepLoco.pdf)
            continuous_exploration_policy_parameters.action_space = action_space
            self.continuous_exploration_policy = \
                dynamic_import_and_instantiate_module_from_params(continuous_exploration_policy_parameters)

        self.current_random_value = np.random.rand()
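
The constructor above falls back to a continuous exploration policy (additive noise by default) when the action space is a BoxActionSpace. On the parameters side, the same setup can be sketched roughly as follows, assuming the standard rl_coach module layout; note that the AdditiveNoiseParameters attribute names (noise_schedule here) differ between rl_coach versions:

from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
from rl_coach.schedules import LinearSchedule

exploration = EGreedyParameters()
exploration.epsilon_schedule = LinearSchedule(1.0, 0.01, 100000)
exploration.evaluation_epsilon = 0.05
# used only if the environment exposes a continuous (box) action space
exploration.continuous_exploration_policy_parameters = AdditiveNoiseParameters()
exploration.continuous_exploration_policy_parameters.noise_schedule = LinearSchedule(0.1, 0.1, 50000)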
Example No. 9
 def __init__(
     self,
     action_space: ActionSpace,
     epsilon_schedule: Schedule,
     evaluation_epsilon: float,
     architecture_num_q_heads: int,
     continuous_exploration_policy_parameters: ExplorationParameters = AdditiveNoiseParameters(),
 ):
     """
     :param action_space: the action space used by the environment
     :param epsilon_schedule: a schedule for the epsilon values
     :param evaluation_epsilon: the epsilon value to use for evaluation phases
     :param continuous_exploration_policy_parameters: the parameters of the continuous exploration policy to use
                                                      if the e-greedy is used for a continuous policy
     :param architecture_num_q_heads: the number of q heads to select from
     """
     super().__init__(action_space, epsilon_schedule, evaluation_epsilon,
                      continuous_exploration_policy_parameters)
     self.num_heads = architecture_num_q_heads
     self.selected_head = 0
     self.last_action_values = 0
Example No. 10
# crop and rescale the image + use only the forward speed measurement
agent_params.input_filter = InputFilter()
agent_params.input_filter.add_observation_filter('CameraRGB', 'cropping',
                                                 ObservationCropFilter(crop_low=np.array([115, 0, 0]),
                                                                       crop_high=np.array([510, -1, -1])))
agent_params.input_filter.add_observation_filter('CameraRGB', 'rescale',
                                                 ObservationRescaleToSizeFilter(
                                                     ImageObservationSpace(np.array([88, 200, 3]), high=255)))
agent_params.input_filter.add_observation_filter('CameraRGB', 'to_uint8', ObservationToUInt8Filter(0, 255))
agent_params.input_filter.add_observation_filter(
    'measurements', 'select_speed',
    ObservationReductionBySubPartsNameFilter(
        ["forward_speed"], reduction_method=ObservationReductionBySubPartsNameFilter.ReductionMethod.Keep))

# no exploration is used
agent_params.exploration = AdditiveNoiseParameters()
agent_params.exploration.noise_percentage_schedule = ConstantSchedule(0)
agent_params.exploration.evaluation_noise_percentage = 0

# no playing during the training phase
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(0)

# use the following command line to download and extract the CARLA dataset:
# python rl_coach/utilities/carla_dataset_to_replay_buffer.py
agent_params.memory.load_memory_from_file_path = "./datasets/carla_train_set_replay_buffer.p"
agent_params.memory.state_key_with_the_class_index = 'high_level_command'
agent_params.memory.num_classes = 4

# download dataset if it doesn't exist
if not os.path.exists(agent_params.memory.load_memory_from_file_path):
    screen.log_title("The CARLA dataset is not present in the following path: {}"
Example No. 11
agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5
agent_params.network_wrappers['main'].adam_optimizer_beta2 = 0.999

agent_params.algorithm.clip_likelihood_ratio_using_epsilon = 0.2
agent_params.algorithm.clipping_decay_schedule = LinearSchedule(1.0, 0, 1000000)
agent_params.algorithm.beta_entropy = 0.01  # also try 0.001
agent_params.algorithm.gae_lambda = 0.95
agent_params.algorithm.discount = 0.9
agent_params.algorithm.optimization_epochs = 10
agent_params.algorithm.estimate_state_value_using_gae = True
agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentEpisodes(5)
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(5)
#exploration = CategoricalParameters()
exploration = AdditiveNoiseParameters()
#exploration  = EGreedyParameters()
#exploration.epsilon_schedule = LinearSchedule(0.5, 0.01, 100000)
agent_params.exploration = exploration
agent_params.memory.max_size = (MemoryGranularity.Transitions, 10**5)

###############
# Environment #
###############
DeepRacerInputFilter = InputFilter(is_a_reference_filter=True)
DeepRacerInputFilter.add_observation_filter('observation', 'to_grayscale',
                                            ObservationRGBToYFilter())
DeepRacerInputFilter.add_observation_filter('observation', 'to_uint8',
                                            ObservationToUInt8Filter(0, 255))
DeepRacerInputFilter.add_observation_filter('observation', 'stacking',
                                            ObservationStackingFilter(1))
Example No. 12
 def __init__(self):
     super().__init__(algorithm=PolicyGradientAlgorithmParameters(),
                      exploration=AdditiveNoiseParameters(),
                      memory=SingleEpisodeBufferParameters(),
                      networks={"main": PolicyGradientNetworkParameters()})
Example No. 13
 def __init__(self):
     super().__init__(algorithm=ClippedPPOAlgorithmParameters(),
                      exploration=AdditiveNoiseParameters(),
                      memory=EpisodicExperienceReplayParameters(),
                      networks={"main": ClippedPPONetworkParameters()})
Example No. 14
def get_clipped_ppo_params(agent_params, agent, params):
    """This function is algorithm specific settings required for Clipped PPO algorithm

    Args:
        agent_params (DeepRacerClippedPPOAgentParams): the agent parameters that will be used to create the RL agent
        agent (Agent): The agent object that was created either as part of create_rollout_agent or create_training_agent
        params (dict): dictionary of hyperparameters

    Returns:
        DeepRacerClippedPPOAgentParams: updated agent params object with hyperparameters and other required details
    """
    agent_params.network_wrappers['main'].learning_rate = params[
        HyperParameterKeys.LEARNING_RATE.value]

    agent_params.network_wrappers['main'].input_embedders_parameters = \
        create_input_embedder(agent.network_settings['input_embedders'],
                              agent.network_settings['embedder_type'],
                              agent.network_settings['activation_function'])
    agent_params.network_wrappers['main'].middleware_parameters = \
        create_middle_embedder(agent.network_settings['middleware_embedders'],
                               agent.network_settings['embedder_type'],
                               agent.network_settings['activation_function'])

    agent_params.network_wrappers['main'].batch_size = params[
        HyperParameterKeys.BATCH_SIZE.value]
    agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5
    agent_params.network_wrappers['main'].adam_optimizer_beta2 = 0.999

    if params[HyperParameterKeys.LOSS_TYPE.value] == LossTypes.HUBER.value:
        agent_params.network_wrappers['main'].replace_mse_with_huber_loss = True

    agent_params.algorithm.clip_likelihood_ratio_using_epsilon = 0.2
    agent_params.algorithm.beta_entropy = params[
        HyperParameterKeys.BETA_ENTROPY.value]
    agent_params.algorithm.gae_lambda = 0.95
    agent_params.algorithm.discount = params[
        HyperParameterKeys.DISCOUNT_FACTOR.value]
    agent_params.algorithm.optimization_epochs = params[
        HyperParameterKeys.NUM_EPOCHS.value]
    agent_params.algorithm.estimate_state_value_using_gae = True
    agent_params.algorithm.num_steps_between_copying_online_weights_to_target = \
        EnvironmentEpisodes(params[HyperParameterKeys.NUM_EPISODES_BETWEEN_TRAINING.value])
    agent_params.algorithm.num_consecutive_playing_steps = \
        EnvironmentEpisodes(params[HyperParameterKeys.NUM_EPISODES_BETWEEN_TRAINING.value])

    agent_params.algorithm.distributed_coach_synchronization_type = \
        DistributedCoachSynchronizationType.SYNC
    if params[HyperParameterKeys.EXPLORATION_TYPE.value].lower().strip() == \
            ExplorationTypes.CATEGORICAL.value:
        agent_params.exploration = {
            DiscreteActionSpace: DeepRacerCategoricalParameters(use_stochastic_evaluation_policy=False),
            ScalableBoxActionSpace: AdditiveNoiseParameters()
        }
    elif params[HyperParameterKeys.EXPLORATION_TYPE.value].lower().strip() == \
            ExplorationTypes.E_GREEDY.value:
        agent_params.exploration = {
            DiscreteActionSpace: EGreedyParameters(),
            ScalableBoxActionSpace: AdditiveNoiseParameters()
        }
        agent_params.exploration[DiscreteActionSpace].epsilon_schedule = \
            LinearSchedule(1.0,
                           params[HyperParameterKeys.E_GREEDY_VALUE.value],
                           params[HyperParameterKeys.EPSILON_STEPS.value])
    else:
        log_and_exit(
            "Unknown exploration_type found in hyper parameters. "
            "exploration_type: {}".format(
                params[HyperParameterKeys.EXPLORATION_TYPE.value].lower().strip()),
            SIMAPP_SIMULATION_WORKER_EXCEPTION, SIMAPP_EVENT_ERROR_CODE_500)

    agent_params.memory = DeepRacerMemoryParameters()
    return agent_params
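
For readability, the exploration-type branching above can be restated in isolation. The sketch below is illustrative: it swaps HyperParameterKeys and the DeepRacer-specific classes for plain string keys and the standard rl_coach parameter classes, so the names and values here are assumptions rather than the SimApp's actual ones:

from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
from rl_coach.exploration_policies.categorical import CategoricalParameters
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
from rl_coach.schedules import LinearSchedule
from rl_coach.spaces import BoxActionSpace, DiscreteActionSpace


def build_exploration(params: dict):
    # illustrative restatement of the branching in get_clipped_ppo_params,
    # using plain string keys instead of HyperParameterKeys
    exploration_type = params["exploration_type"].lower().strip()
    if exploration_type == "categorical":
        return {DiscreteActionSpace: CategoricalParameters(),
                BoxActionSpace: AdditiveNoiseParameters()}
    if exploration_type == "e_greedy":
        e_greedy = EGreedyParameters()
        e_greedy.epsilon_schedule = LinearSchedule(1.0,
                                                   params["e_greedy_value"],
                                                   params["epsilon_steps"])
        return {DiscreteActionSpace: e_greedy,
                BoxActionSpace: AdditiveNoiseParameters()}
    raise ValueError("Unknown exploration_type: {}".format(exploration_type))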