Example #1
 def __init__(self):
     super().__init__()
     self.epsilon_schedule = LinearSchedule(0.5, 0.01, 50000)
     self.evaluation_epsilon = 0.05
     self.continuous_exploration_policy_parameters = AdditiveNoiseParameters(
     )
     self.continuous_exploration_policy_parameters.noise_schedule = LinearSchedule(
         0.1, 0.1, 50000)
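For context, LinearSchedule(initial_value, final_value, decay_steps) anneals current_value linearly from the initial to the final value over decay_steps calls to step(), as the schedule tests further down this page exercise. A minimal standalone sketch (the rl_coach.schedules import path is an assumption; the 50000-step horizon mirrors the example above):

from rl_coach.schedules import LinearSchedule  # assumed import path

epsilon = LinearSchedule(0.5, 0.01, 50000)
print(epsilon.current_value)   # 0.5 before any step
for _ in range(50000):
    epsilon.step()
print(epsilon.current_value)   # ~0.01 once the decay horizon is exhausted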
Example #2
 def __init__(self):
     super().__init__()
     self.architecture_num_q_heads = 10
     self.bootstrapped_data_sharing_probability = 1.0
     self.epsilon_schedule = PieceWiseSchedule([
         (LinearSchedule(1, 0.1, 1000000), EnvironmentSteps(1000000)),
         (LinearSchedule(0.1, 0.01, 4000000), EnvironmentSteps(4000000))
     ])
     self.lamb = 0.1
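PieceWiseSchedule chains (schedule, duration) pairs: each sub-schedule is stepped for its stated duration and then the next one takes over, as test_piece_wise_schedule below verifies. A hedged sketch of reading the composite value (import paths are assumptions; the segments mirror the example above):

from rl_coach.core_types import EnvironmentSteps     # assumed import path
from rl_coach.schedules import LinearSchedule, PieceWiseSchedule

epsilon = PieceWiseSchedule([
    (LinearSchedule(1, 0.1, 1000000), EnvironmentSteps(1000000)),
    (LinearSchedule(0.1, 0.01, 4000000), EnvironmentSteps(4000000))
])
for _ in range(1000000):      # step through the whole first segment
    epsilon.step()
print(epsilon.current_value)  # ~0.1: the second segment takes over from here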
Example #3
 def __init__(self):
     super().__init__()
     self.noise_schedule = LinearSchedule(0.1, 0.1, 50000)
     self.evaluation_noise = 0.05
     self.clip_low = 0
     self.clip_high = 1
     self.noise_as_percentage_from_action_space = True
Example #4
 def __init__(self):
     super().__init__(algorithm=DQNAlgorithmParameters(),
                      exploration=EGreedyParameters(),
                      memory=ExperienceReplayParameters(),
                      networks={"main": DQNNetworkParameters()})
     self.exploration.epsilon_schedule = LinearSchedule(1, 0.1, 1000000)
     self.exploration.evaluation_epsilon = 0.05
Example #5
def set_agent_params(agent_params_func):
    #########
    # Agent #
    #########
    agent_params = agent_params_func()
    agent_params.network_wrappers['main'].batch_size = 128
    agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(
        100)
    agent_params.algorithm.discount = 0.99

    # to jump-start the agent's Q values and speed things up, we'll initialize the last Dense layer's bias
    # with a value on the order of the discounted return of a random policy
    agent_params.network_wrappers['main'].heads_parameters = \
        [QHeadParameters(output_bias_initializer=tf.constant_initializer(-100))]
    # agent_params.network_wrappers['main'].heads_parameters = \
    #     [QHeadParameters(output_bias_initializer=tf.constant_initializer(0))]

    # NN configuration
    agent_params.network_wrappers['main'].learning_rate = 0.0001
    agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False
    agent_params.network_wrappers['main'].softmax_temperature = 0.2

    # ER - we'll need an episodic replay buffer for off-policy evaluation
    agent_params.memory = EpisodicExperienceReplayParameters()

    # E-Greedy schedule - there is no exploration in Batch RL. Disabling E-Greedy.
    agent_params.exploration.epsilon_schedule = LinearSchedule(initial_value=0,
                                                               final_value=0,
                                                               decay_steps=1)
    agent_params.exploration.evaluation_epsilon = 0
    return agent_params
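A hedged usage sketch for the helper above: set_agent_params expects an agent-parameters factory, calls it, and returns the tuned object. DDQNAgentParameters is used purely for illustration, and its import path is an assumption:

from rl_coach.agents.ddqn_agent import DDQNAgentParameters  # assumed import path

agent_params = set_agent_params(DDQNAgentParameters)
assert agent_params.network_wrappers['main'].batch_size == 128   # set inside the helper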
Example #6
 def __init__(self):
     super().__init__()
     self.algorithm = QuantileRegressionDQNAlgorithmParameters()
     self.network_wrappers = {
         "main": QuantileRegressionDQNNetworkParameters()
     }
     self.exploration.epsilon_schedule = LinearSchedule(1, 0.01, 1000000)
     self.exploration.evaluation_epsilon = 0.001
Example #7
def test_init():
    # discrete control
    action_space = DiscreteActionSpace(3)
    noise_schedule = LinearSchedule(1.0, 1.0, 1000)

    # additive noise requires a bounded range for the actions
    action_space = BoxActionSpace(np.array([10]))
    with pytest.raises(ValueError):
        policy = AdditiveNoise(action_space, noise_schedule, 0)
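As the comment notes, AdditiveNoise needs a bounded action space; with explicit low/high bounds (the same constructor arguments test_get_action uses below) construction is expected to succeed. A brief sketch reusing the names from the test module:

bounded_space = BoxActionSpace(np.array([10]), -1, 1)   # bounded, unlike the space above
policy = AdditiveNoise(bounded_space, LinearSchedule(1.0, 1.0, 1000), 0)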
Example #8
 def __init__(self):
     super().__init__(algorithm=DQNAlgorithmParameters(),
                      exploration=EGreedyParameters(),
                      memory=ExperienceReplayParameters(),
                      networks={
                          "main": DQNNetworkParameters(),
                          "predictor": RNDNetworkParameters(),
                          "constant": RNDNetworkParameters()
                      })
     self.exploration.epsilon_schedule = LinearSchedule(1.0, 0.15, 15000)
def test_get_action():
    # make sure noise is in range
    action_space = BoxActionSpace(np.array([10]), -1, 1)
    noise_schedule = LinearSchedule(1.0, 1.0, 1000)
    policy = AdditiveNoise(action_space, noise_schedule, 0)

    # the action range is 2, so there is a ~0.1% chance that the noise will be larger than 3*std=3*2=6
    for i in range(1000):
        action = policy.get_action(np.zeros([10]))
        assert np.all(action < 10)
        # make sure there is no clipping of the action since it should be the environment that clips actions
        assert np.all(action != 1.0)
        assert np.all(action != -1.0)
        # make sure that each action element has a different value
        assert np.all(action[0] != action[1:])
def test_change_phase():
    # discrete control
    action_space = DiscreteActionSpace(3)
    epsilon_schedule = LinearSchedule(1.0, 0.1, 1000)
    policy = EGreedy(action_space, epsilon_schedule, evaluation_epsilon=0.01)

    # verify the schedule does not advance when not in the training phase
    assert policy.get_control_param() == 1.0
    policy.change_phase(RunPhase.TEST)
    best_action = policy.get_action(np.array([10, 20, 30]))
    assert policy.epsilon_schedule.current_value == 1.0
    policy.change_phase(RunPhase.HEATUP)
    best_action = policy.get_action(np.array([10, 20, 30]))
    assert policy.epsilon_schedule.current_value == 1.0
    policy.change_phase(RunPhase.UNDEFINED)
    best_action = policy.get_action(np.array([10, 20, 30]))
    assert policy.epsilon_schedule.current_value == 1.0
def test_piece_wise_schedule():
    # piece-wise schedule: a linear increase, a constant segment, then an exponential decay
    schedule = PieceWiseSchedule(
        [(LinearSchedule(1, 3, 10), EnvironmentSteps(5)),
         (ConstantSchedule(4), EnvironmentSteps(10)),
         (ExponentialSchedule(3, 1, 0.99), EnvironmentSteps(10))
         ]
    )

    target_values = np.append(np.linspace(1, 2, 6), np.ones(11)*4)
    for i in range(16):
        assert round(schedule.current_value, 4) == round(target_values[i], 4)
        schedule.step()

    current_power = 1
    for i in range(10):
        assert round(schedule.current_value, 4) == round(3*current_power, 4)
        current_power *= 0.99
        schedule.step()
def test_get_action():
    # discrete control
    action_space = DiscreteActionSpace(3)
    epsilon_schedule = LinearSchedule(1.0, 1.0, 1000)
    policy = EGreedy(action_space, epsilon_schedule, evaluation_epsilon=0)

    # verify that test phase gives greedy actions (evaluation_epsilon = 0)
    policy.change_phase(RunPhase.TEST)
    for i in range(100):
        best_action, _ = policy.get_action(np.array([10, 20, 30]))
        assert best_action == 2

    # verify that train phase gives uniform actions (exploration = 1)
    policy.change_phase(RunPhase.TRAIN)
    counters = np.array([0, 0, 0])
    for i in range(30000):
        best_action, _ = policy.get_action(np.array([10, 20, 30]))
        counters[best_action] += 1
    assert np.all(counters > 9500)  # this is noisy so we allow 5% error
def test_get_control_param():
    # discrete control
    action_space = DiscreteActionSpace(3)
    epsilon_schedule = LinearSchedule(1.0, 0.1, 1000)
    policy = EGreedy(action_space, epsilon_schedule, evaluation_epsilon=0.01)

    # verify schedule applies to TRAIN phase
    policy.change_phase(RunPhase.TRAIN)
    for i in range(999):
        best_action = policy.get_action(np.array([10, 20, 30]))
        assert 1.0 > policy.get_control_param() > 0.1
    best_action = policy.get_action(np.array([10, 20, 30]))
    assert policy.get_control_param() == 0.1

    # test phases
    policy.change_phase(RunPhase.TEST)
    assert policy.get_control_param() == 0.01

    policy.change_phase(RunPhase.TRAIN)
    assert policy.get_control_param() == 0.1

    policy.change_phase(RunPhase.HEATUP)
    assert policy.get_control_param() == 0.1
Example #14
schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentSteps(50000000)
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(1000000)
schedule_params.evaluation_steps = EnvironmentSteps(125000)
schedule_params.heatup_steps = EnvironmentSteps(20000)

#########
# Agent #
#########
agent_params = RainbowDQNAgentParameters()

agent_params.network_wrappers['main'].learning_rate = 0.0000625
agent_params.network_wrappers['main'].optimizer_epsilon = 1.5e-4
agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(
    32000 // 4)  # 32k frames
agent_params.memory.beta = LinearSchedule(
    0.4, 1, 12500000)  # 12.5M training iterations = 50M steps = 200M frames
agent_params.memory.alpha = 0.5

###############
# Environment #
###############
env_params = Atari()
env_params.level = SingleLevelSelection(atari_deterministic_v4)

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [
    SelectedPhaseOnlyDumpMethod(RunPhase.TEST),
    MaxDumpMethod()
]
vis_params.dump_mp4 = False
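For completeness, a hedged sketch of how a preset like this is usually wired together, mirroring the BasicRLGraphManager construction used in the get_graph_manager examples further down the page (preset_validation_params is simply omitted here):

graph_manager = BasicRLGraphManager(agent_params=agent_params,
                                    env_params=env_params,
                                    schedule_params=schedule_params,
                                    vis_params=vis_params)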
def get_graph_manager(**hp_dict):
    ####################
    # All Default Parameters #
    ####################
    params = {}
    params["batch_size"] = int(hp_dict.get("batch_size", 64))
    params["num_epochs"] = int(hp_dict.get("num_epochs", 10))
    params["stack_size"] = int(hp_dict.get("stack_size", 1))
    params["lr"] = float(hp_dict.get("lr", 0.0003))
    params["exploration_type"] = (hp_dict.get("exploration_type",
                                              "huber")).lower()
    params["e_greedy_value"] = float(hp_dict.get("e_greedy_value", .05))
    params["epsilon_steps"] = int(hp_dict.get("epsilon_steps", 10000))
    params["beta_entropy"] = float(hp_dict.get("beta_entropy", .01))
    params["discount_factor"] = float(hp_dict.get("discount_factor", .999))
    params["loss_type"] = hp_dict.get("loss_type",
                                      "Mean squared error").lower()
    params["num_episodes_between_training"] = int(
        hp_dict.get("num_episodes_between_training", 20))
    params["term_cond_max_episodes"] = int(
        hp_dict.get("term_cond_max_episodes", 100000))
    params["term_cond_avg_score"] = float(
        hp_dict.get("term_cond_avg_score", 100000))

    params_json = json.dumps(params, indent=2, sort_keys=True)
    print("Using the following hyper-parameters", params_json, sep='\n')

    ####################
    # Graph Scheduling #
    ####################
    schedule_params = ScheduleParameters()
    schedule_params.improve_steps = TrainingSteps(
        params["term_cond_max_episodes"])
    schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
    schedule_params.evaluation_steps = EnvironmentEpisodes(5)
    schedule_params.heatup_steps = EnvironmentSteps(0)

    #########
    # Agent #
    #########
    agent_params = ClippedPPOAgentParameters()

    agent_params.network_wrappers['main'].learning_rate = params["lr"]
    agent_params.network_wrappers['main'].input_embedders_parameters[
        'observation'].activation_function = 'relu'
    agent_params.network_wrappers[
        'main'].middleware_parameters.activation_function = 'relu'
    agent_params.network_wrappers['main'].batch_size = params["batch_size"]
    agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5
    agent_params.network_wrappers['main'].adam_optimizer_beta2 = 0.999

    if params["loss_type"] == "huber":
        agent_params.network_wrappers[
            'main'].replace_mse_with_huber_loss = True

    agent_params.algorithm.clip_likelihood_ratio_using_epsilon = 0.2
    agent_params.algorithm.clipping_decay_schedule = LinearSchedule(
        1.0, 0, 1000000)
    agent_params.algorithm.beta_entropy = params["beta_entropy"]
    agent_params.algorithm.gae_lambda = 0.95
    agent_params.algorithm.discount = params["discount_factor"]
    agent_params.algorithm.optimization_epochs = params["num_epochs"]
    agent_params.algorithm.estimate_state_value_using_gae = True
    agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentEpisodes(
        params["num_episodes_between_training"])
    agent_params.algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(
        params["num_episodes_between_training"])

    agent_params.algorithm.distributed_coach_synchronization_type = DistributedCoachSynchronizationType.SYNC

    if params["exploration_type"] == "categorical":
        agent_params.exploration = CategoricalParameters()
    else:
        agent_params.exploration = EGreedyParameters()
        agent_params.exploration.epsilon_schedule = LinearSchedule(
            1.0, params["e_greedy_value"], params["epsilon_steps"])

    ###############
    # Environment #
    ###############
    SilverstoneInputFilter = InputFilter(is_a_reference_filter=True)
    SilverstoneInputFilter.add_observation_filter('observation',
                                                  'to_grayscale',
                                                  ObservationRGBToYFilter())
    SilverstoneInputFilter.add_observation_filter(
        'observation', 'to_uint8', ObservationToUInt8Filter(0, 255))
    SilverstoneInputFilter.add_observation_filter(
        'observation', 'stacking',
        ObservationStackingFilter(params["stack_size"]))

    env_params = GymVectorEnvironment()
    env_params.default_input_filter = SilverstoneInputFilter
    env_params.level = 'SilverstoneRacetrack-Discrete-v0'

    vis_params = VisualizationParameters()
    vis_params.dump_mp4 = False

    ########
    # Test #
    ########
    preset_validation_params = PresetValidationParameters()
    preset_validation_params.test = True
    preset_validation_params.min_reward_threshold = 400
    preset_validation_params.max_episodes_to_achieve_reward = 1000

    graph_manager = BasicRLGraphManager(
        agent_params=agent_params,
        env_params=env_params,
        schedule_params=schedule_params,
        vis_params=vis_params,
        preset_validation_params=preset_validation_params)
    return graph_manager, params_json
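A hedged usage sketch for the factory above: hyper-parameters arrive as keyword arguments (anything omitted falls back to the defaults listed at the top of the function), and the returned graph manager is then trained with improve(), the usual rl_coach entry point. The specific values are illustrative only:

graph_manager, params_json = get_graph_manager(batch_size=32, lr=0.0001)
graph_manager.improve()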
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(10)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(N)

####################
# DQN Agent Params #
####################
agent_params = DDQNAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.00025
agent_params.network_wrappers['main'].heads_parameters = [
    DuelingQHeadParameters()
]
agent_params.memory.max_size = (MemoryGranularity.Transitions, 1000000)
agent_params.algorithm.discount = 0.99
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(4)
agent_params.exploration.epsilon_schedule = LinearSchedule(
    1, 0.1, (N + 7) * 2000)
agent_params.input_filter = NoInputFilter()
agent_params.output_filter = NoOutputFilter()

###############
# Environment #
###############
env_params = GymEnvironmentParameters()
env_params.level = 'rl_coach.environments.toy_problems.exploration_chain:ExplorationChain'
env_params.additional_simulator_parameters = {
    'chain_length': N,
    'max_steps': N + 7
}

vis_params = VisualizationParameters()
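Note that the exploration horizon above scales with the chain length N; a tiny sketch of the arithmetic with a purely hypothetical N:

N = 13                                  # hypothetical chain length, for illustration only
epsilon_decay_steps = (N + 7) * 2000    # = 40000 environment steps for this N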
schedule_params.improve_steps = EnvironmentSteps(6250000)
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(62500)
schedule_params.evaluation_steps = EnvironmentSteps(6250)
schedule_params.heatup_steps = EnvironmentSteps(1)

#########
# Agent #
#########
agent_params = DFPAgentParameters()

agent_params.network_wrappers['main'].learning_rate = 0.0001
# the original DFP code decays epsilon over ~1.5M steps. Unlike most other papers, these are 1.5M
# *training* steps; a training batch is sampled once every 8 playing steps, so this is equivalent
# to roughly 1.5M * 8 = 12M playing steps per worker.
# TODO: allow the epsilon schedule to be defined in terms of training steps.
agent_params.exploration.epsilon_schedule = LinearSchedule(1, 0, 12000000)
agent_params.exploration.evaluation_epsilon = 0
agent_params.algorithm.use_accumulated_reward_as_measurement = False
agent_params.algorithm.goal_vector = [0.5, 0.5, 1]  # ammo, health, frag count
agent_params.network_wrappers['main'].input_embedders_parameters[
    'measurements'].input_rescaling['vector'] = 100.
agent_params.algorithm.scale_measurements_targets['GameVariable.HEALTH'] = 30.0
agent_params.algorithm.scale_measurements_targets['GameVariable.AMMO2'] = 7.5
agent_params.algorithm.scale_measurements_targets['GameVariable.USER2'] = 1.0
agent_params.network_wrappers['main'].learning_rate_decay_rate = 0.3
agent_params.network_wrappers['main'].learning_rate_decay_steps = 250000
agent_params.network_wrappers['main'].input_embedders_parameters[
    'measurements'].input_offset['vector'] = 0.5
agent_params.network_wrappers['main'].input_embedders_parameters[
    'observation'].input_offset['vector'] = 0.5
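The epsilon-decay comment above reduces to a single multiplication; a small sketch of that arithmetic (the 8-to-1 playing-to-training ratio is taken from the comment, not measured):

training_decay_steps = 1500000          # decay horizon of the original DFP code, in training steps
playing_steps_per_training_step = 8     # a training batch is sampled once every 8 playing steps
epsilon_decay_playing_steps = training_decay_steps * playing_steps_per_training_step  # = 12000000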
Example #18
 def __init__(self):
     super().__init__()
     self.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(30000)
     self.exploration.epsilon_schedule = LinearSchedule(1, 0.01, 1000000)
     self.exploration.evaluation_epsilon = 0.001
Example #19
 def __init__(self):
     super().__init__()
     self.architecture_num_q_heads = 10
     self.bootstrapped_data_sharing_probability = 1.0
     self.epsilon_schedule = LinearSchedule(1, 0.01, 1000000)
Example #20
 def __init__(self):
     super().__init__()
     self.epsilon_schedule = LinearSchedule(1, 0.01, 1000000)
     self.evaluation_epsilon = 0.001
Example #21
 def __init__(self):
     super().__init__()
     self.algorithm = DDQNBCQAlgorithmParameters()
     self.exploration.epsilon_schedule = LinearSchedule(1, 0.01, 1000000)
     self.exploration.evaluation_epsilon = 0.001
schedule_params.improve_steps = EnvironmentSteps(50000000)
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(250000)
schedule_params.evaluation_steps = EnvironmentSteps(135000)
schedule_params.heatup_steps = EnvironmentSteps(50000)

#########
# Agent #
#########
agent_params = DDQNAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.0001
agent_params.network_wrappers['main'].middleware_parameters.scheme = MiddlewareScheme.Empty
agent_params.network_wrappers['main'].heads_parameters = [DuelingQHeadParameters()]
agent_params.network_wrappers['main'].clip_gradients = 10
agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(40000)
agent_params.exploration.epsilon_schedule = PieceWiseSchedule(
    [(LinearSchedule(1, 0.1, 1000000), EnvironmentSteps(1000000)),
     (LinearSchedule(0.1, 0.01, 10000000), EnvironmentSteps(1000000)),
     (ConstantSchedule(0.001), EnvironmentSteps(10000000))]
)
agent_params.memory = PrioritizedExperienceReplayParameters()
agent_params.memory.beta = LinearSchedule(0.4, 1, 12500000)  # 12.5M training iterations = 50M steps = 200M frames

###############
# Environment #
###############
env_params = Atari()
env_params.level = SingleLevelSelection(atari_deterministic_v4)

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False
Example #23
 def __init__(self):
     super().__init__()
     self.noise_percentage_schedule = LinearSchedule(0.1, 0.1, 50000)
     self.evaluation_noise_percentage = 0.05
Example #24
agent_params.algorithm.estimate_state_value_using_gae = True
agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(2048)

agent_params.network_wrappers["main"].learning_rate = 0.0003
agent_params.network_wrappers["main"].input_embedders_parameters[
    "observation"
].activation_function = "tanh"
agent_params.network_wrappers["main"].input_embedders_parameters["observation"].scheme = [Dense(64)]
agent_params.network_wrappers["main"].middleware_parameters.scheme = [Dense(64)]
agent_params.network_wrappers["main"].middleware_parameters.activation_function = "tanh"
agent_params.network_wrappers["main"].batch_size = 64
agent_params.network_wrappers["main"].optimizer_epsilon = 1e-5
agent_params.network_wrappers["main"].clip_gradients = 40.0

agent_params.exploration = EGreedyParameters()
agent_params.exploration.epsilon_schedule = LinearSchedule(1.0, 0.01, 10000)

###############
# Environment #
###############
env_params = GymVectorEnvironment(level="autoscalesim:SimpleScalableWebserviceSim")

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 150
preset_validation_params.max_episodes_to_achieve_reward = 400

graph_manager = BasicRLGraphManager(
Example #25
def get_graph_manager(hp_dict, agent_list, run_phase_subject):
    ####################
    # All Default Parameters #
    ####################
    params = {}
    params["batch_size"] = int(hp_dict.get("batch_size", 64))
    params["num_epochs"] = int(hp_dict.get("num_epochs", 10))
    params["stack_size"] = int(hp_dict.get("stack_size", 1))
    params["lr"] = float(hp_dict.get("lr", 0.0003))
    params["exploration_type"] = (hp_dict.get("exploration_type",
                                              "categorical")).lower()
    params["e_greedy_value"] = float(hp_dict.get("e_greedy_value", .05))
    params["epsilon_steps"] = int(hp_dict.get("epsilon_steps", 10000))
    params["beta_entropy"] = float(hp_dict.get("beta_entropy", .01))
    params["discount_factor"] = float(hp_dict.get("discount_factor", .999))
    params["loss_type"] = hp_dict.get("loss_type",
                                      "Mean squared error").lower()
    params["num_episodes_between_training"] = int(
        hp_dict.get("num_episodes_between_training", 20))
    params["term_cond_max_episodes"] = int(
        hp_dict.get("term_cond_max_episodes", 100000))
    params["term_cond_avg_score"] = float(
        hp_dict.get("term_cond_avg_score", 100000))

    params_json = json.dumps(params, indent=2, sort_keys=True)
    print("Using the following hyper-parameters", params_json, sep='\n')

    ####################
    # Graph Scheduling #
    ####################
    schedule_params = ScheduleParameters()
    schedule_params.improve_steps = TrainingSteps(
        params["term_cond_max_episodes"])
    schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
    schedule_params.evaluation_steps = EnvironmentEpisodes(5)
    schedule_params.heatup_steps = EnvironmentSteps(0)

    #########
    # Agent #
    #########
    trainable_agents_list = list()
    non_trainable_agents_list = list()

    for agent in agent_list:
        agent_params = DeepRacerAgentParams()
        if agent.network_settings:
            agent_params.env_agent = agent
            agent_params.network_wrappers['main'].learning_rate = params["lr"]

            agent_params.network_wrappers['main'].input_embedders_parameters = \
                create_input_embedder(agent.network_settings['input_embedders'],
                                      agent.network_settings['embedder_type'],
                                      agent.network_settings['activation_function'])
            agent_params.network_wrappers['main'].middleware_parameters = \
                create_middle_embedder(agent.network_settings['middleware_embedders'],
                                       agent.network_settings['embedder_type'],
                                       agent.network_settings['activation_function'])

            input_filter = InputFilter(is_a_reference_filter=True)
            for observation in agent.network_settings['input_embedders'].keys():
                if observation == Input.LEFT_CAMERA.value or observation == Input.CAMERA.value or \
                        observation == Input.OBSERVATION.value:
                    input_filter.add_observation_filter(
                        observation, 'to_grayscale', ObservationRGBToYFilter())
                    input_filter.add_observation_filter(
                        observation, 'to_uint8',
                        ObservationToUInt8Filter(0, 255))
                    input_filter.add_observation_filter(
                        observation, 'stacking', ObservationStackingFilter(1))

                if observation == Input.STEREO.value:
                    input_filter.add_observation_filter(
                        observation, 'to_uint8',
                        ObservationToUInt8Filter(0, 255))

                if observation == Input.LIDAR.value:
                    input_filter.add_observation_filter(
                        observation, 'clipping',
                        ObservationClippingFilter(0.15, 1.0))
                if observation == Input.SECTOR_LIDAR.value:
                    input_filter.add_observation_filter(
                        observation, 'binary', ObservationBinarySectorFilter())
            agent_params.input_filter = input_filter()

            agent_params.network_wrappers['main'].batch_size = params[
                "batch_size"]
            agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5
            agent_params.network_wrappers['main'].adam_optimizer_beta2 = 0.999

            if params["loss_type"] == "huber":
                agent_params.network_wrappers[
                    'main'].replace_mse_with_huber_loss = True

            agent_params.algorithm.clip_likelihood_ratio_using_epsilon = 0.2
            agent_params.algorithm.clipping_decay_schedule = LinearSchedule(
                1.0, 0, 1000000)
            agent_params.algorithm.beta_entropy = params["beta_entropy"]
            agent_params.algorithm.gae_lambda = 0.95
            agent_params.algorithm.discount = params["discount_factor"]
            agent_params.algorithm.optimization_epochs = params["num_epochs"]
            agent_params.algorithm.estimate_state_value_using_gae = True
            agent_params.algorithm.num_steps_between_copying_online_weights_to_target = \
                EnvironmentEpisodes(params["num_episodes_between_training"])
            agent_params.algorithm.num_consecutive_playing_steps = \
                EnvironmentEpisodes(params["num_episodes_between_training"])

            agent_params.algorithm.distributed_coach_synchronization_type = \
                DistributedCoachSynchronizationType.SYNC

            if params["exploration_type"] == "categorical":
                agent_params.exploration = CategoricalParameters()
            else:
                agent_params.exploration = EGreedyParameters()
                agent_params.exploration.epsilon_schedule = LinearSchedule(
                    1.0, params["e_greedy_value"], params["epsilon_steps"])

            trainable_agents_list.append(agent_params)
        else:
            non_trainable_agents_list.append(agent)

    ###############
    # Environment #
    ###############
    env_params = DeepRacerRacetrackEnvParameters()
    env_params.agents_params = trainable_agents_list
    env_params.non_trainable_agents = non_trainable_agents_list
    env_params.level = 'DeepRacerRacetrackEnv-v0'
    env_params.run_phase_subject = run_phase_subject

    vis_params = VisualizationParameters()
    vis_params.dump_mp4 = False

    ########
    # Test #
    ########
    preset_validation_params = PresetValidationParameters()
    preset_validation_params.test = True
    preset_validation_params.min_reward_threshold = 400
    preset_validation_params.max_episodes_to_achieve_reward = 10000

    graph_manager = MultiAgentGraphManager(
        agents_params=trainable_agents_list,
        env_params=env_params,
        schedule_params=schedule_params,
        vis_params=vis_params,
        preset_validation_params=preset_validation_params)
    return graph_manager, params_json
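A hedged usage sketch for this multi-agent factory: agent_list entries with network_settings become trainable agents, the rest are passed through to the environment, and the resulting graph manager is trained as usual. The agents variable below is purely hypothetical:

agents = build_agent_list()   # hypothetical helper returning descriptors with .network_settings
graph_manager, params_json = get_graph_manager(hp_dict={'lr': 0.0001},
                                               agent_list=agents,
                                               run_phase_subject=None)
graph_manager.improve()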
Example #26
agent_params.algorithm.discount = 0.99

# to jump-start the agent's Q values and speed things up, we'll initialize the last Dense layer's bias
# with a value on the order of the discounted return of a random policy
agent_params.network_wrappers['main'].heads_parameters = \
    [QHeadParameters(output_bias_initializer=tf.constant_initializer(-100))]

# NN configuration
agent_params.network_wrappers['main'].learning_rate = 0.0001
agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False

# ER - we'll need an episodic replay buffer for off-policy evaluation
agent_params.memory = EpisodicExperienceReplayParameters()

# E-Greedy schedule - there is no exploration in Batch RL. Disabling E-Greedy. 
agent_params.exploration.epsilon_schedule = LinearSchedule(initial_value=0, final_value=0, decay_steps=1)
agent_params.exploration.evaluation_epsilon = 0

# either a kNN-based or an NN-based model can be used to predict which actions not to max over in the Bellman equation
#agent_params.algorithm.action_drop_method_parameters = KNNParameters()


DATASET_PATH = 'acrobot_dataset.csv'
agent_params.memory = EpisodicExperienceReplayParameters()
agent_params.memory.load_memory_from_file_path = CsvDataset(DATASET_PATH, is_episodic=True)

spaces = SpacesDefinition(state=StateSpace({'observation': VectorObservationSpace(shape=6)}),
                          goal=None,
                          action=DiscreteActionSpace(3),
                          reward=RewardSpace(1))
Example #27
agent_params.network_wrappers['main'].learning_rate = 0.0003
agent_params.network_wrappers['main'].input_embedders_parameters[
    'observation'].activation_function = 'tanh'
agent_params.network_wrappers['main'].input_embedders_parameters[
    'observation'].scheme = [Dense(64)]
agent_params.network_wrappers['main'].middleware_parameters.scheme = [
    Dense(64)
]
agent_params.network_wrappers[
    'main'].middleware_parameters.activation_function = 'tanh'
agent_params.network_wrappers['main'].batch_size = 64
agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5
agent_params.network_wrappers['main'].adam_optimizer_beta2 = 0.999

agent_params.algorithm.clip_likelihood_ratio_using_epsilon = 0.2
agent_params.algorithm.clipping_decay_schedule = LinearSchedule(
    1.0, 0, 1000000)
agent_params.algorithm.beta_entropy = 0
agent_params.algorithm.gae_lambda = 0.95
agent_params.algorithm.discount = 0.99
agent_params.algorithm.optimization_epochs = 10
agent_params.algorithm.estimate_state_value_using_gae = True
agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(
    2048)

# Distributed Coach synchronization type.
agent_params.algorithm.distributed_coach_synchronization_type = DistributedCoachSynchronizationType.SYNC

agent_params.exploration = EGreedyParameters()
agent_params.exploration.epsilon_schedule = LinearSchedule(1.0, 0.01, 10000)
agent_params.pre_network_filter.add_observation_filter(
    'observation', 'normalize_observation',
# agent_params.algorithm.action_drop_method_parameters = KNNParameters()
agent_params.algorithm.action_drop_method_parameters = NNImitationModelParameters()

# NN configuration
agent_params.network_wrappers['main'].learning_rate = 0.0001
agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False
agent_params.network_wrappers['main'].softmax_temperature = 0.2

# ER size
agent_params.memory = EpisodicExperienceReplayParameters()
# DATASET_PATH = 'acrobot.csv'
# agent_params.memory.load_memory_from_file_path = CsvDataset(DATASET_PATH, True)

# E-Greedy schedule
agent_params.exploration.epsilon_schedule = LinearSchedule(0, 0, 10000)
agent_params.exploration.evaluation_epsilon = 0

# Experience Generating Agent parameters
experience_generating_agent_params = DDQNAgentParameters()

# schedule parameters
experience_generating_schedule_params = ScheduleParameters()
experience_generating_schedule_params.heatup_steps = EnvironmentSteps(1000)
experience_generating_schedule_params.improve_steps = TrainingSteps(
    DATASET_SIZE - experience_generating_schedule_params.heatup_steps.num_steps)
experience_generating_schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(10)
experience_generating_schedule_params.evaluation_steps = EnvironmentEpisodes(1)

# DQN params
experience_generating_agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(100)
    ]

agent_params.network_wrappers['main'].input_embedders_parameters[
    'observation'].activation_function = 'relu'
agent_params.network_wrappers[
    'main'].middleware_parameters.activation_function = 'relu'
#agent_params.network_wrappers['main'].middleware_parameters.scheme = [
#    Conv2dWithAttention(64, 3, 1, 1000)
#]

agent_params.network_wrappers['main'].batch_size = 64
agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5
agent_params.network_wrappers['main'].adam_optimizer_beta2 = 0.999

agent_params.algorithm.clip_likelihood_ratio_using_epsilon = 0.2
agent_params.algorithm.clipping_decay_schedule = LinearSchedule(
    1.0, 0, 1000000)
agent_params.algorithm.beta_entropy = 0.01  # also try 0.001
agent_params.algorithm.gae_lambda = 0.95
agent_params.algorithm.discount = 0.999
agent_params.algorithm.optimization_epochs = 10
agent_params.algorithm.estimate_state_value_using_gae = True
agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentEpisodes(
    20)
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(20)

agent_params.exploration = CategoricalParameters()

agent_params.algorithm.distributed_coach_synchronization_type = DistributedCoachSynchronizationType.SYNC

###############
# Environment #
Example #30
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(50)
schedule_params.evaluation_steps = EnvironmentEpisodes(3)
schedule_params.heatup_steps = EnvironmentSteps(1000)


#########
# Agent #
#########
agent_params = DDQNAgentParameters()
agent_params.memory.max_size = (MemoryGranularity.Transitions, 5000)
agent_params.network_wrappers['main'].learning_rate = 0.00025
agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(1000)
agent_params.exploration.epsilon_schedule = LinearSchedule(0.5, 0.01, 50000)
agent_params.exploration.evaluation_epsilon = 0
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(1)
agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False
agent_params.network_wrappers['main'].heads_parameters = [DuelingQHeadParameters()]

###############
# Environment #
###############
env_params = DoomEnvironmentParameters()
env_params.level = 'basic'

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False