Example #1
def set_agent_params(agent_params_func):
    #########
    # Agent #
    #########
    agent_params = agent_params_func()
    agent_params.network_wrappers['main'].batch_size = 128
    agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(100)
    agent_params.algorithm.discount = 0.99

    # to jump-start the agent's Q-values and speed things up, we initialize the last Dense layer's
    # bias with a value on the order of the discounted return of a random policy
    agent_params.network_wrappers['main'].heads_parameters = \
        [QHeadParameters(output_bias_initializer=tf.constant_initializer(-100))]
    # agent_params.network_wrappers['main'].heads_parameters = \
    #     [QHeadParameters(output_bias_initializer=tf.constant_initializer(0))]

    # NN configuration
    agent_params.network_wrappers['main'].learning_rate = 0.0001
    agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False
    agent_params.network_wrappers['main'].softmax_temperature = 0.2

    # ER - we'll need an episodic replay buffer for off-policy evaluation
    agent_params.memory = EpisodicExperienceReplayParameters()

    # E-Greedy schedule - there is no exploration in Batch RL, so E-Greedy is disabled
    agent_params.exploration.epsilon_schedule = LinearSchedule(initial_value=0,
                                                               final_value=0,
                                                               decay_steps=1)
    agent_params.exploration.evaluation_epsilon = 0
    return agent_params
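
The snippets on this page are built on Intel's RL Coach and its TF1-style TensorFlow backend. A minimal import block they appear to rely on is sketched below; the exact module paths are assumptions based on rl_coach's usual package layout and may differ between Coach versions.

# Sketch of the imports the examples on this page appear to rely on.
# Module paths are assumptions and may need adjusting for your Coach version.
import tensorflow as tf  # TF1-style API (tf.reset_default_graph, tf.constant_initializer)

from rl_coach.agents.dqn_agent import DQNAgentParameters
from rl_coach.agents.ddqn_agent import DDQNAgentParameters
from rl_coach.agents.ddqn_bcq_agent import DDQNBCQAgentParameters, KNNParameters, NNImitationModelParameters
from rl_coach.architectures.embedder_parameters import InputEmbedderParameters
from rl_coach.architectures.head_parameters import QHeadParameters
from rl_coach.architectures.middleware_parameters import FCMiddlewareParameters
from rl_coach.base_parameters import EmbedderScheme, MiddlewareScheme, VisualizationParameters, TaskParameters
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps
from rl_coach.graph_managers.batch_rl_graph_manager import BatchRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.memories.episodic import EpisodicExperienceReplayParameters
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.schedules import LinearSchedule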
Example #2
 def __init__(self):
     super().__init__()
     self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
     self.middleware_parameters = FCMiddlewareParameters(scheme=MiddlewareScheme.Medium)
     self.heads_parameters = [QHeadParameters()]
     self.optimizer_type = 'Adam'
     self.batch_size = 32
     self.replace_mse_with_huber_loss = True
     self.create_target_network = True
Example #3

 def __init__(self):
     super().__init__()
     self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
     self.middleware_parameters = FCMiddlewareParameters()
     self.heads_parameters = [QHeadParameters(loss_weight=0.5), ACERPolicyHeadParameters(loss_weight=1.0)]
     self.optimizer_type = 'Adam'
     self.async_training = True
     self.clip_gradients = 40.0
     self.create_target_network = True
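
Examples #2 and #3 are __init__ bodies of network-parameter classes rather than standalone functions. As a rough guide, they would normally live inside a NetworkParameters subclass, along the lines of Coach's own DQN network parameters; a minimal sketch follows (the class name is hypothetical, and the import paths are assumptions).

# Minimal sketch of the enclosing class for an __init__ like Example #2's.
# The class name is hypothetical; in Coach the DQN variant is called DQNNetworkParameters.
from rl_coach.architectures.embedder_parameters import InputEmbedderParameters
from rl_coach.architectures.head_parameters import QHeadParameters
from rl_coach.architectures.middleware_parameters import FCMiddlewareParameters
from rl_coach.base_parameters import MiddlewareScheme, NetworkParameters


class MyDQNNetworkParameters(NetworkParameters):
    def __init__(self):
        super().__init__()
        # one input embedder per observation, a fully connected middleware, and a Q head
        self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
        self.middleware_parameters = FCMiddlewareParameters(scheme=MiddlewareScheme.Medium)
        self.heads_parameters = [QHeadParameters()]
        self.optimizer_type = 'Adam'
        self.batch_size = 32
        self.replace_mse_with_huber_loss = True
        self.create_target_network = True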
Example #4
 def __init__(self):
     super().__init__()
     self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
     self.middleware_parameters = FCMiddlewareParameters()
     self.heads_parameters = [QHeadParameters()]
     self.optimizer_type = 'Adam'
     self.async_training = True
     self.shared_optimizer = True
     self.create_target_network = True
Example #5
 def __init__(self):
     super().__init__()
     self.input_embedders_parameters = {
         'observation': InputEmbedderParameters(scheme=EmbedderScheme.Empty)
     }
     self.middleware_parameters = VGG16MiddlewareParameters(scheme=MiddlewareScheme.Medium)
     # alternative: FCMiddlewareParameters(scheme=MiddlewareScheme.Medium)
     self.heads_parameters = [QHeadParameters()]
     self.optimizer_type = 'Adam'
     self.batch_size = 32
     self.replace_mse_with_huber_loss = True
     self.create_target_network = True
     self.should_get_softmax_probabilities = False
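
VGG16MiddlewareParameters and should_get_softmax_probabilities do not appear to be part of stock Coach, so this example presumably ships its own definitions. Either way, a network-parameters class built from an __init__ like the ones above is attached to an agent through its network_wrappers dict; a short hedged sketch (the agent type, class name, and learning rate are assumptions for illustration):

# Sketch: plugging a custom network-parameters class into an agent.
# MyVGG16NetworkParameters stands in for the (hypothetical) class that owns
# the __init__ shown in Example #5; DQNAgentParameters is used only for illustration.
agent_params = DQNAgentParameters()
agent_params.network_wrappers['main'] = MyVGG16NetworkParameters()
agent_params.network_wrappers['main'].learning_rate = 0.00025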
Example #6
tf.reset_default_graph() # just to clean things up; only needed for the tutorial

#########
# Agent #
#########

agent_params = DQNAgentParameters()
agent_params.network_wrappers['main'].batch_size = 128
agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(100)
agent_params.algorithm.discount = 0.99

# to jump-start the agent's Q-values and speed things up, we initialize the last Dense layer's bias
# with a value on the order of the discounted return of a random policy
agent_params.network_wrappers['main'].heads_parameters = \
    [QHeadParameters(output_bias_initializer=tf.constant_initializer(-100))]

# NN configuration
agent_params.network_wrappers['main'].learning_rate = 0.0001
agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False

# ER - we'll need an episodic replay buffer for off-policy evaluation
agent_params.memory = EpisodicExperienceReplayParameters()

# E-Greedy schedule - there is no exploration in Batch RL, so E-Greedy is disabled
agent_params.exploration.epsilon_schedule = LinearSchedule(initial_value=0, final_value=0, decay_steps=1)
agent_params.exploration.evaluation_epsilon = 0

# We can use either a kNN- or an NN-based model to predict which actions not to max over in the Bellman equation
#agent_params.algorithm.action_drop_method_parameters = KNNParameters()
schedule_params = ScheduleParameters()
# schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.improve_steps = TrainingSteps(400)      # 400 epochs
schedule_params.steps_between_evaluation_periods = TrainingSteps(1)
schedule_params.evaluation_steps = EnvironmentEpisodes(10)
schedule_params.heatup_steps = EnvironmentSteps(DATASET_SIZE)

#########
# Agent #
#########

agent_params = DDQNBCQAgentParameters()
agent_params.network_wrappers['main'].batch_size = 128
# TODO cross-DL framework abstraction for a constant initializer?
agent_params.network_wrappers['main'].heads_parameters = [QHeadParameters(output_bias_initializer=tf.constant_initializer(-100))]

agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(100)
# agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(500)
agent_params.algorithm.discount = 0.99

# agent_params.algorithm.action_drop_method_parameters = KNNParameters()
agent_params.algorithm.action_drop_method_parameters = NNImitationModelParameters()

# NN configuration
agent_params.network_wrappers['main'].learning_rate = 0.0001
agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False
agent_params.network_wrappers['main'].softmax_temperature = 0.2

# ER size
agent_params.memory = EpisodicExperienceReplayParameters()
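
The agent and schedule parameters above only take effect once they are handed to a graph manager. Below is a sketch of that wiring, following the same pattern as Example #8; the environment choice (CartPole via GymVectorEnvironment) and the experiment path are assumptions for illustration.

# Sketch: wiring the agent_params and schedule_params defined above into a
# BatchRLGraphManager. Environment and experiment path are assumptions.
from rl_coach.base_parameters import TaskParameters, VisualizationParameters
from rl_coach.environments.gym_environment import GymVectorEnvironment
from rl_coach.graph_managers.batch_rl_graph_manager import BatchRLGraphManager

env_params = GymVectorEnvironment(level='CartPole-v0')

graph_manager = BatchRLGraphManager(
    agent_params=agent_params,
    env_params=env_params,
    schedule_params=schedule_params,
    vis_params=VisualizationParameters(dump_signals_to_csv_every_x_episodes=1),
    reward_model_num_epochs=30,
    train_to_eval_ratio=0.5)

task_parameters = TaskParameters(experiment_path='./experiments/batch_rl')
graph_manager.create_graph(task_parameters)
graph_manager.improve()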
Example #8
def train_using_experience_agent(env_params, n_epochs, dataset_size):
    tf.reset_default_graph()  # just to clean things up; only needed for the tutorial

    # Experience Generating Agent parameters
    experience_generating_agent_params = DDQNAgentParameters()
    # schedule parameters
    experience_generating_schedule_params = ScheduleParameters()
    experience_generating_schedule_params.heatup_steps = EnvironmentSteps(1000)
    experience_generating_schedule_params.improve_steps = TrainingSteps(
        dataset_size - experience_generating_schedule_params.heatup_steps.num_steps)
    experience_generating_schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(10)
    experience_generating_schedule_params.evaluation_steps = EnvironmentEpisodes(1)

    # DQN params
    experience_generating_agent_params.algorithm.num_steps_between_copying_online_weights_to_target = \
        EnvironmentSteps(100)
    experience_generating_agent_params.algorithm.discount = 0.99
    experience_generating_agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(1)

    # NN configuration
    experience_generating_agent_params.network_wrappers['main'].learning_rate = 0.0001
    experience_generating_agent_params.network_wrappers['main'].batch_size = 128
    experience_generating_agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False
    experience_generating_agent_params.network_wrappers['main'].heads_parameters = \
        [QHeadParameters(output_bias_initializer=tf.constant_initializer(-100))]
    # experience_generating_agent_params.network_wrappers['main'].heads_parameters = \
    #     [QHeadParameters(output_bias_initializer=tf.constant_initializer(0))]

    # ER size
    experience_generating_agent_params.memory = EpisodicExperienceReplayParameters()
    experience_generating_agent_params.memory.max_size = \
        (MemoryGranularity.Transitions,
         experience_generating_schedule_params.heatup_steps.num_steps +
         experience_generating_schedule_params.improve_steps.num_steps)

    # E-Greedy schedule
    # note: DATASET_SIZE is assumed to be a module-level constant defined elsewhere
    # in the notebook (the dataset_size argument could be used here instead)
    experience_generating_agent_params.exploration.epsilon_schedule = \
        LinearSchedule(1.0, 0.01, DATASET_SIZE)
    experience_generating_agent_params.exploration.evaluation_epsilon = 0

    schedule_params = set_schedule_params(n_epochs, dataset_size)
    # set the agent params as before
    # agent_params = set_agent_params(DDQNAgentParameters)
    agent_params = set_agent_params(DDQNBCQAgentParameters)
    agent_params.algorithm.action_drop_method_parameters = NNImitationModelParameters()

    # 50 epochs of training (the entire dataset is used each epoch)
    # schedule_params.improve_steps = TrainingSteps(50)

    graph_manager = BatchRLGraphManager(
        agent_params=agent_params,
        experience_generating_agent_params=experience_generating_agent_params,
        experience_generating_schedule_params=experience_generating_schedule_params,
        env_params=env_params,
        schedule_params=schedule_params,
        vis_params=VisualizationParameters(dump_signals_to_csv_every_x_episodes=1),
        reward_model_num_epochs=30,
        train_to_eval_ratio=0.5)
    # task_parameters is assumed to be a TaskParameters instance defined globally,
    # as in the original tutorial notebook
    graph_manager.create_graph(task_parameters)
    graph_manager.improve()
    return
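
Example #8 calls a set_schedule_params helper that is not shown on this page, and it reads the globals task_parameters and DATASET_SIZE. Below is a plausible reconstruction of the helper, modeled on the schedule block in Example #6, plus an example invocation; the helper body, the environment, and the numbers are assumptions, not the original code.

# Plausible reconstruction of set_schedule_params, modeled on Example #6's
# schedule block. Values and structure are assumptions.
def set_schedule_params(n_epochs, dataset_size):
    schedule_params = ScheduleParameters()
    schedule_params.improve_steps = TrainingSteps(n_epochs)              # one training step per epoch
    schedule_params.steps_between_evaluation_periods = TrainingSteps(1)
    schedule_params.evaluation_steps = EnvironmentEpisodes(10)
    schedule_params.heatup_steps = EnvironmentSteps(dataset_size)        # fill the replay buffer first
    return schedule_params


# Example invocation (environment and sizes are assumptions). Note that
# train_using_experience_agent also expects DATASET_SIZE and task_parameters
# to be defined globally, as in the original notebook.
env_params = GymVectorEnvironment(level='CartPole-v0')
train_using_experience_agent(env_params, n_epochs=400, dataset_size=DATASET_SIZE)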