Example #1
def parse_policy(args) -> Policy:
    pol: Policy = EpsGreedyQPolicy()
    if args.policy == 'LinearAnnealedPolicy':
        pol = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                   attr='eps',
                                   value_max=1.,
                                   value_min=.1,
                                   value_test=0.05,
                                   nb_steps=args.zeta_nb_steps)
    if args.policy == 'SoftmaxPolicy':
        pol = SoftmaxPolicy()
    if args.policy == 'EpsGreedyQPolicy':
        pol = EpsGreedyQPolicy()
    if args.policy == 'GreedyQPolicy':
        pol = GreedyQPolicy()
    if args.policy == 'BoltzmannQPolicy':
        pol = BoltzmannQPolicy()
    if args.policy == 'MaxBoltzmannQPolicy':
        pol = MaxBoltzmannQPolicy()
    if args.policy == 'BoltzmannGumbelQPolicy':
        pol = BoltzmannGumbelQPolicy()
    if args.policy == 'ZetaPolicy':
        pol = ZetaPolicy(zeta_nb_steps=args.zeta_nb_steps, eps=args.eps)

    return pol
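
parse_policy() expects an argparse-style namespace whose attribute names match what it reads (policy, zeta_nb_steps, eps). A minimal driver might look like the sketch below; the flag names and defaults are assumptions, not part of the original example.

import argparse

# Hypothetical CLI wiring for parse_policy(); flag names mirror the attributes
# the function reads (args.policy, args.zeta_nb_steps, args.eps).
parser = argparse.ArgumentParser()
parser.add_argument('--policy', default='EpsGreedyQPolicy')
parser.add_argument('--zeta_nb_steps', type=int, default=100000)
parser.add_argument('--eps', type=float, default=0.1)
args = parser.parse_args()

policy = parse_policy(args)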
Example #2
def main(model_dir, visualize, params):
    print(params)

    env = gym.make('CartPole-v1')
    env._max_episode_steps = 2000

    env = trl.env.TimeExpanded(env, 3)

    np.random.seed(params.seed)
    env.seed(params.seed)

    agent = trl.prototype.DQN(model_fn, model_dir, params=params)

    agent.train(
        env,
        lambda: input_fn(env),
        max_steps=params.max_steps,
        policy=MaxBoltzmannQPolicy(eps=0.9),
        memory=SequentialMemory(
            limit=params.memory_limit,
            window_length=1,
        ),
        target_model_update=params.target_model_update,
        gamma=params.gamma,
        warmup_steps=params.warmup_steps,
        batch_size=params.batch_size,
        summary_steps=params.summary_steps,
        save_steps=params.save_steps,
        visualize=visualize,
        seed=params.seed,
    )
Example #3
def _main():
    logging.basicConfig(
        stream=sys.stderr,
        level=logging.WARNING,  # if args.verbose > 0 else logging.INFO,
        format="%(levelname)-4.4s [%(name)s:%(lineno)s] %(message)s",
    )

    nb_actions = FREnv.action_space.n
    nb_steps = 1_000_000

    model = Sequential()
    model.add(Flatten(input_shape=(1, ) + FREnv.observation_space.shape))
    model.add(Dense(nb_actions * 8))
    model.add(Activation("relu"))
    # model.add(Dense(nb_actions * 4))
    # model.add(Activation('relu'))
    model.add(Dense(nb_actions * 2))
    model.add(Activation("relu"))
    model.add(Dense(nb_actions))
    model.add(Activation("linear"))
    print(model.summary())

    memory = SequentialMemory(limit=50000, window_length=1)
    policy = LinearAnnealedPolicy(
        inner_policy=MaxBoltzmannQPolicy(),
        attr="eps",
        value_max=1,
        value_min=0.05,
        value_test=0,
        nb_steps=nb_steps // 2,
    )  # BoltzmannQPolicy()
    agent = AvailableAgent(
        model=model,
        gamma=0.9999,
        nb_actions=nb_actions,
        memory=memory,
        nb_steps_warmup=50,
        target_model_update=1e-2,
        policy=policy,
        test_policy=policy,
    )
    agent.compile(Adam(lr=1e-3), metrics=["mae"])

    if os.path.isfile(WEIGHTS_FILE):
        print(f"loading pre-trained weights from {WEIGHTS_FILE}")
        agent.load_weights(WEIGHTS_FILE)

    env = FREnv(team=AgentTeam(agent=agent, colors="blue"))
    agent.fit(env, nb_steps=nb_steps, visualize=False, verbose=1)

    agent.save_weights(WEIGHTS_FILE, overwrite=True)

    agent.test(env, nb_episodes=1, visualize=True)
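
In Example #3 the same annealed policy is reused as test_policy, so evaluation runs with eps forced to value_test=0, i.e. effectively greedy. If greedy evaluation is the intent, an explicit test policy states it more directly; a small sketch, assuming keras-rl's GreedyQPolicy is available:

from rl.policy import GreedyQPolicy

# Same agent as above, but with the greedy test-time behaviour made explicit
# instead of relying on value_test=0 of the annealed training policy.
agent = AvailableAgent(
    model=model,
    gamma=0.9999,
    nb_actions=nb_actions,
    memory=memory,
    nb_steps_warmup=50,
    target_model_update=1e-2,
    policy=policy,
    test_policy=GreedyQPolicy(),
)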
Example #4
def create_dqn(model, nb_actions):
    """Creates and compiles a DQN agent with an Adam optimizer."""
    memory = SequentialMemory(limit=100000, window_length=1)
    policy = MaxBoltzmannQPolicy(tau=10, eps=0.2)
    dqn = DQNAgent(model=model,
                   nb_actions=nb_actions,
                   memory=memory,
                   target_model_update=1e-2,
                   policy=policy,
                   gamma=0.995,
                   batch_size=64)
    dqn.compile(Adam(lr=5e-4, decay=0.0), metrics=['mae'])

    return dqn
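
Example #4 pins down both knobs of MaxBoltzmannQPolicy: eps is the probability of taking a Boltzmann-sampled action and tau is the softmax temperature. The selection rule behaves roughly like the following standalone sketch (a paraphrase of the keras-rl behaviour, not the library source):

import numpy as np

def max_boltzmann_action(q_values, eps=0.2, tau=10.0, clip=(-500.0, 500.0)):
    """With probability eps, sample from softmax(q / tau); otherwise act greedily."""
    q_values = np.asarray(q_values, dtype='float64')
    if np.random.uniform() < eps:
        exp_values = np.exp(np.clip(q_values / tau, clip[0], clip[1]))
        probs = exp_values / np.sum(exp_values)
        return int(np.random.choice(len(q_values), p=probs))
    return int(np.argmax(q_values))

# A large tau flattens the softmax (close to uniform exploration); tau -> 0 approaches greedy.
print(max_boltzmann_action([1.0, 2.0, 3.0]))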
Example #5
    def build_agent(self, mem_file=None, w_file=None):
        #Create a dummy env to get size of input/output.
        #Makes it simpler if we ever choose to update env shapes.
        env = TradingEnv([], "", [])
        np.random.seed(314)
        env.seed(314)

        nb_actions = env.action_space.n
        obs_dim = env.observation_space.shape[0]
        model = Sequential()
        model.add(
            LSTM(5, input_shape=(7, 4),
                 return_sequences=True))  # 4 features + 1 bias term. 5 neurons
        model.add(Activation('tanh'))
        model.add(LSTM(4))
        model.add(Activation('tanh'))
        model.add(Dropout(0.2))
        model.add(Dense(4))
        model.add(Activation('relu'))
        model.add(Dense(nb_actions))
        model.add(Activation('linear'))  # Best activation for a Boltzmann policy

        #policy = EpsGreedyQPolicy(eps=EPS_VAL) #Off policy
        policy = BoltzmannQPolicy()  #Off-policy
        test_policy = MaxBoltzmannQPolicy()  #On-policy
        memory = None
        if mem_file is None:
            memory = SequentialMemory(
                limit=50000,
                window_length=7)  ## returns observations of len (7,)
        else:
            (memory, memory.actions, memory.rewards, memory.terminals,
             memory.observations) = pickle.load(open(mem_file, "rb"))

        dqn = DQNAgent(model=model,
                       nb_actions=nb_actions,
                       memory=memory,
                       gamma=GAMMA_VAL,
                       nb_steps_warmup=100,
                       policy=policy,
                       test_policy=test_policy)
        dqn.compile("adam", metrics=['mse'])

        if w_file is not None:
            model.load_weights(w_file)

        return dqn, env, memory
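
Example #5 restores a SequentialMemory and its ring buffers from a pickle file, but the matching save side is not shown. A sketch of what it might look like, assuming the same tuple layout that build_agent() unpacks on load:

import pickle

def save_memory(memory, mem_file):
    # Persist the memory object together with its ring buffers, mirroring the
    # load order used above (this pairing is an assumption, not the original code).
    with open(mem_file, "wb") as f:
        pickle.dump((memory, memory.actions, memory.rewards,
                     memory.terminals, memory.observations), f)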
Example #6
def create_dqn(model, history_length):
    memory = SequentialMemory(limit=500000, window_length=history_length)
    policy = MaxBoltzmannQPolicy()

    dqn = DQNAgent(
        model=model,
        nb_actions=model.output_shape[1],
        memory=memory,
        policy=policy,
        processor=CustomProcessor(),
        nb_steps_warmup=512,
        enable_dueling_network=True,
        dueling_type='avg',
        target_model_update=5e2,
        batch_size=32,
    )
    dqn.compile(Adam(lr=1e-3), metrics=['mae'])

    return dqn
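
Examples #6 and #7 pass a CustomProcessor whose definition is not included. In keras-rl a processor subclasses rl.core.Processor and hooks into observation and reward handling; the skeleton below is a hypothetical placeholder, not the original class.

import numpy as np
from rl.core import Processor

class CustomProcessor(Processor):
    """Hypothetical stand-in; the original implementation is not shown."""

    def process_observation(self, observation):
        # e.g. cast / reshape the raw observation before it is stored in memory
        return np.asarray(observation, dtype='float32')

    def process_reward(self, reward):
        # e.g. clip rewards to stabilise training
        return float(np.clip(reward, -1.0, 1.0))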
Example #7
def create_dqn(model,
               log_interval=50000,
               model_name='dqn_agent_checkpoint',
               file_log_path='./logs/log.txt',
               tensorboard_path='./logs/tensorboard/'):
    model_path = './models/' + model_name + '.h5'
    file_logger = FileLogger(file_log_path, interval=log_interval)
    checkpoint = ModelIntervalCheckpoint(model_path, interval=log_interval)
    tensorboard = TensorboardLogger(tensorboard_path)
    callbacks = [file_logger, checkpoint, tensorboard]

    # Use the last 4 observations (history_length = 4)
    memory = SequentialMemory(limit=500000, window_length=history_length)

    # Use a combination of BoltzmannQPolicy and EpsGreedyQPolicy
    policy = MaxBoltzmannQPolicy()

    # Start epsilon at 1.0 and decrease it on every step so random actions taper off once the map has been explored
    policy = LinearAnnealedPolicy(inner_policy=policy,
                                  attr='eps',
                                  value_max=1.0,
                                  value_min=0.1,
                                  value_test=0.04,
                                  nb_steps=NUMBER_OF_STEPS)

    # Create an instance of DQNAgent from keras-rl
    dqn = DQNAgent(model=model,
                   nb_actions=env.action_space.n,
                   memory=memory,
                   policy=policy,
                   processor=CustomProcessor(),
                   nb_steps_warmup=512,
                   enable_dueling_network=True,
                   dueling_type='avg',
                   target_model_update=5e2,
                   batch_size=32)

    dqn.compile(Adam(lr=5e-4), metrics=['mae'])

    return dqn, callbacks
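
The LinearAnnealedPolicy in Example #7 decays eps linearly from value_max to value_min over NUMBER_OF_STEPS training steps and switches to value_test during evaluation. The schedule is simple enough to restate as a small sketch (a paraphrase of the keras-rl behaviour, not the library source):

def annealed_eps(step, value_max=1.0, value_min=0.1, value_test=0.04,
                 nb_steps=1_000_000, training=True):
    """Linear annealing: eps falls from value_max to value_min over nb_steps, then stays flat."""
    if not training:
        return value_test
    slope = -(value_max - value_min) / float(nb_steps)
    return max(value_min, slope * step + value_max)

# eps is 1.0 at step 0, 0.55 halfway through, and 0.1 from nb_steps onwards.
print(annealed_eps(0), annealed_eps(500_000), annealed_eps(2_000_000))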
Example #8
map = Dense(map_size * 2)(map)
map = Activation('tanh')(map)
map = Dense(map_size)(map)
map = Activation('tanh')(map)

merged = Concatenate()([map, position])
merged = Dense(nb_neuron_input * 2, activation='tanh')(merged)
merged = Dense(nb_neuron_input, activation='tanh')(merged)
merged = Dense(nb_neuron_output, activation='softmax')(merged)

model = Model(inputs=[inputs], outputs=[merged])
model.summary()
model.compile(Adam(), loss='mean_squared_error')

memory = SequentialMemory(limit=50000, window_length=1)
policy = MaxBoltzmannQPolicy()
dqn = DQNAgent(model=model,
               nb_actions=nb_neuron_output,
               memory=memory,
               nb_steps_warmup=10,
               target_model_update=1e-2,
               policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae', 'accuracy'])

metrics = Metrics(dqn, env)
#fileName = '1D_advanced_Sequential1000_BoltzmannQ_10000steps(7)'
#fileName = '1D_advanced_Sequential1000_EpsGreedyQ_10000steps(7)'
#fileName = '1D_advanced_Sequential1000_MaxBoltzmannQ_10000steps(7)'
#fileName = '1D_advanced_Sequential50000_BoltzmannQPolicy_10000steps(7)'
#fileName = '1D_advanced_Sequential50000_MaxBoltzmannQ_1000000steps(0)'
fileName = '1D__Sequential50000_BoltzmannQ_1000000steps(0)'
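
Example #8 ends right after fileName is chosen; presumably the agent is then trained and its weights saved under that name. A plausible continuation, shown purely as an assumption:

# Hypothetical continuation: train the agent, then persist its weights under fileName.
dqn.fit(env, nb_steps=1_000_000, visualize=False, verbose=1)
dqn.save_weights(f'{fileName}.h5f', overwrite=True)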
Example #9
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('softmax'))
print(model.summary())

memory = SequentialMemory(limit=100000, window_length=1)
if args.mode == 'train':
    policy = LinearAnnealedPolicy(EpsStochasticPolicy(),
                                  attr='eps',
                                  value_max=1.,
                                  value_min=.25,
                                  value_test=.05,
                                  nb_steps=20000)
else:
    policy = EpsStochasticPolicy(eps=.25)
test_policy = MaxBoltzmannQPolicy(eps=0.1)

# dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=200,
#               enable_dueling_network=True, dueling_type='avg', target_model_update=1e-2, policy=policy)
dqn = DQNAgent(model=model,
               nb_actions=nb_actions,
               memory=memory,
               nb_steps_warmup=10,
               enable_dueling_network=True,
               dueling_type='avg',
               target_model_update=1e-2,
               policy=policy,
               test_policy=test_policy)
dqn.compile(Adam(lr=0.0003, ), metrics=['mae'])

if args.mode == 'resume':
Example #10
    def create_model(self):
        # Simple model where only one layer feeds into the next
        self._model = Sequential()

        # Get initializer for hidden layers
        init = tf.keras.initializers.RandomNormal(mean=.1, stddev=.02)

        # Input Layer; this shape is one that just works
        self._model.add(
            Dense(512,
                  input_shape=(1, 7814),
                  activation="relu",
                  use_bias=False,
                  kernel_initializer=init,
                  name='first_hidden'))

        # Hidden Layers
        self._model.add(
            Flatten(name='flatten')
        )  # Flattening resolves potential issues that would arise otherwise
        self._model.add(
            Dense(256,
                  activation="relu",
                  use_bias=False,
                  kernel_initializer=init,
                  name='second_hidden'))

        # Output Layer
        self._model.add(
            Dense(len(self._ACTION_SPACE),
                  use_bias=False,
                  kernel_initializer=init,
                  name='final'))
        self._model.add(
            BatchNormalization()
        )  # Increases speed: https://www.dlology.com/blog/one-simple-trick-to-train-keras-model-faster-with-batch-normalization/
        self._model.add(
            Activation("linear")
        )  # Same as passing activation in Dense Layer, but allows us to access last layer: https://stackoverflow.com/questions/40866124/difference-between-dense-and-activation-layer-in-keras

        # This is how many battles we'll remember before we start forgetting old ones
        self._memory = SequentialMemory(limit=max(num_battles, 10000),
                                        window_length=1)

        # Annealed Max-Boltzmann policy: epsilon decays linearly from 1.0 to 0.05 over num_battles steps
        # It takes the Q-values produced by our NeuralNet and converts them into an action choice
        # Softmax is another probabilistic option: https://github.com/keras-rl/keras-rl/blob/master/rl/policy.py#L120
        self._policy = LinearAnnealedPolicy(
            MaxBoltzmannQPolicy(),
            attr="eps",
            value_max=1.0,
            value_min=0.05,
            value_test=0,
            nb_steps=num_battles,
        )

        # Defining our DQN
        self._dqn = DQNAgent(
            model=self._model,
            nb_actions=len(action_space),
            policy=self._policy,
            memory=self._memory,
            nb_steps_warmup=max(
                1000, int(num_battles / 10)
            ),  # The number of battles we go through before we start training: https://hub.packtpub.com/build-reinforcement-learning-agent-in-keras-tutorial/
            gamma=
            0.8,  # This is the discount factor for the Value we learn - we care a lot about future rewards
            target_model_update=
            .01,  # This controls how much/when our model updates: https://github.com/keras-rl/keras-rl/issues/55
            delta_clip=
            1,  # Helps define Huber loss - clips values to be -1 < x < 1. https://srome.github.io/A-Tour-Of-Gotchas-When-Implementing-Deep-Q-Networks-With-Keras-And-OpenAi-Gym/
            enable_double_dqn=True,
        )

        self._dqn.compile(Adam(lr=0.01), metrics=["mae"])
Example #11
model.add(Dense(128, activation="elu"))
model.add(Dropout(DROP))
model.add(Dense(nb_actions, activation="linear"))
print(model.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
processor = AtariProcessor()

# Select a policy. Here a MaxBoltzmannQPolicy is wrapped in a LinearAnnealedPolicy: with probability eps
# an action is sampled from the Boltzmann (softmax) distribution over Q-values, otherwise the greedy
# action is taken. We anneal eps from 1.0 to 0.05 over the course of 1M steps so that the agent
# initially explores the environment (high eps) and then gradually sticks to what it knows (low eps).
# A dedicated, much smaller eps (0.001) is used during testing, keeping test behaviour close to greedy.
policy = LinearAnnealedPolicy(MaxBoltzmannQPolicy(),
                              attr='eps',
                              value_max=1.,
                              value_min=.05,
                              value_test=.001,
                              nb_steps=1000000)

# The trade-off between exploration and exploitation is difficult and an on-going research topic.
# If you want, you can experiment with the parameters or use a different policy. Another popular one
# is Boltzmann-style exploration:
# policy = BoltzmannQPolicy(tau=1.)
# Feel free to give it a try!
# policy = LinearAnnealedPolicy(BoltzmannQPolicy(), attr='tau', value_max=10., value_min=.1, value_test=.05, nb_steps=1000000)

dqn = DQNAgent(model=model,
               nb_actions=nb_actions,