Example #1
def main():
    # Create env
    np.random.seed(SEED)    
    env = PentagoEnv(SIZE, agent_starts = AGENT_STARTS)
    env.seed(SEED)
    nb_actions = env.action_space.n

    # Define model
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(128, activation='sigmoid'))
    model.add(Dense(nb_actions))
    print(model.summary())

    # Configure and compile the agent
    memory = SequentialMemory(limit=5000, window_length=1)
    policy = BoltzmannQPolicy()
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=1000,
                    target_model_update=1000, policy=policy)
    optimizer=RMSprop(lr=0.00025, epsilon=0.01)
    dqn.compile(optimizer)

    # Okay, now it's time to learn something! We visualize the training here for show, but this
    # slows down training quite a lot. You can always safely abort the training prematurely using
    # Ctrl + C.
    dqn.fit(env, nb_steps=50000, visualize=True, verbose=1)

    # After training is done, we save the final weights.
    dqn.save_weights('weights/dqn-{}-weights-{}.h5f'.format(TAG, datetime.datetime.now()))    
Example #2
def main():
    np.random.seed(123)    
    env = PentagoEnv(SIZE)
    env.seed(123)
    nb_actions = env.action_space.n

    model = Sequential()
    #model.add(Reshape((SIZE ** 2,), input_shape=(SIZE, SIZE)))
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(128, activation='sigmoid'))
    model.add(Dense(nb_actions))
    print(model.summary())

    # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
    # even the metrics!
    memory = SequentialMemory(limit=5000, window_length=1)
    policy = BoltzmannQPolicy()
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=1000,
                    target_model_update=1e-2, policy=policy)
    optimizer=RMSprop(lr=0.00025, epsilon=0.01)
    dqn.compile(optimizer)

    # Okay, now it's time to learn something! We visualize the training here for show, but this
    # slows down training quite a lot. You can always safely abort the training prematurely using
    # Ctrl + C.
    dqn.fit(env, nb_steps=50000, visualize=True, verbose=1)

    # After training is done, we save the final weights.
    dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)    
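The two snippets above differ in target_model_update (1000 vs 1e-2): keras-rl interprets values >= 1 as a hard-update interval in steps and values < 1 as a soft-update factor applied every step. A minimal NumPy sketch of the two styles, with illustrative helper names that are not part of either snippet:

import numpy as np

def hard_update(online_weights):
    # Hard update: copy the online network's weights into the target network wholesale.
    return [w.copy() for w in online_weights]

def soft_update(target_weights, online_weights, tau=1e-2):
    # Soft update: blend a small fraction tau of the online weights into the target weights every step.
    return [(1.0 - tau) * t + tau * w for t, w in zip(target_weights, online_weights)]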
Example #3
def test_single_dqn_input():
    model = Sequential()
    model.add(Flatten(input_shape=(2, 3)))
    model.add(Dense(2))

    memory = SequentialMemory(limit=10, window_length=2)
    for double_dqn in (True, False):
        agent = DQNAgent(model, memory=memory, nb_actions=2, nb_steps_warmup=5, batch_size=4,
                         enable_double_dqn=double_dqn)
        agent.compile('sgd')
        agent.fit(MultiInputTestEnv((3,)), nb_steps=10)
Example #4
class DQN(BaseAgent):
  def __init__(self, model, processor, policy, test_policy, num_actions):
    # Replay memory
    memory = SequentialMemory(limit=opt.dqn_replay_memory_size,
                              window_length=opt.dqn_window_length)
    self.agent = DQNAgent(model=model,
                          nb_actions=num_actions,
                          policy=policy,
                          test_policy=test_policy,
                          memory=memory,
                          processor=processor,
                          batch_size=opt.dqn_batch_size,
                          nb_steps_warmup=opt.dqn_nb_steps_warmup,
                          gamma=opt.dqn_gamma,
                          target_model_update=opt.dqn_target_model_update,
                          enable_double_dqn=opt.enable_double_dqn,
                          enable_dueling_network=opt.enable_dueling_network,
                          train_interval=opt.dqn_train_interval,
                          delta_clip=opt.dqn_delta_clip)
    self.agent.compile(optimizer=keras.optimizers.Adam(lr=opt.dqn_learning_rate), metrics=['mae'])

  def fit(self, env, num_steps, weights_path=None, visualize=False):
    callbacks = []
    if weights_path is not None:
      callbacks += [ModelIntervalCheckpoint(weights_path, interval=50000, verbose=1)]
    self.agent.fit(env=env,
                   nb_steps=num_steps,
                   action_repetition=opt.dqn_action_repetition,
                   callbacks=callbacks,
                   log_interval=opt.log_interval,
                   test_interval=opt.test_interval,
                   test_nb_episodes=opt.test_nb_episodes,
                   test_action_repetition=opt.dqn_action_repetition,
                   visualize=visualize,
                   test_visualize=visualize,
                   verbose=1)

  def test(self, env, num_episodes, visualize=False):
    self.agent.test(env=env,
                    nb_episodes=num_episodes,
                    action_repetition=opt.dqn_action_repetition,
                    verbose=2,
                    visualize=visualize)

  def save(self, out_dir):
    self.agent.save_weights(out_dir, overwrite=True)

  def load(self, out_dir):
    self.agent.load_weights(out_dir)
Example #5
def test_multi_dqn_input():
    input1 = Input(shape=(2, 3))
    input2 = Input(shape=(2, 4))
    x = Concatenate()([input1, input2])
    x = Flatten()(x)
    x = Dense(2)(x)
    model = Model(inputs=[input1, input2], outputs=x)

    memory = SequentialMemory(limit=10, window_length=2)
    processor = MultiInputProcessor(nb_inputs=2)
    for double_dqn in (True, False):
        agent = DQNAgent(model, memory=memory, nb_actions=2, nb_steps_warmup=5, batch_size=4,
                         processor=processor, enable_double_dqn=double_dqn)
        agent.compile('sgd')
        agent.fit(MultiInputTestEnv([(3,), (4,)]), nb_steps=10)
Example #6
def train_dqn_model(layers, rounds=10000, run_test=False, use_score=False):
    ENV_NAME = 'malware-score-v0' if use_score else 'malware-v0'
    env = gym.make(ENV_NAME)
    env.seed(123)
    nb_actions = env.action_space.n
    window_length = 1  # "experience" consists of where we were, where we are now

    # generate a policy model
    model = generate_dense_model((window_length,) + env.observation_space.shape, layers, nb_actions)

    # configure and compile our agent
    # BoltzmannQPolicy selects an action stochastically with a probability generated by soft-maxing Q values
    policy = BoltzmannQPolicy()

    # the replay memory stores past experiences that the model samples from during training
    # for this, we only consider a single malware sample (window_length=1) for each "experience"
    memory = SequentialMemory(limit=32, ignore_episode_boundaries=False, window_length=window_length)

    # DQN agent as described in Mnih et al. (2013); double DQN follows van Hasselt et al. (2015).
    # http://arxiv.org/pdf/1312.5602.pdf
    # http://arxiv.org/abs/1509.06461
    agent = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=16,
                     enable_double_dqn=True, enable_dueling_network=True, dueling_type='avg',
                     target_model_update=1e-2, policy=policy, batch_size=16)

    # keras-rl allows one to use any built-in Keras optimizer
    agent.compile(RMSprop(lr=1e-3), metrics=['mae'])

    # play the game. learn something!
    agent.fit(env, nb_steps=rounds, visualize=False, verbose=2)

    history_train = env.history
    history_test = None

    if run_test:
        # Set up the testing environment
        TEST_NAME = 'malware-score-test-v0' if use_score else 'malware-test-v0'
        test_env = gym.make(TEST_NAME)

        # evaluate the agent on a few episodes, drawing randomly from the test samples
        agent.test(test_env, nb_episodes=100, visualize=False)
        history_test = test_env.history

    return agent, model, history_train, history_test
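The comment above describes BoltzmannQPolicy as soft-maxing the Q-values to obtain action probabilities; a minimal NumPy sketch of that selection rule (the helper name is illustrative, not from the snippet):

import numpy as np

def boltzmann_select(q_values, tau=1.0):
    # Soft-max the Q-values at temperature tau and sample an action from the result.
    exp_q = np.exp((q_values - np.max(q_values)) / tau)  # subtract the max for numerical stability
    probs = exp_q / exp_q.sum()
    return int(np.random.choice(len(q_values), p=probs))

# e.g. boltzmann_select(np.array([1.0, 2.0, 0.5])) favours action 1 but can still pick any action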
Example #7
ENV_NAME = 'CartPole-v0'

# Get the environment and extract the number of actions available in the Cartpole problem
env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

print(env.observation_space.shape)
model = Sequential()
model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

policy = EpsGreedyQPolicy()
memory = SequentialMemory(limit=50000, window_length=1)
dqn = DQNAgent(model=model,
               nb_actions=nb_actions,
               memory=memory,
               nb_steps_warmup=10,
               target_model_update=1e-2,
               policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this slows down training quite a lot.
dqn.fit(env, nb_steps=5000, visualize=True)

dqn.test(env, nb_episodes=5, visualize=True)
Example #8
        # Rather than training on each step in order, experiences are first stored in memory and then sampled randomly for learning (or so I understand)
        # Honestly, I don't fully understand this part
        memory = SequentialMemory(limit=40000, window_length=1)

        # The behavior policy is the standard epsilon-greedy.
        policy = EpsGreedyQPolicy(eps=0.1)

        # warmup = literally a warm-up: instead of learning right away, fill the memory to some extent first (as I understand it)
        # update = learning rate: making it smaller takes more time, making it larger makes overfitting more likely
        dqn = DQNAgent(model=model,
                       nb_actions=nb_actions,
                       memory=memory,
                       nb_steps_warmup=100,
                       target_model_update=1e-2,
                       policy=policy)
        dqn.compile(Adam(lr=0.001))

        # nb_steps = how many steps to train for; you could also set a huge number and just stop it with Ctrl+C after leaving it overnight
        # max_episode_steps = maximum number of steps per episode
        history = dqn.fit(env,
                          nb_steps=400000,
                          visualize=False,
                          verbose=2,
                          nb_max_episode_steps=1440)

        # Save the model and weights
        now = datetime.now().strftime("%Y%m%d%H%M%S")
        dqn.save_weights('weight_' + str(now) + '.h5')
        model_json = model.to_json()
        with open('model_' + str(now) + '.json', "w") as json_file:
            json_file.write(model_json)
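The comments above use EpsGreedyQPolicy(eps=0.1); a minimal sketch of the rule it applies at each step (illustrative helper, not part of the snippet):

import numpy as np

def eps_greedy_select(q_values, eps=0.1):
    # With probability eps pick a random action, otherwise pick the greedy (argmax) one.
    if np.random.random() < eps:
        return np.random.randint(len(q_values))
    return int(np.argmax(q_values))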
Example #9
        # agent
        dqn = DQNAgent(model=expert_model,
                       nb_actions=nb_actions,
                       policy=policy,
                       memory=memory,
                       processor=processor,
                       enable_double_dqn=True,
                       enable_dueling_network=True,
                       gamma=.99,
                       target_model_update=10000,
                       train_interval=1,
                       delta_clip=1.,
                       nb_steps_warmup=50000)

        lr = .00025
        dqn.compile(Adam(lr), metrics=['mae'])
        weights_filename = model_saves + filename_append + "_" + datestr + "_" + 'expert_' + environment_name + '_weights.h5f'
        checkpoint_weights_filename = model_saves + filename_append + "_" + datestr + "_" + 'expert_' + environment_name + '_weights{step}.h5f'
        log_filename = model_saves + filename_append + "_" + datestr + "_" + 'expert_' + environment_name + '_REWARD_DATA.txt'
        callbacks = [
            TrainEpisodeLogger(log_filename),
            ModelIntervalCheckpoint(checkpoint_weights_filename,
                                    interval=1000000)
        ]
        if args.mode == 'train':
            dqn.fit(env,
                    callbacks=callbacks,
                    nb_steps=4250000,
                    verbose=0,
                    nb_max_episode_steps=1500)
            dqn.save_weights(weights_filename, overwrite=True)
Example #10
def main():
    env = PikaEnv()
    nb_actions = env.action_space.n

    model = Sequential()
    model.add(Flatten(input_shape=(4, ) + env.observation_space.shape))
    model.add(Dense(512))
    model.add(Activation("relu"))
    model.add(Dense(512))
    model.add(Activation("relu"))
    model.add(Dense(512))
    model.add(Activation("relu"))
    model.add(Dense(nb_actions))
    model.add(Activation("linear"))
    print(model.summary())
    memory = SequentialMemory(limit=1_000_000, window_length=4)
    policy = LinearAnnealedPolicy(
        EpsGreedyQPolicy(),
        attr="eps",
        value_max=1.0,
        value_min=0.05,
        value_test=0.05,
        nb_steps=nb_steps // 4,
    )
    dqn = DQNAgent(
        model=model,
        nb_actions=nb_actions,
        policy=policy,
        memory=memory,
        enable_dueling_network=True,
        enable_double_dqn=True,
    )
    dqn.compile(Adam(lr=0.00025), metrics=["mae"])
    # dqn.load_weights(log_dir + "load.h5f")
    weights_filename = log_dir + "dqn_weights.h5f"
    checkpoint_weights_filename = log_dir + "dqn_weights_{step}.h5f"
    log_filename = log_dir + "dqn_log.json"
    callbacks = [
        ModelIntervalCheckpoint(checkpoint_weights_filename, interval=10000)
    ]
    callbacks += [FileLogger(log_filename, interval=100)]
    tbCallBack = TensorBoard(
        log_dir=tb_dir,
        histogram_freq=0,
        write_graph=True,
        write_grads=True,
        write_images=True,
        embeddings_freq=0,
        embeddings_layer_names=None,
        embeddings_metadata=None,
    )
    callbacks += [tbCallBack]
    dqn.fit(
        env,
        callbacks=callbacks,
        nb_steps=nb_steps,
        log_interval=10,
        visualize=True,
        verbose=2,
    )

    # After training is done, we save the final weights one more time.
    dqn.save_weights(weights_filename, overwrite=True)
Example #11
    conc = concatenate([model_phase_encoded, model_vehicle_encoded])
    hidden = Dense(128)(conc)
    hidden = LeakyReLU()(hidden)
    hidden = Dense(64)(hidden)
    hidden = LeakyReLU()(hidden)
    output = Dense(nb_actions, activation='linear')(hidden)
    model = Model(inputs=[model_phase_input, model_vehicle_input], outputs=output)
    model_path = "dqn_model.h5"
    try:
        model.load_weights(model_path)
        print(f"Success loading previous weights at {model_path}")
    except BaseException as e:
        print(f"Did not load previous weights due to {e}, {model_path}")

    ### Policy, Memory & Agent set-up.
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.01, value_test=.01, nb_steps=100000)
    memory = SequentialMemory(limit=50000, window_length=1)
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, policy=policy, batch_size=64, gamma=.95, nb_steps_warmup=2000, target_model_update=.001)
    dqn.processor = MultiInputProcessor(2)
    dqn.compile(optimizer=Adam(lr=.001))

    ### Fit.
    hist = dqn.fit(env, nb_steps=200, verbose=1, log_interval=10)
    dqn.save_weights(model_path,  overwrite=True)
    print("Saved model to disk")

    test_env = CityFlowAgent(mode='predict', config_path=config_path)
    start_time = default_timer()
    dqn.test(test_env, nb_episodes=1, visualize=False) 
    print(f"\n Done testing inn {default_timer()-start_time} seconds")
Example #12
  · Better than greedy because it does not treat the options considered non-optimal all equally
  · This way, sub-optimal actions are ignored

Tested different learning rates and picked the most optimal one

"""

memory = SequentialMemory(limit=1000000, window_length=1)
policy = BoltzmannQPolicy()
dqn = DQNAgent(model=model,
               nb_actions=nb_actions,
               memory=memory,
               nb_steps_warmup=10,
               target_model_update=1e-2,
               policy=policy)
dqn.compile(Adam(lr=0.0015), metrics=['mae'])
"""
Training for 150000 steps
"""
a = dqn.fit(env, nb_steps=150000, visualize=False, verbose=2)
"""
Load the weights; be careful, this overwrites the training you just did
"""

weights_filename = 'dqn64_{}_weights.h5f'.format('LunarLander-v2')

dqn.load_weights(weights_filename)
"""
Test for 20 episodes
"""
dqn.test(env, nb_episodes=20, visualize=False)
Example #13
def training_game():
    env = Environment(
        map_name="ForceField",
        visualize=True,
        game_steps_per_episode=150,
        agent_interface_format=features.AgentInterfaceFormat(
            feature_dimensions=features.Dimensions(screen=64, minimap=32)))

    input_shape = (_SIZE, _SIZE, 1)
    nb_actions = 12  # Number of actions

    model = neural_network_model(input_shape, nb_actions)
    memory = SequentialMemory(limit=5000, window_length=_WINDOW_LENGTH)

    processor = SC2Proc()

    # Policy

    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                  attr="eps",
                                  value_max=1,
                                  value_min=0.2,
                                  value_test=.0,
                                  nb_steps=1e2)

    # Agent

    dqn = DQNAgent(
        model=model,
        nb_actions=nb_actions,
        memory=memory,
        enable_double_dqn=True,
        enable_dueling_network=True,
        # 2019-07-12 GU Zhan (Sam): if a value-shape problem occurs, reduce nb_steps_warmup:
        #                   nb_steps_warmup=300, target_model_update=1e-2, policy=policy,
        nb_steps_warmup=500,
        target_model_update=1e-2,
        policy=policy,
        batch_size=150,
        processor=processor,
        delta_clip=1)

    dqn.compile(Adam(lr=.001), metrics=["mae", "acc"])

    # Tensorboard callback

    timestamp = f"{datetime.datetime.now():%Y-%m-%d %I:%M%p}"
    # 2019-07-12 GU Zhan (Sam) folder name for Linux:
    #    callbacks = keras.callbacks.TensorBoard(log_dir='./Graph/'+ timestamp, histogram_freq=0,
    #                                write_graph=True, write_images=False)

    # 2019-07-12 GU Zhan (Sam) folder name for Windows:
    callbacks = keras.callbacks.TensorBoard(log_dir=r'.\Graph\issgz',
                                            histogram_freq=0,
                                            write_graph=True,
                                            write_images=False)

    # Save the parameters and upload them when needed

    name = "agent"
    w_file = "dqn_{}_weights.h5f".format(name)
    check_w_file = "train_w" + name + "_weights.h5f"

    if SAVE_MODEL:
        check_w_file = "train_w" + name + "_weights_{step}.h5f"

    log_file = "training_w_{}_log.json".format(name)

    if LOAD_MODEL:
        dqn.load_weights(w_file)

    class Saver(Callback):
        def on_episode_end(self, episode, logs={}):
            if episode % 200 == 0:
                self.model.save_weights(w_file, overwrite=True)

    s = Saver()
    logs = FileLogger('DQN_Agent_log.csv', interval=1)

    dqn.fit(env,
            callbacks=[callbacks, s, logs],
            nb_steps=600,
            action_repetition=2,
            log_interval=1e4,
            verbose=2)

    dqn.save_weights(w_file, overwrite=True)
    dqn.test(env, action_repetition=2, nb_episodes=30, visualize=False)
Example #14
model = keras.layers.Flatten()(model)
model = keras.layers.Dense(512, activation='relu')(model)
model = keras.layers.Dense(4, activation='linear')(model)
model = keras.Model(inputs=input, outputs=model)
model.summary()
print(model.output)
model.output._keras_shape = (None, 4)
print(model.output._keras_shape)
game = gym.make('Breakout-v0')
agent = DQNAgent(model,
                 policy,
                 nb_actions=game.action_space.n,
                 nb_steps_warmup=50000,
                 memory=memory,
                 processor=AtariProcessor(),
                 train_interval=4,
                 delta_clip=1.)
agent.compile(keras.optimizers.Adam(lr=.00025), metrics=['mae'])
callbacks = [rl.callbacks.ModelIntervalCheckpoint('ckpt.h5f', interval=250000)]
callbacks += [FileLogger('log.json', interval=100)]
if False:
    agent.load_weights('weights.h5f')
agent.fit(game,
          nb_steps=1750000,
          visualize=False,
          log_interval=10000,
          callbacks=callbacks)
agent.save_weights('weights.h5f', overwrite=True)
game.reset()
agent.test(game, nb_episodes=10, visualize=True)
Example #15
                              value_test=MIN_EPSILON,
                              nb_steps=int(TRAINING_STEPS*EPSILON_DECAY_PERIOD))

dqn = DQNAgent(model=model,
               nb_actions=nb_actions,
               memory=memory,
               nb_steps_warmup=WARMUP_STEPS,
               processor=processor,
               target_model_update=TARGET_MODEL_UPDATE,
               policy=policy,
               train_interval=WINDOW_LENGTH,
               enable_dueling_network=DUELING,
               delta_clip=DELTA_CLIP,
               )

dqn.compile(Adam(lr=LEARNING_RATE),
            metrics=['mae'])

#dqn.load_weights('checkpoint-1.14-8000-.h5f')

# Okay, now it's time to learn something!
dqn.fit(env,
        nb_steps=TRAINING_STEPS,
        visualize=True,
        verbose=2,
        callbacks=[cp, tb_callback])

# After training is done, we save the final weights.
#dqn.save_weights('dqn_{}_weights.h5f'.format('mario'), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
dqn.test(env, nb_episodes=5, visualize=True, action_repetition=ACTION_REPETITION)
Example #16
                   enable_dueling_network=False,
                   batch_size=batch_size,
                   train_interval=4,
                   delta_clip=1.)

import tensorflow as tf


def get_optimizer():
    if use_rnn:
        return Adam(1e-4)
    else:
        return Adam(1e-4)


dqn.compile(get_optimizer(), metrics=['mae'])
'''
if args.transfer_encoding:
    print("Transferring weights")
    if not args.weights:
        raise ValueError("If --transfer_encoding is used, weight file must be provided")
    model_file = args.model
    from keras.models import load_model
    # TODO: correct?
    old_model = load_model(weight_file)
    old_conv_layers = filter(lambda l: l.__class__.__name__ == 'Conv2D',old_model.layers)
    new_conv_layers = filter(lambda l:l.__class__.__name__ == 'Conv2D',model.layers)
    for old_l,new_l in zip(old_conv_layers,new_conv_layers):
        new_l.set_weights(old_l.get_weights())
        new_l.trainable = False # freeze the new layer
'''
# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!

# train_policy = BoltzmannQPolicy()
train_policy = EpsGreedyQPolicy(eps=1.0)
test_policy = GreedyQPolicy()

# Compile the agent based on method specified. We use .upper() to convert to 
# upper case for comparison
if METHOD.upper() == 'DUEL_DQN': 
    memory = SequentialMemory(limit=NUM_STEPS, window_length=1)
    agent = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=100,
               enable_dueling_network=True, dueling_type='avg', target_model_update=1e-2, 
               policy=train_policy, test_policy=test_policy)
    agent.compile(Adam(lr=1e-3, clipnorm=1.0), metrics=['mae'])

elif METHOD.upper() == 'DQN':
    memory = SequentialMemory(limit=NUM_STEPS, window_length=1)
    agent = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=100,
               target_model_update=1e-2, policy=train_policy, test_policy=test_policy)
    agent.compile(Adam(lr=1e-3, clipnorm=1.0), metrics=['mae'])

elif METHOD.upper() == 'SARSA':
     # SARSA does not require a memory.
    agent = SarsaAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=train_policy)
    agent.compile(Adam(lr=1e-3, clipnorm=1.0), metrics=['mae'])
    
elif METHOD.upper() == 'CEM':
    memory = EpisodeParameterMemory(limit=1000, window_length=1)
    agent = CEMAgent(model=model, nb_actions=nb_actions, memory=memory,
Example #18
def main():
    """
    Parses command line arguments, sets training environment parameters, creates deep Q-network and trains it
    on gym environment.
    """
    parser = argparse.ArgumentParser(
        description="Simulation of drivers' behavior")
    parser.add_argument(
        '-f',
        '--fleet',
        help=
        'Fleet sizes to simulate, formatted as comma-separated list (i.e. "-f 250,275,300")'
    )
    parser.add_argument(
        '-m',
        '--multiplier',
        help=
        'Surge multiplier, formatted as comma-separated list (i.e. "-m 1,1.5,2")'
    )
    parser.add_argument('-b', '--bonus', type=int, help='Bonus')
    parser.add_argument('-d', '--demand', help='Percent false demand ')
    parser.add_argument(
        '-k',
        '--know',
        help=
        'Percent knowing fare, formatted as comma-separated list (i.e. "-m 1,1.5,2") '
    )
    parser.add_argument(
        '-p',
        '--pro',
        help=
        'Percent pro drivers, formatted as comma-separated list (i.e. "-p 1,1.5,2") '
    )
    parser.add_argument(
        '-av',
        '--av',
        help=
        'Percent AV drivers, formatted as comma-separated list (i.e. "-av 1,1.5,2") '
    )
    parser.add_argument('-nb', '--nb', help='number of steps to train Rl ')

    args = parser.parse_args()
    if args.fleet:
        fleet_sizes = [int(x) for x in args.fleet.split(',')]
    #        fleet_sizes = args.fleet
    else:
        fleet_sizes = FLEET_SIZE

    if args.multiplier:
        # surge = args.multiplier
        surges = [float(x) for x in args.multiplier.split(',')]
    else:
        surges = [SURGE_MULTIPLIER]

    if args.know:
        # surge = args.multiplier
        perc_know = [float(x) for x in args.know.split(',')]
    else:
        perc_know = [PERCE_KNOW]

    if args.bonus:
        bonus = args.bonus
    else:
        bonus = BONUS

    if args.pro:

        pro_share = [float(x) for x in args.pro.split(',')]
    else:
        pro_share = [PRO_SHARE]

    if args.demand:
        percent_false_demand = float(args.demand)
    else:
        percent_false_demand = PERCENT_FALSE_DEMAND

    if args.av:
        av_share = [float(x) for x in args.av.split(',')]
    else:
        av_share = [1]
    if args.nb:
        nb_steps = args.nb
    else:
        nb_steps = 300

    for fleet_size in fleet_sizes:
        for surge in surges:
            for perc_k in perc_know:
                for pro_s in pro_share:
                    m = Model(ZONE_IDS,
                              DEMAND_SOURCE,
                              WARMUP_TIME_HOUR,
                              ANALYSIS_TIME_HOUR,
                              fleet_size=fleet_size,
                              pro_share=pro_s,
                              surge_multiplier=surge,
                              bonus=bonus,
                              percent_false_demand=percent_false_demand,
                              percentage_know_fare=perc_k)

                    # make one veh to be AV
                    veh = m.vehicles[-1]
                    veh.is_AV = True
                    #
                    env = RebalancingEnv(m, penalty=-0)

                    nb_actions = env.action_space.n
                    input_shape = (1, ) + env.state.shape
                    input_dim = env.input_dim

                    model = Sequential()
                    model.add(Flatten(input_shape=input_shape))
                    model.add(Dense(256, activation='relu'))
                    model.add(Dense(nb_actions, activation='linear'))

                    memory = SequentialMemory(limit=2000, window_length=1)
                    policy = EpsGreedyQPolicy()
                    dqn = DQNAgent(model=model,
                                   nb_actions=nb_actions,
                                   memory=memory,
                                   nb_steps_warmup=100,
                                   target_model_update=1e-2,
                                   policy=policy,
                                   gamma=.99)
                    dqn.compile(Adam(lr=0.001, epsilon=0.05, decay=0.0),
                                metrics=['mae'])

                    dqn.fit(env,
                            nb_steps=nb_steps,
                            action_repetition=1,
                            visualize=False,
                            verbose=2)
                    dqn.save_weights('new_dqn_weights_%s.h5f' % (nb_steps),
                                     overwrite=True)
Example #19
# veh.is_AV = True
# 
# env = RebalancingEnv(m, penalty=-10, config=config )
env = RebalancingEnv(penalty=-10, config=config )
nb_actions = env.action_space.n
input_shape = (1,) + env.state.shape
input_dim = env.input_dim
model = Sequential()
model.add(Flatten(input_shape=input_shape))
model.add(Dense(256, activation='relu'))
model.add(Dense(nb_actions, activation='linear'))
memory = SequentialMemory(limit=2000, window_length=1)
policy = EpsGreedyQPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=100,
                target_model_update=1e-2, policy=policy, gamma=0.99)
dqn.compile(Adam(lr=0.001, epsilon=0.05, decay=0.0), metrics=['mae'])

# history = dqn.fit(env, nb_steps=100, action_repetition=1, visualize=False, verbose=2)

# dqn.save_weights('dqn_weights_%s.h5f' % (100), overwrite=True)

dqn.load_weights('dqn_weights_%s.h5f' % (3000))




# for perc_av in percent_av:


perc_av = 1 
print('Fleet size is {f}'.format(f=fleet_size))
Example #20
class Agent(object):
    name = 'DQN'

    def __init__(self,
                 number_of_training_steps=1e5,
                 gamma=0.999,
                 load_weights=False,
                 visualize=False,
                 dueling_network=True,
                 double_dqn=True,
                 nn_type='mlp',
                 **kwargs):
        """
        Agent constructor
        :param window_size: int, number of lags to include in observation
        :param max_position: int, maximum number of positions able to be held in inventory
        :param fitting_file: str, file used for z-score fitting
        :param testing_file: str,file used for dqn experiment
        :param env: environment name
        :param seed: int, random seed number
        :param action_repeats: int, number of steps to take in environment between actions
        :param number_of_training_steps: int, number of steps to train agent for
        :param gamma: float, value between 0 and 1 used to discount future DQN returns
        :param format_3d: boolean, format observation as matrix or tensor
        :param train: boolean, train or test agent
        :param load_weights: boolean, import existing weights
        :param z_score: boolean, standardize observation space
        :param visualize: boolean, visualize environment
        :param dueling_network: boolean, use dueling network architecture
        :param double_dqn: boolean, use double DQN for Q-value approximation
        """
        # Agent arguments
        # self.env_name = id
        self.neural_network_type = nn_type
        self.load_weights = load_weights
        self.number_of_training_steps = number_of_training_steps
        self.visualize = visualize

        # Create environment
        self.env = gym.make(**kwargs)
        self.env_name = self.env.env.id

        # Create agent
        # NOTE: 'Keras-RL' uses its own frame-stacker
        self.memory_frame_stack = 1  # Number of frames to stack e.g., 1.
        self.model = self.create_model(name=self.neural_network_type)
        self.memory = SequentialMemory(limit=10000,
                                       window_length=self.memory_frame_stack)
        self.train = self.env.env.training
        self.cwd = os.path.dirname(os.path.realpath(__file__))

        # create the agent
        self.agent = DQNAgent(model=self.model,
                              nb_actions=self.env.action_space.n,
                              memory=self.memory,
                              processor=None,
                              nb_steps_warmup=500,
                              enable_dueling_network=dueling_network,
                              dueling_type='avg',
                              enable_double_dqn=double_dqn,
                              gamma=gamma,
                              target_model_update=1000,
                              delta_clip=1.0)
        self.agent.compile(Adam(lr=3e-4), metrics=['mae'])

    def __str__(self):
        # msg = '\n'
        # return msg.join(['{}={}'.format(k, v) for k, v in self.__dict__.items()])
        return 'Agent = {} | env = {} | number_of_training_steps = {}'.format(
            Agent.name, self.env_name, self.number_of_training_steps)

    def create_model(self, name: str = 'cnn') -> Sequential:
        """
        Helper function to create and return the default MLP or CNN model.

        :param name: Neural network type ['mlp' or 'cnn']
        :return: neural network
        """
        LOGGER.info("creating model for {}".format(name))
        if name == 'cnn':
            return self._create_cnn_model()
        elif name == 'mlp':
            return self._create_mlp_model()

    def _create_cnn_model(self) -> Sequential:
        """
        Create a Convolutional neural network with dense layer at the end.

        :return: keras model
        """
        features_shape = (self.memory_frame_stack,
                          *self.env.observation_space.shape)
        model = Sequential()
        conv = Conv2D
        model.add(
            conv(input_shape=features_shape,
                 filters=5,
                 kernel_size=[10, 1],
                 padding='same',
                 activation='relu',
                 strides=[5, 1],
                 data_format='channels_first'))
        model.add(
            conv(filters=5,
                 kernel_size=[5, 1],
                 padding='same',
                 activation='relu',
                 strides=[2, 1],
                 data_format='channels_first'))
        model.add(
            conv(filters=5,
                 kernel_size=[4, 1],
                 padding='same',
                 activation='relu',
                 strides=[2, 1],
                 data_format='channels_first'))
        model.add(Flatten())
        model.add(Dense(256, activation='relu'))
        model.add(Dense(self.env.action_space.n, activation='softmax'))
        LOGGER.info(model.summary())
        return model

    def _create_mlp_model(self) -> Sequential:
        """
        Create a DENSE neural network with dense layer at the end

        :return: keras model
        """
        features_shape = (self.memory_frame_stack,
                          *self.env.observation_space.shape)
        model = Sequential()
        model.add(
            Dense(units=256, input_shape=features_shape, activation='relu'))
        model.add(Dense(units=256, activation='relu'))
        model.add(Flatten())
        model.add(Dense(self.env.action_space.n, activation='softmax'))
        LOGGER.info(model.summary())
        return model

    def start(self) -> None:
        """
        Entry point for agent training and testing

        :return: (void)
        """
        output_directory = os.path.join(self.cwd, 'dqn_weights')
        if not os.path.exists(output_directory):
            LOGGER.info('{} does not exist. Creating Directory.'.format(
                output_directory))
            os.mkdir(output_directory)

        weight_name = 'dqn_{}_{}_weights.h5f'.format(self.env_name,
                                                     self.neural_network_type)
        weights_filename = os.path.join(output_directory, weight_name)
        LOGGER.info("weights_filename: {}".format(weights_filename))

        if self.load_weights:
            LOGGER.info('...loading weights for {} from\n{}'.format(
                self.env_name, weights_filename))
            self.agent.load_weights(weights_filename)

        if self.train:
            step_chkpt = '{step}.h5f'
            step_chkpt = 'dqn_{}_weights_{}'.format(self.env_name, step_chkpt)
            checkpoint_weights_filename = os.path.join(self.cwd, 'dqn_weights',
                                                       step_chkpt)
            LOGGER.info("checkpoint_weights_filename: {}".format(
                checkpoint_weights_filename))
            log_filename = os.path.join(
                self.cwd, 'dqn_weights',
                'dqn_{}_log.json'.format(self.env_name))
            LOGGER.info('log_filename: {}'.format(log_filename))

            callbacks = [
                ModelIntervalCheckpoint(checkpoint_weights_filename,
                                        interval=250000)
            ]
            callbacks += [FileLogger(log_filename, interval=100)]

            LOGGER.info('Starting training...')
            self.agent.fit(self.env,
                           callbacks=callbacks,
                           nb_steps=self.number_of_training_steps,
                           log_interval=10000,
                           verbose=0,
                           visualize=self.visualize)
            LOGGER.info("training over.")
            LOGGER.info('Saving AGENT weights...')
            self.agent.save_weights(weights_filename, overwrite=True)
            LOGGER.info("AGENT weights saved.")
        else:
            LOGGER.info('Starting TEST...')
            self.agent.test(self.env, nb_episodes=2, visualize=self.visualize)
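The constructor docstring above describes gamma as the discount applied to future DQN returns; a tiny worked sketch of what that means (illustrative helper, not part of the class):

def discounted_return(rewards, gamma=0.999):
    # Each future reward is weighted by gamma**t, so values close to 1 make the agent far-sighted.
    return sum(r * gamma ** t for t, r in enumerate(rewards))

# discounted_return([1.0, 1.0, 1.0], gamma=0.999) -> 2.997001 (= 1 + 0.999 + 0.999**2)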
Example #21
def main(shape=10, winsize=4, test=False, num_max_test=200):
    INPUT_SHAPE = (shape, shape)
    WINDOW_LENGTH = winsize

    class SnakeProcessor(Processor):
        def process_observation(self, observation):
            # assert observation.ndim == 1, str(observation.shape)  # (height, width, channel)
            assert observation.shape == INPUT_SHAPE
            return observation.astype(
                'uint8')  # saves storage in experience memory

        def process_state_batch(self, batch):
            # We could perform this processing step in `process_observation`. In this case, however,
            # we would need to store a `float32` array instead, which is 4x more memory intensive than
            # a `uint8` array. This matters if we store 1M observations.
            processed_batch = batch.astype('float32') / 255.
            return processed_batch

        def process_reward(self, reward):
            return reward

    env = gym.make('snakenv-v0')
    np.random.seed(123)
    env.seed(123)

    input_shape = (WINDOW_LENGTH, ) + INPUT_SHAPE
    model = make_model(input_shape, 5)

    memory = SequentialMemory(limit=100000, window_length=WINDOW_LENGTH)
    processor = SnakeProcessor()

    # policy = LinearAnnealedPolicy(
    #     EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1,
    #     value_test=0, nb_steps=500000)
    policy = BoltzmannQPolicy()

    interval = 20000

    dqn = DQNAgent(model=model,
                   nb_actions=5,
                   policy=policy,
                   memory=memory,
                   processor=processor,
                   nb_steps_warmup=20000,
                   gamma=.99,
                   target_model_update=interval,
                   train_interval=4,
                   delta_clip=1.)

    dqn.compile(Adam(lr=0.0005), metrics=['mae'])
    weights_filename = 'dqn_snake_weights.h5f'

    if not test:
        # Okay, now it's time to learn something! We capture the interrupt exception so that training
        # can be prematurely aborted. Notice that now you can use the built-in Keras callbacks!
        weights_filename = 'dqn_{}_weights.h5f'.format('snake')
        checkpoint_weights_filename = 'dqn_' + 'snake' + '_weights_{step}.h5f'
        log_filename = 'dqn_{}_log.json'.format('snake')
        callbacks = [
            ModelIntervalCheckpoint(checkpoint_weights_filename,
                                    interval=interval)
        ]
        callbacks += [
            ModelIntervalCheckpoint(weights_filename, interval=interval)
        ]
        callbacks += [FileLogger(log_filename, interval=500)]
        dqn.fit(env,
                callbacks=callbacks,
                nb_steps=10000000,
                log_interval=10000,
                visualize=False)

        # After training is done, we save the final weights one more time.
        # dqn.save_weights(weights_filename, overwrite=True)

        # Finally, evaluate our algorithm for 10 episodes.
        # dqn.test(env, nb_episodes=10, visualize=True, nb_max_episode_steps=100)
    else:
        while True:
            try:
                dqn.load_weights(weights_filename)
            except Exception:
                print("weights not found, waiting")
            dqn.test(env,
                     nb_episodes=3,
                     visualize=True,
                     nb_max_episode_steps=num_max_test)
            time.sleep(5)
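SnakeProcessor above keeps observations as uint8 in replay memory and only converts sampled batches to float32, which the comment notes is a 4x memory saving; a rough back-of-the-envelope check, assuming the 10x10 observation and window of 4 used in main():

import numpy as np

n_obs, window, h, w = 1_000_000, 4, 10, 10
uint8_bytes = n_obs * window * h * w * np.dtype('uint8').itemsize
float32_bytes = n_obs * window * h * w * np.dtype('float32').itemsize
print(uint8_bytes / 1e9, 'GB as uint8')      # 0.4 GB
print(float32_bytes / 1e9, 'GB as float32')  # 1.6 GB, 4x larger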
Example #22
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

env = "CartPole-v0"
env = gym.make(env)
np.random.seed(123)
env.seed(123)
n_action_space = env.action_space.n

model = Sequential()
model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
model.add(Dense(16, ))
model.add(Activation('relu'))
model.add(Dense(n_action_space))
model.add(Activation('linear'))
policy = EpsGreedyQPolicy()
memory = SequentialMemory(limit=50000, window_length=1)
dql = DQNAgent(model=model,
               memory=memory,
               nb_actions=n_action_space,
               nb_steps_warmup=10,
               target_model_update=0.01,
               policy=policy)

dql.compile(Adam(lr=0.001), metrics=['mae'])
dql.fit(env, nb_steps=1000, visualize=True, verbose=True)
dql.test(env, nb_episodes=105, visualize=True)
Example #23
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(
    Dense(nb_actions, activation='softmax', kernel_initializer=he_normal()))

print(model.summary())

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory
from keras.optimizers import Adam

memory = SequentialMemory(limit=3000, window_length=window_length)
policy = BoltzmannQPolicy()
agent = DQNAgent(model=model, nb_actions=nb_actions, memory=memory)
#nb_steps_warmup=10, target_model_update=1e-2, policy=policy, enable_double_dqn=True)
agent.compile(Adam())

# keep the history returned by fit
history = agent.fit(env, nb_steps=50_000, visualize=False, verbose=1)

agent.test(env, nb_episodes=80000, visualize=True)
Example #24
        summary_ops_v2.graph(K.get_graph(), step=0)
    writer.close()

    agent = DQNAgent(
        model=model,
        nb_actions=n_actions,
        policy=policy,
        memory=memory,
        nb_steps_warmup=args.warmup_steps,
        gamma=.99,
        target_model_update=args.target_model_update,
        train_interval=args.train_interval,
        delta_clip=1.,
        enable_dueling_network=True)

    agent.compile(Adam(lr=args.learning_rate), metrics=['mae'])

    if args.load_weights_from is not None:
        print(f"Loading Weights From: {args.load_weights_from}")
        weights_filename = f'{args.load_weights_from}/' + 'dqn_{}_weights.h5f'.format(env_name)
        agent.load_weights(weights_filename)

    if args.mode == 'train':
        import os
        current_directory = os.getcwd()
        model_weight_dir = os.path.join(current_directory, MODEL_NAME)
        if not os.path.exists(model_weight_dir):
            os.makedirs(model_weight_dir)

        weights_filename = f'{MODEL_NAME}/dqn_{env_name}_weights.h5f'
        checkpoint_weights_filename = f'{MODEL_NAME}/dqn_' + env_name + '_weights_{step}.h5f'
Example #25
class DeepQTrading:

    #Class constructor
    #model: Keras model considered
    #explorations_iterations: a vector of pairs containing (i) the probability of random predictions and (ii) how many
    #iterations the algorithm will be run with that probability (we run the algorithm several times, i.e. several iterations)
    #outputFile: name of the file to print metrics of the training
    #ensembleFolderName: name of the file to print predictions
    #optimizer: optimizer to run

    def __init__(self,
                 model,
                 nbActions,
                 explorations_iterations,
                 outputFile,
                 ensembleFolderName,
                 optimizer="adamax"):

        self.ensembleFolderName = ensembleFolderName
        self.policy = EpsGreedyQPolicy()
        self.explorations_iterations = explorations_iterations
        self.nbActions = nbActions
        self.model = model
        #Define the memory
        self.memory = SequentialMemory(limit=10000, window_length=1)
        #Instantiate the agent with parameters received
        self.agent = DQNAgent(model=self.model,
                              policy=self.policy,
                              nb_actions=self.nbActions,
                              memory=self.memory,
                              nb_steps_warmup=200,
                              target_model_update=1e-1,
                              enable_double_dqn=True,
                              enable_dueling_network=True)

        #Compile the agent with the optimizer given as parameter
        if optimizer == "adamax":
            self.agent.compile(Adamax(), metrics=['mae'])
        if optimizer == "adadelta":
            self.agent.compile(Adadelta(), metrics=['mae'])
        if optimizer == "sgd":
            self.agent.compile(SGD(), metrics=['mae'])
        if optimizer == "rmsprop":
            self.agent.compile(RMSprop(), metrics=['mae'])
        if optimizer == "nadam":
            self.agent.compile(Nadam(), metrics=['mae'])
        if optimizer == "adagrad":
            self.agent.compile(Adagrad(), metrics=['mae'])
        if optimizer == "adam":
            self.agent.compile(Adam(), metrics=['mae'])
        if optimizer == "radam":
            self.agent.compile(RAdam(total_steps=5000,
                                     warmup_proportion=0.1,
                                     min_lr=1e-5),
                               metrics=['mae'])

        #Save the weights of the agents in the q.weights file
        #Save random weights
        self.agent.save_weights("q.weights", overwrite=True)

        #Load data
        self.train_data = pd.read_csv('./dataset/jpm/train_data.csv')
        self.validation_data = pd.read_csv('./dataset/jpm/train_data.csv')
        self.test_data = pd.read_csv('./dataset/jpm/test_data.csv')

        #Call the callback for training, validation and test in order to show results for each iteration
        self.trainer = ValidationCallback()
        self.validator = ValidationCallback()
        self.tester = ValidationCallback()
        self.outputFileName = outputFile

    def run(self):
        #Initialize the environments
        trainEnv = validEnv = testEnv = " "

        if not os.path.exists(self.outputFileName):
            os.makedirs(self.outputFileName)

        file_name = self.outputFileName + "/results-agent-training.csv"

        self.outputFile = open(file_name, "w+")
        #write the first row of the csv
        self.outputFile.write("Iteration," + "trainAccuracy," +
                              "trainCoverage," + "trainReward," +
                              "trainLong%," + "trainShort%," +
                              "trainLongAcc," + "trainShortAcc," +
                              "trainLongPrec," + "trainShortPrec," +
                              "validationAccuracy," + "validationCoverage," +
                              "validationReward," + "validationLong%," +
                              "validationShort%," + "validationLongAcc," +
                              "validationShortAcc," + "validLongPrec," +
                              "validShortPrec," + "testAccuracy," +
                              "testCoverage," + "testReward," + "testLong%," +
                              "testShort%," + "testLongAcc," +
                              "testShortAcc," + "testLongPrec," +
                              "testShortPrec\n")

        #Prepare the training and validation files for saving them later
        ensambleValid = pd.DataFrame(
            index=self.validation_data.loc[:, 'date_time'].drop_duplicates().tolist())
        ensambleTest = pd.DataFrame(
            index=self.test_data.loc[:, 'date_time'].drop_duplicates().tolist())

        #Put the name of the index for validation and testing
        ensambleValid.index.name = 'date_time'
        ensambleTest.index.name = 'date_time'

        #Explorations are epochs considered, or how many times the agent will play the game.
        for eps in self.explorations_iterations:

            #the policy will use eps[0] as its exploration rate, so random actions (predictions) are taken with probability eps[0]
            self.policy.eps = eps[0]

            #there will be eps[1] iterations (e.g. 25) for this exploration setting
            for i in range(0, eps[1]):

                del (trainEnv)
                #Define the training, validation and testing environments with their respective callbacks
                trainEnv = SpEnv(data=self.train_data, callback=self.trainer)

                del (validEnv)
                validEnv = SpEnv(data=self.validation_data,
                                 ensamble=ensambleValid,
                                 callback=self.validator,
                                 columnName="iteration" + str(i))

                del (testEnv)
                testEnv = SpEnv(data=self.test_data,
                                callback=self.tester,
                                ensamble=ensambleTest,
                                columnName="iteration" + str(i))

                #Reset the callback
                self.trainer.reset()
                self.validator.reset()
                self.tester.reset()

                #Reset the training environment
                trainEnv.resetEnv()

                #Train the agent
                #The agent receives as input one environment
                self.agent.fit(trainEnv,
                               nb_steps=len(self.train_data),
                               visualize=False,
                               verbose=0)

                #Get the info from the train callback
                (_, trainCoverage, trainAccuracy, trainReward, trainLongPerc,
                 trainShortPerc, trainLongAcc, trainShortAcc, trainLongPrec,
                 trainShortPrec) = self.trainer.getInfo()

                print("Iteration " + str(i + 1) + " TRAIN:  accuracy: " +
                      str(trainAccuracy) + " coverage: " + str(trainCoverage) +
                      " reward: " + str(trainReward))

                #Reset the validation environment
                validEnv.resetEnv()
                #Test the agent on validation data
                self.agent.test(validEnv,
                                nb_episodes=len(self.validation_data),
                                visualize=False,
                                verbose=0)

                #Get the info from the validation callback
                (_, validCoverage, validAccuracy, validReward, validLongPerc,
                 validShortPerc, validLongAcc, validShortAcc, validLongPrec,
                 validShortPrec) = self.validator.getInfo()
                #Print callback values on the screen
                print("Iteration " + str(i + 1) + " VALIDATION:  accuracy: " +
                      str(validAccuracy) + " coverage: " + str(validCoverage) +
                      " reward: " + str(validReward))

                #Reset the testing environment
                testEnv.resetEnv()
                #Test the agent on testing data
                self.agent.test(testEnv,
                                nb_episodes=len(self.test_data),
                                visualize=False,
                                verbose=0)
                #Get the info from the testing callback
                (_, testCoverage, testAccuracy, testReward, testLongPerc,
                 testShortPerc, testLongAcc, testShortAcc, testLongPrec,
                 testShortPrec) = self.tester.getInfo()
                #Print callback values on the screen
                print("Iteration " + str(i + 1) + " TEST:  acc: " +
                      str(testAccuracy) + " cov: " + str(testCoverage) +
                      " rew: " + str(testReward))
                print(" ")

                #write the metrics in a text file
                self.outputFile.write(
                    str(i) + "," + str(trainAccuracy) + "," +
                    str(trainCoverage) + "," + str(trainReward) + "," +
                    str(trainLongPerc) + "," + str(trainShortPerc) + "," +
                    str(trainLongAcc) + "," + str(trainShortAcc) + "," +
                    str(trainLongPrec) + "," + str(trainShortPrec) + "," +
                    str(validAccuracy) + "," + str(validCoverage) + "," +
                    str(validReward) + "," + str(validLongPerc) + "," +
                    str(validShortPerc) + "," + str(validLongAcc) + "," +
                    str(validShortAcc) + "," + str(validLongPrec) + "," +
                    str(validShortPrec) + "," + str(testAccuracy) + "," +
                    str(testCoverage) + "," + str(testReward) + "," +
                    str(testLongPerc) + "," + str(testShortPerc) + "," +
                    str(testLongAcc) + "," + str(testShortAcc) + "," +
                    str(testLongPrec) + "," + str(testShortPrec) + "\n")

        #Close the file
        self.outputFile.close()

        if not os.path.exists("./Output/ensemble/" + self.ensembleFolderName):
            os.makedirs("./Output/ensemble/" + self.ensembleFolderName)

        ensambleValid.to_csv("./Output/ensemble/" + self.ensembleFolderName +
                             "/ensemble_valid.csv")
        ensambleTest.to_csv("./Output/ensemble/" + self.ensembleFolderName +
                            "/ensemble_test.csv")

    #Function to end the Agent
    def end(self):
        print("FINISHED")
Example #26
def training_game():
    env = Environment()

    input_shape = (FLAGS.screen_size, FLAGS.screen_size, 1)
    nb_actions = 12  # Number of actions

    model = neural_network_model(input_shape, nb_actions)
    memory = SequentialMemory(limit=3500, window_length=_WINDOW_LENGTH)

    processor = SC2Proc()

    # Policy

    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                  attr="eps",
                                  value_max=1,
                                  value_min=0.7,
                                  value_test=.0,
                                  nb_steps=GLOBAL_STEPS)

    # Agent

    dqn = DQNAgent(model=model,
                   nb_actions=nb_actions,
                   memory=memory,
                   enable_double_dqn=False,
                   nb_steps_warmup=GLOBAL_STEPS_WARMUP,
                   target_model_update=1e-2,
                   policy=policy,
                   batch_size=150,
                   processor=processor)

    dqn.compile(Adam(lr=.001), metrics=["mae"])

    # Tensorboard callback

    callbacks = keras.callbacks.TensorBoard(log_dir='./Graph',
                                            histogram_freq=0,
                                            write_graph=True,
                                            write_images=False)

    # Save the parameters and upload them when needed

    name = FLAGS.mini_game
    w_file = "dqn_{}_weights.h5f".format(name)
    check_w_file = "train_w" + name + "_weights.h5f"

    if SAVE_MODEL:
        check_w_file = "train_w" + name + "_weights_{step}.h5f"

    log_file = "training_w_{}_log.json".format(name)

    if LOAD_MODEL:
        dqn.load_weights(w_file)

    #dqn.fit(env, callbacks=callbacks, nb_steps=GLOBAL_STEPS, action_repetition=2, log_interval=1e4, verbose=2)
    dqn.fit(env,
            nb_steps=GLOBAL_STEPS,
            action_repetition=2,
            log_interval=1000,
            verbose=2)

    dqn.save_weights(w_file, overwrite=True)
    dqn.test(env, action_repetition=2, nb_episodes=30, visualize=False)
Example #27
model.add(layers.Flatten())
for i in range(3):
    model.add(layers.Dense(1024, swish))
model.add(layers.Dense(nb_actions))
print(model.summary())

env.render(mode='cv2')
cv2.waitKey(3000)
cv2.destroyAllWindows()

dqn = DQNAgent(model=model,
               nb_actions=nb_actions,
               memory=SequentialMemory(limit=50000, window_length=1),
               target_model_update=1e-2,
               policy=BoltzmannQPolicy())
dqn.compile(Adam(1e-3), metrics=['mse', 'mae', 'logcosh'])

dqn.fit(env, nb_steps=10000000, visualize=False, verbose=2)

dqn.save_weights(f'dqn_snake_weights.h5f', overwrite=True)

env.draw()
env.render(mode='cv2')
cv2.waitKey(3000)
cv2.destroyAllWindows()

dqn.test(env, nb_episodes=5, visualize=True)

cv2.waitKey(5000)
cv2.destroyAllWindows()
Example #28
0
merged = Dense(nb_neuron_input, activation='tanh')(merged)
merged = Dense(nb_neuron_output, activation='softmax')(merged)

model = Model(inputs=[inputs], outputs=[merged])
model.summary()
model.compile(Adam(), loss='mean_squared_error')

memory = SequentialMemory(limit=50000, window_length=1)
policy = MaxBoltzmannQPolicy()
dqn = DQNAgent(model=model,
               nb_actions=nb_neuron_output,
               memory=memory,
               nb_steps_warmup=10,
               target_model_update=1e-2,
               policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae', 'accuracy'])

metrics = Metrics(dqn, env)
#fileName = '1D_advanced_Sequential1000_BoltzmannQ_10000steps(7)'
#fileName = '1D_advanced_Sequential1000_EpsGreedyQ_10000steps(7)'
#fileName = '1D_advanced_Sequential1000_MaxBoltzmannQ_10000steps(7)'
#fileName = '1D_advanced_Sequential50000_BoltzmannQPolicy_10000steps(7)'
#fileName = '1D_advanced_Sequential50000_MaxBoltzmannQ_1000000steps(0)'
fileName = '1D__Sequential50000_BoltzmannQ_1000000steps(0)'

dqn.load_weights('./output/' + fileName + '.h5f')
dqn.test(env, nb_episodes=1, visualize=False, callbacks=[metrics])

metrics.export_figs(fileName)

cumulated_reward = metrics.cumulated_reward()
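The Metrics helper used above is project-specific and not shown in this excerpt. A minimal sketch of such a callback, assuming it only needs to accumulate the per-step rewards reported by keras-rl during dqn.test() (the names Metrics, export_figs and cumulated_reward come from the snippet above; everything else is an assumption):

import matplotlib.pyplot as plt
from rl.callbacks import Callback

class Metrics(Callback):
    def __init__(self, dqn, env):
        super().__init__()
        self.dqn = dqn
        self.env = env
        self.rewards = []

    def on_step_end(self, step, logs={}):
        # keras-rl passes the per-step reward in the logs dict
        self.rewards.append(logs.get('reward', 0.0))

    def cumulated_reward(self):
        return sum(self.rewards)

    def export_figs(self, file_name):
        # dump a simple reward curve for later inspection
        plt.plot(self.rewards)
        plt.xlabel('step')
        plt.ylabel('reward')
        plt.savefig('./output/' + file_name + '.png')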
Example #29
0
# the agent initially explores the environment (high eps) and then gradually sticks to what it knows
# (low eps). We also set a dedicated eps value that is used during testing. Note that we set it to 0.05
# so that the agent still performs some random actions. This ensures that the agent cannot get stuck.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=.05,
    nb_steps=1000000)
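# Under the hood the annealing is linear in the current training step, roughly
#   eps(step) = max(value_min, value_max - (value_max - value_min) * step / nb_steps)
# so eps decays from 1.0 to 0.1 over the first million steps and then stays at 0.1.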

# The trade-off between exploration and exploitation is difficult and an on-going research topic.
# If you want, you can experiment with the parameters or use a different policy. Another popular one
# is Boltzmann-style exploration:
# policy = BoltzmannQPolicy(tau=1.)
# Feel free to give it a try!

dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, window_length=WINDOW_LENGTH, memory=memory,
    processor=processor, nb_steps_warmup=50000, gamma=.99, delta_range=(-1., 1.), reward_range=(-1., 1.),
    target_model_update=10000, train_interval=4)
dqn.compile(Adam(lr=.00025), metrics=['mae'])

if args.mode == 'train':
    # Okay, now it's time to learn something! We capture the interrupt exception so that training
    # can be prematurely aborted. Notice that you can use the built-in Keras callbacks!
    weights_filename = 'dqn_{}_weights.h5f'.format(args.env_name)
    checkpoint_weights_filename = 'dqn_' + args.env_name + '_weights_{step}.h5f'
    log_filename = 'dqn_{}_log.json'.format(args.env_name)
    callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000)]
    callbacks += [FileLogger(log_filename, interval=100)]
    dqn.fit(env, callbacks=callbacks, nb_steps=1750000, log_interval=10000)

    # After training is done, we save the final weights one more time.
    dqn.save_weights(weights_filename, overwrite=True)

    # Finally, evaluate our algorithm for 10 episodes.
Example #30
0
def train():
    # Get the environment and extract the number of actions.
    env = gym.make(ENV_NAME)
    np.random.seed(123)
    env.seed(123)
    nb_actions = env.action_space.n

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    # Next, we build a very simple model.
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))
    model.summary()

    # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
    # even the metrics!
    memory = SequentialMemory(limit=50000, window_length=1)
    policy = BoltzmannQPolicy()

    # Okay, now it's time to learn something! We visualize the training here for show, but this
    # slows down training quite a lot. You can always safely abort the training prematurely using
    # Ctrl + C.
    if REWARD == "normal":
        dqn_normal = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
                              target_model_update=1e-2, policy=policy)
        dqn_normal.compile(Adam(lr=1e-3), metrics=['mae'])
        history_normal = dqn_normal.fit(env, nb_steps=10000, visualize=False, verbose=2)
        dqn_normal.save_weights(os.path.join(LOG_DIR, 'dqn_normal_{}_weights.h5f'.format(ENV_NAME)), overwrite=True)
        dqn_normal.test(env, nb_episodes=10, visualize=False, verbose=2)
        
        pandas.DataFrame(history_normal.history).to_csv(os.path.join(LOG_DIR, "normal.csv"))

    elif REWARD == "noisy":
        if not SMOOTH:
            processor_noisy = CartpoleProcessor(e_=ERR_N, e=ERR_P, surrogate=False)
        else:
            processor_noisy = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=True, surrogate=False)

        # processor_noisy = CartpoleSurrogateProcessor(e_=ERR_N, e=ERR_P, surrogate=False)
        dqn_noisy = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
                             target_model_update=1e-2, policy=policy, processor=processor_noisy)
        dqn_noisy.compile(Adam(lr=1e-3), metrics=['mae'])
        history_noisy = dqn_noisy.fit(env, nb_steps=10000, visualize=False, verbose=2)
        if not SMOOTH:
            dqn_noisy.save_weights(os.path.join(LOG_DIR, 'dqn_noisy_{}_weights.h5f'.format(ENV_NAME)), overwrite=True)
            pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy.csv"))
        else:
            dqn_noisy.save_weights(os.path.join(LOG_DIR, 'dqn_noisy_smooth_{}_weights.h5f'.format(ENV_NAME)), overwrite=True)
            pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy_smooth.csv"))

        dqn_noisy.test(env, nb_episodes=10, visualize=False, verbose=2)
        

    elif REWARD == "surrogate":
        if not SMOOTH:
            processor_surrogate = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=False, surrogate=True)
        else:
            processor_surrogate = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=True, surrogate=True)

        # processor_surrogate = CartpoleSurrogateProcessor(e_=ERR_N, e=ERR_P, surrogate=True)
        dqn_surrogate = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
                                 target_model_update=1e-2, policy=policy, processor=processor_surrogate)
        dqn_surrogate.compile(Adam(lr=1e-3), metrics=['mae'])    
        history_surrogate = dqn_surrogate.fit(env, nb_steps=10000, visualize=False, verbose=2)
        if not SMOOTH:
            dqn_surrogate.save_weights(os.path.join(LOG_DIR, 'dqn_surrogate_{}_weights.h5f'.format(ENV_NAME)), overwrite=True)
            pandas.DataFrame(history_surrogate.history).to_csv(os.path.join(LOG_DIR, "surrogate.csv"))
        else:
            dqn_surrogate.save_weights(os.path.join(LOG_DIR, 'dqn_surrogate_smooth_{}_weights.h5f'.format(ENV_NAME)), overwrite=True)
            pandas.DataFrame(history_surrogate.history).to_csv(os.path.join(LOG_DIR, "surrogate_smooth.csv"))

        dqn_surrogate.test(env, nb_episodes=10, visualize=False, verbose=2)

    else:
        raise NotImplementedError
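CartpoleProcessor above is defined elsewhere in this project; the general idea (feeding the agent a perturbed or corrected reward) only requires overriding process_reward on a keras-rl Processor. A minimal sketch under that assumption, using a hypothetical NoisyRewardProcessor that flips the reward sign with probability e (the real class additionally supports smoothing and the surrogate correction):

import numpy as np
from rl.core import Processor

class NoisyRewardProcessor(Processor):
    """Illustration only: corrupts the reward seen by the agent with probability e."""

    def __init__(self, e=0.1):
        self.e = e

    def process_reward(self, reward):
        # with probability e, report the flipped reward instead of the true one
        if np.random.rand() < self.e:
            return -reward
        return reward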
Example #31
0
File: ddqn.py  Project: zyz8821/FleetSim
class DDQN:
    def __init__(
        self,
        env,
        name,
        memory_limit=10000,
        nb_eps=10000,
        nb_warmup=100,
        dueling=True,
        double=True,
    ):
        # Set a fixed seed for the environment
        self.env = env
        self.env.seed(123)
        np.random.seed(123)
        random.seed(123)

        self.name = name
        self.log_filename = "./logs/{}_log.json".format(self.name)
        self.weights_filename = "./results/{}_weights.h5f".format(self.name)
        self.result_filename = "./results/{}_result.csv".format(self.name)

        # Extract the number of actions from the environment
        nb_action = self.env.action_space.spaces[0].n
        nb_actions = nb_action ** len(self.env.action_space.spaces)
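        # The flat index packs one choice per sub-space; a sampled index a could
        # be decoded back into per-space actions as base-nb_action digits, e.g.
        # (a sketch, the exact encoding depends on the environment):
        #   decoded = [(a // nb_action ** i) % nb_action
        #              for i in range(len(self.env.action_space.spaces))]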
        nb_states = self.env.observation_space.shape

        # Next, we build a very simple model.
        model = self._build_nn(nb_states, nb_actions)

        # Next, we define the replay memory
        memory = SequentialMemory(limit=memory_limit, window_length=1)

        policy = LinearAnnealedPolicy(
            EpsGreedyQPolicy(),
            attr="eps",
            nb_steps=nb_eps,
            value_max=1.0,  # Start with full random
            value_min=0.1,  # After nb_steps, arrive at 10% random
            value_test=0.0,  # (Don't) pick random action when testing
        )

        # Configure and compile our agent:
        # You can use every built-in Keras optimizer and even the metrics!
        self.dqn = DQNAgent(
            model=model,
            nb_actions=nb_actions,
            memory=memory,
            nb_steps_warmup=nb_warmup,
            enable_dueling_network=dueling,  # Enable dueling
            dueling_type="avg",
            enable_double_dqn=double,  # Enable double dqn
            target_model_update=1e-2,
            policy=policy,
        )
        self.dqn.compile(Adam(lr=1e-2), metrics=["mae"])

    def _build_nn(self, nb_states, nb_actions):
        model = Sequential()
        model.add(Flatten(input_shape=(1,) + nb_states))
        model.add(Dense(16))
        model.add(Activation("relu"))
        model.add(Dense(16))
        model.add(Activation("relu"))
        model.add(Dense(16))
        model.add(Activation("relu"))
        model.add(Dense(nb_actions))
        model.add(Activation("linear"))
        return model

    def run(self, steps):
        callbacks = [FileLogger(self.log_filename)]
        self.dqn.fit(
            self.env,
            callbacks=callbacks,
            nb_steps=steps,
            visualize=False,
            verbose=1,
            log_interval=10000,
        )
        # After training is done, we save the final weights.
        self.dqn.save_weights(self.weights_filename, overwrite=True)

    def test(self):
        self.dqn.load_weights(self.weights_filename)
        self.dqn.test(self.env, nb_episodes=1, visualize=False)
        self.env.save_results(self.result_filename)
Example #32
0
#!/usr/bin/env python3
from PIL import Image
import gym
from rl.agents.dqn import DQNAgent
from rl.memory import SequentialMemory
import tensorflow.keras as K

INPUT_SHAPE = (84, 84)
WINDOW_LENGTH = 4

build_model = __import__('train').build_model
AtariProcessor = __import__('train').AtariProcessor

if __name__ == '__main__':
    env = gym.make("Breakout-v0")
    env.reset()
    num_actions = env.action_space.n
    model = build_model(num_actions)  # deep conv net
    memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
    processor = AtariProcessor()
    dqn = DQNAgent(model=model,
                   nb_actions=num_actions,
                   processor=processor,
                   memory=memory)
    dqn.compile(K.optimizers.Adam(lr=.00025), metrics=['mae'])

    # load weights.
    dqn.load_weights('policy.h5')

    # evaluate algorithm for 10 episodes.
    dqn.test(env, nb_episodes=10, visualize=True)
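build_model and AtariProcessor come from the local train module, which is not shown here. For reference, the keras-rl Atari example implements the processor roughly as below; this sketch assumes train.py follows the same convention (84x84 grayscale frames stored as uint8, rewards clipped to [-1, 1]):

import numpy as np
from PIL import Image
from rl.core import Processor

class AtariProcessor(Processor):
    def process_observation(self, observation):
        # resize to 84x84, convert to grayscale, keep uint8 to save replay memory
        img = Image.fromarray(observation).resize(INPUT_SHAPE).convert('L')
        return np.array(img).astype('uint8')

    def process_state_batch(self, batch):
        # rescale the stored uint8 frames to [0, 1] only when a batch is sampled
        return batch.astype('float32') / 255.

    def process_reward(self, reward):
        # clip rewards as in the original DQN setup
        return np.clip(reward, -1., 1.)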
Example #33
0
logbook().record_hyperparameter('Memory Type', str(type(memory)))
logbook().record_hyperparameter('Memory Limit', memory.limit)
logbook().record_hyperparameter('Memory Window Length', memory.window_length)
logbook().record_hyperparameter('nb_steps_warmup', dqn.nb_steps_warmup) #info on this parameter here: https://datascience.stackexchange.com/questions/46056/in-keras-library-what-is-the-meaning-of-nb-steps-warmup-in-the-dqnagent-objec
logbook().record_hyperparameter('target_model_update', dqn.target_model_update) #info on this parameter here: https://github.com/keras-rl/keras-rl/issues/55
logbook().record_hyperparameter('nb_actions', nb_actions)
logbook().record_hyperparameter('batch_size', dqn.batch_size) #defaults to 32. Info here: https://radiopaedia.org/articles/batch-size-machine-learning
logbook().record_hyperparameter('gamma', dqn.gamma) #defaults to 0.99. 'Discount rate' according to Advanced Deep Learning with Keras
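# For context: in keras-rl a target_model_update value >= 1 triggers a hard copy of the
# online weights into the target network every int(target_model_update) steps, while a
# value < 1 is treated as a soft-update factor tau applied on each update:
#   target_weights = tau * online_weights + (1 - tau) * target_weights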


# dqn.compile(Adam(lr=1e-3), metrics=['mae'])
# Needs general tuning, usually model-specific - https://machinelearningmastery.com/learning-rate-for-deep-learning-neural-networks/
# learning_rate = 1e-6
# learning_rate = 1e-3
learning_rate = 1e-1
dqn.compile(Adam(lr=learning_rate), metrics=['mae'])
logbook().record_hyperparameter('Learning Rate', learning_rate)

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
# dqn.fit(env, nb_steps=50000, visualize=False, verbose=2)
# nb_steps = 500000
# nb_steps = 50000
# nb_steps = 25000
nb_steps = 5000
# nb_steps = 50
dqn.fit(env, nb_steps=nb_steps, visualize=False, verbose=2)
logbook().record_hyperparameter('nb_steps', nb_steps)

# After training is done, we save the final weights.
Example #34
0
# is Boltzmann-style exploration:
# policy = BoltzmannQPolicy(tau=1.)
# Feel free to give it a try!

dqn = DQNAgent(model=model,
               nb_actions=nb_actions,
               policy=policy,
               memory=memory,
               processor=processor,
               nb_steps_warmup=50000,
               gamma=.99,
               target_model_update=10000,
               train_interval=4,
               delta_clip=1.)

dqn.compile(Adam(lr=.00025))

if args.mode == 'train' and args.weights != None:
    dqn.load_weights(args.weights)
    print('Model weights from file {} successfully loaded'.format(
        args.weights))

date = datetime.now().strftime("%Y-%m-%d")
if args.mode == 'train':
    # Okay, now it's time to learn something! We capture the interrupt exception so that training
    # can be prematurely aborted. Notice that now you can use the built-in Keras callbacks!
    weights_filename = 'dqn_{}_weights.h5f'.format(args.env_name)
    checkpoint_weights_filename = 'dqn_' + args.env_name + '_weights_{step}.h5f'
    log_filename = 'dqn_{}_log.json'.format(args.env_name)
    callbacks = [
        ModelIntervalCheckpoint(checkpoint_weights_filename,
Example #35
0
    )

    # Defining our DQN
    dqn = DQNAgent(
        model=model,
        nb_actions=18,
        policy=policy,
        memory=memory,
        nb_steps_warmup=1000,
        gamma=0.5,
        target_model_update=1,
        delta_clip=0.01,
        enable_double_dqn=True,
    )

    dqn.compile(Adam(lr=0.00025), metrics=["mae"])

    # Training
    env_player.play_against(
        env_algorithm=dqn_training,
        opponent=opponent,
        env_algorithm_kwargs={
            "dqn": dqn,
            "nb_steps": NB_TRAINING_STEPS
        },
    )
    model.save("model_%d" % NB_TRAINING_STEPS)

    # Evaluation
    print("Results against random player:")
    env_player.play_against(
Example #36
0
def main():
    try:
        # env = gym.make("AirRaid-v0")
        # env = gym.make("slitherio-v0")
        env = gym.make("slitherio-v0", headless=False, width=500, height=500)

        model_callbacks = [
            ModelIntervalCheckpoint(SAVED_MODEL_NAME,
                                    interval=MODEL_SAVE_STEP_INTERVAL,
                                    verbose=0)
        ]

        # model = conv_model(env)
        # model = lstm_conv_model(env)
        model = full_combined_conv_lstm_model(env)
        model.load_weights(SAVED_MODEL_NAME)
        # model = enhanced_conv_lstm_model(env)
        # print(model.summary())
        major_rounds = int(NSTEPS / 1000)
        max_total_eps = 1.0
        min_total_eps = 0.1
        eps_range = max_total_eps - min_total_eps
        eps_step = eps_range / major_rounds
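        # For illustration, with NSTEPS = 10000 this gives major_rounds = 10 and
        # eps_step = 0.09: round 0 anneals eps 1.00 -> 0.91, round 1 anneals
        # 0.91 -> 0.82, ..., and the final round ends at the overall minimum of 0.10.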

        for major_step in range(major_rounds):
            print("Major step", major_step, "of", major_rounds)

            max_eps = max_total_eps - eps_step * major_step
            min_eps = max_eps - eps_step

            policy = LinearAnnealedPolicy(
                EpsGreedyQPolicy(eps=0.1),
                attr="eps",
                value_max=max_eps,
                value_min=min_eps,
                value_test=0.1,
                nb_steps=1000,
            )
            memory = SequentialMemory(limit=DQN_MEMORY_SIZE, window_length=1)
            dqn = DQNAgent(
                model=model,
                nb_actions=env.action_space.n,
                memory=memory,
                target_model_update=1e-2,
                policy=policy,
                enable_double_dqn=True,
                processor=LSTMProcessor(),
            )
            dqn.compile(Adam(lr=1e-3), metrics=["mae"])
            dqn.fit(
                env,
                nb_steps=1000,
                visualize=False,
                verbose=1,
                callbacks=model_callbacks,
                log_interval=1000,  # keep log_interval <= nb_steps; the default of 10000 would never log within a 1000-step fit
            )

        env.reset()
        dqn.test(env, nb_episodes=5, visualize=True)

        env.close()

    except WebDriverException as e:
        print(e)

    except Exception as e:
        env.close()
        print(e)
nb_actions = env.action_space.n

# Next, we build a neural network model
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(3, input_dim=1, activation='tanh'))
model.add(Dense(nb_actions))
model.add(Activation('sigmoid'))

policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=-1, value_test=.05,
                              nb_steps=1000000)

memory = SequentialMemory(limit=10000000, window_length=1)
dqn2 = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=50,
              target_model_update=1e-2, policy=policy, enable_double_dqn=True, enable_dueling_network=False)
dqn2.compile(Adam(lr=1e-3), metrics=['mae', 'acc'])

import os.path

file_path = 'Double_DQN_Taxi.h5f'
if os.path.exists(file_path):
    dqn2.load_weights(file_path)


class Saver(Callback):
    def on_episode_end(self, episode, logs={}):
        print('episode callback')
        if episode % 1 == 0:
            self.model.save_weights('Double_DQN_Taxi.h5f', overwrite=True)

Example #38
0
File: test.py  Project: NibuTake/Qlearning
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam
import gym
from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

env = gym.make('MountainCar-v0')
nb_actions = env.action_space.n

model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))

memory = SequentialMemory(limit=30000, window_length=1)

policy = EpsGreedyQPolicy(eps=0.001)
dqn = DQNAgent(model=model, nb_actions=nb_actions,gamma=0.99, memory=memory, nb_steps_warmup=10,
               target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

history = dqn.fit(env, nb_steps=30000, visualize=False, verbose=2)

dqn.test(env, nb_episodes=1, visualize=True)
Example #39
0
    # If you want, you can experiment with the parameters or use a different policy. Another popular one
    # is Boltzmann-style exploration:
    #policy = BoltzmannQPolicy(tau=1.)
    # Feel free to give it a try!

    dqn = DQNAgent(model=model,
                   nb_actions=nb_actions,
                   policy=policy,
                   memory=memory,
                   processor=processor,
                   nb_steps_warmup=50000,
                   gamma=.99,
                   target_model_update=10000,
                   train_interval=4,
                   delta_clip=1.)
    dqn.compile(Adam(lr=.00025), metrics=['mae'])
    agents.append(dqn)

mdqn = IndieMultiAgent(agents)
if args.mode == 'train':
    # Okay, now it's time to learn something! We capture the interrupt exception so that training
    # can be prematurely aborted. Notice that you can use the built-in Keras callbacks!
    weights_filename = 'dqn_{}_weights.h5f'.format(args.env_name)
    checkpoint_weights_filename = 'dqn_' + args.env_name + '_weights_{step}.h5f'
    log_filename = 'dqn_{}_log.json'.format(args.env_name)
    callbacks = [
        ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000)
    ]
    tensorboard = TensorBoard(log_dir="logs/{}_ESP_Greedy_{}".format(
        args.env_name, strftime("%Y-%m-%d %H:%M:%S", gmtime())))
    callbacks += [FileLogger(log_filename, interval=100), tensorboard]