def main(): # Create env np.random.seed(SEED) env = PentagoEnv(SIZE, agent_starts = AGENT_STARTS) env.seed(SEED) nb_actions = env.action_space.n # Define model model = Sequential() model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) model.add(Dense(64, activation='relu')) model.add(Dense(128, activation='sigmoid')) model.add(Dense(nb_actions)) print(model.summary()) # Configure and compile agent memory = SequentialMemory(limit=5000, window_length=1) policy = BoltzmannQPolicy() dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=1000, target_model_update=1000, policy=policy) optimizer=RMSprop(lr=0.00025, epsilon=0.01) dqn.compile(optimizer) # Okay, now it's time to learn something! We visualize the training here for show, but this # slows down training quite a lot. You can always safely abort the training prematurely using # Ctrl + C. dqn.fit(env, nb_steps=50000, visualize=True, verbose=1) # After training is done, we save the final weights. dqn.save_weights('weights/dqn-{}-weights-{}.h5f'.format(TAG, datetime.datetime.now()))
def main(): np.random.seed(123) env = PentagoEnv(SIZE) env.seed(123) nb_actions = env.action_space.n model = Sequential() #model.add(Reshape((SIZE ** 2,), input_shape=(SIZE, SIZE))) model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) model.add(Dense(64, activation='relu')) model.add(Dense(128, activation='sigmoid')) model.add(Dense(nb_actions)) print(model.summary()) # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and # even the metrics! memory = SequentialMemory(limit=5000, window_length=1) policy = BoltzmannQPolicy() dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=1000, target_model_update=1e-2, policy=policy) optimizer=RMSprop(lr=0.00025, epsilon=0.01) dqn.compile(optimizer) # Okay, now it's time to learn something! We visualize the training here for show, but this # slows down training quite a lot. You can always safely abort the training prematurely using # Ctrl + C. dqn.fit(env, nb_steps=50000, visualize=True, verbose=1) # After training is done, we save the final weights. dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)
def test_single_dqn_input(): model = Sequential() model.add(Flatten(input_shape=(2, 3))) model.add(Dense(2)) memory = SequentialMemory(limit=10, window_length=2) for double_dqn in (True, False): agent = DQNAgent(model, memory=memory, nb_actions=2, nb_steps_warmup=5, batch_size=4, enable_double_dqn=double_dqn) agent.compile('sgd') agent.fit(MultiInputTestEnv((3,)), nb_steps=10)
class DQN(BaseAgent): def __init__(self, model, processor, policy, test_policy, num_actions): # Replay memory memory = SequentialMemory(limit=opt.dqn_replay_memory_size, window_length=opt.dqn_window_length) self.agent = DQNAgent(model=model, nb_actions=num_actions, policy=policy, test_policy=test_policy, memory=memory, processor=processor, batch_size=opt.dqn_batch_size, nb_steps_warmup=opt.dqn_nb_steps_warmup, gamma=opt.dqn_gamma, target_model_update=opt.dqn_target_model_update, enable_double_dqn=opt.enable_double_dqn, enable_dueling_network=opt.enable_dueling_network, train_interval=opt.dqn_train_interval, delta_clip=opt.dqn_delta_clip) self.agent.compile(optimizer=keras.optimizers.Adam(lr=opt.dqn_learning_rate), metrics=['mae']) def fit(self, env, num_steps, weights_path=None, visualize=False): callbacks = [] if weights_path is not None: callbacks += [ModelIntervalCheckpoint(weights_path, interval=50000, verbose=1)] self.agent.fit(env=env, nb_steps=num_steps, action_repetition=opt.dqn_action_repetition, callbacks=callbacks, log_interval=opt.log_interval, test_interval=opt.test_interval, test_nb_episodes=opt.test_nb_episodes, test_action_repetition=opt.dqn_action_repetition, visualize=visualize, test_visualize=visualize, verbose=1) def test(self, env, num_episodes, visualize=False): self.agent.test(env=env, nb_episodes=num_episodes, action_repetition=opt.dqn_action_repetition, verbose=2, visualize=visualize) def save(self, out_dir): self.agent.save_weights(out_dir, overwrite=True) def load(self, out_dir): self.agent.load_weights(out_dir)
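# A minimal, hypothetical driver for the DQN wrapper above; `model`, `env`, and the exploration
# schedule here are illustrative stand-ins (the real ones come from the surrounding project and
# its `opt` config object), not part of the original snippet:
from rl.policy import LinearAnnealedPolicy, EpsGreedyQPolicy, GreedyQPolicy

train_policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1.0,
                                    value_min=0.1, value_test=0.05, nb_steps=1000000)
agent = DQN(model=model, processor=None, policy=train_policy,
            test_policy=GreedyQPolicy(), num_actions=env.action_space.n)
agent.fit(env, num_steps=1000000, weights_path='dqn_checkpoint.h5f')
agent.test(env, num_episodes=10)
agent.save('dqn_final.h5f')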
def test_multi_dqn_input(): input1 = Input(shape=(2, 3)) input2 = Input(shape=(2, 4)) x = Concatenate()([input1, input2]) x = Flatten()(x) x = Dense(2)(x) model = Model(inputs=[input1, input2], outputs=x) memory = SequentialMemory(limit=10, window_length=2) processor = MultiInputProcessor(nb_inputs=2) for double_dqn in (True, False): agent = DQNAgent(model, memory=memory, nb_actions=2, nb_steps_warmup=5, batch_size=4, processor=processor, enable_double_dqn=double_dqn) agent.compile('sgd') agent.fit(MultiInputTestEnv([(3,), (4,)]), nb_steps=10)
def train_dqn_model(layers, rounds=10000, run_test=False, use_score=False): ENV_NAME = 'malware-score-v0' if use_score else 'malware-v0' env = gym.make(ENV_NAME) env.seed(123) nb_actions = env.action_space.n window_length = 1 # "experience" consists of where we were, where we are now # generate a policy model model = generate_dense_model((window_length,) + env.observation_space.shape, layers, nb_actions) # configure and compile our agent # BoltzmannQPolicy selects an action stochastically with a probability generated by soft-maxing Q values policy = BoltzmannQPolicy() # memory can help a model during training # for this, we only consider a single malware sample (window_length=1) for each "experience" memory = SequentialMemory(limit=32, ignore_episode_boundaries=False, window_length=window_length) # DQN agent as described in Mnih (2013) and Mnih (2015). # http://arxiv.org/pdf/1312.5602.pdf # http://arxiv.org/abs/1509.06461 agent = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=16, enable_double_dqn=True, enable_dueling_network=True, dueling_type='avg', target_model_update=1e-2, policy=policy, batch_size=16) # keras-rl allows one to use any built-in keras optimizer agent.compile(RMSprop(lr=1e-3), metrics=['mae']) # play the game. learn something! agent.fit(env, nb_steps=rounds, visualize=False, verbose=2) history_train = env.history history_test = None if run_test: # Set up the testing environment TEST_NAME = 'malware-score-test-v0' if use_score else 'malware-test-v0' test_env = gym.make(TEST_NAME) # evaluate the agent on a few episodes, drawing randomly from the test samples agent.test(test_env, nb_episodes=100, visualize=False) history_test = test_env.history return agent, model, history_train, history_test
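# A hedged example of calling the trainer above; the layer widths are only a guess at the list
# format that generate_dense_model expects (hidden-layer sizes), not values from the original project:
agent, model, history_train, history_test = train_dqn_model(
    layers=[1024, 256], rounds=10000, run_test=True, use_score=False)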
ENV_NAME = 'CartPole-v0' # Get the environment and extract the number of actions available in the Cartpole problem env = gym.make(ENV_NAME) np.random.seed(123) env.seed(123) nb_actions = env.action_space.n print(env.observation_space.shape) model = Sequential() model.add(Flatten(input_shape=(1, ) + env.observation_space.shape)) model.add(Dense(16)) model.add(Activation('relu')) model.add(Dense(nb_actions)) model.add(Activation('linear')) print(model.summary()) policy = EpsGreedyQPolicy() memory = SequentialMemory(limit=50000, window_length=1) dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, target_model_update=1e-2, policy=policy) dqn.compile(Adam(lr=1e-3), metrics=['mae']) # Okay, now it's time to learn something! We visualize the training here for show, but this slows down training quite a lot. dqn.fit(env, nb_steps=5000, visualize=True) dqn.test(env, nb_episodes=5, visualize=True)
# Rather than learning from each step in order, experiences are first stored in memory and then sampled at random for training # Honestly, I don't fully understand this part memory = SequentialMemory(limit=40000, window_length=1) # The behavior policy is the standard epsilon-greedy. policy = EpsGreedyQPolicy(eps=0.1) # warmup = literally a warm-up: fill the memory for a while instead of training right away # update = update rate; smaller means slower training, larger makes overfitting more likely dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=100, target_model_update=1e-2, policy=policy) dqn.compile(Adam(lr=0.001)) # nb_steps = how many steps to train; it is also fine to set this very large and stop with Ctrl+C after a night # nb_max_episode_steps = maximum number of steps per episode history = dqn.fit(env, nb_steps=400000, visualize=False, verbose=2, nb_max_episode_steps=1440) # Save the model and the weights now = datetime.now().strftime("%Y%m%d%H%M%S") dqn.save_weights('weight_' + str(now) + '.h5') model_json = model.to_json() with open('model_' + str(now) + '.json', "w") as json_file: json_file.write(model_json)
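# A minimal sketch of how the artifacts written above could be restored later for evaluation,
# assuming the same timestamped filenames ('model_<now>.json' / 'weight_<now>.h5'):
from keras.models import model_from_json

with open('model_' + str(now) + '.json') as json_file:
    restored_model = model_from_json(json_file.read())
restored_dqn = DQNAgent(model=restored_model, nb_actions=nb_actions, memory=memory,
                        nb_steps_warmup=100, target_model_update=1e-2, policy=policy)
restored_dqn.compile(Adam(lr=0.001))
restored_dqn.load_weights('weight_' + str(now) + '.h5')
restored_dqn.test(env, nb_episodes=5, visualize=False)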
# agent dqn = DQNAgent(model=expert_model, nb_actions=nb_actions, policy=policy, memory=memory, processor=processor, enable_double_dqn=True, enable_dueling_network=True, gamma=.99, target_model_update=10000, train_interval=1, delta_clip=1., nb_steps_warmup=50000) lr = .00025 dqn.compile(Adam(lr), metrics=['mae']) weights_filename = model_saves + filename_append + "_" + datestr + "_" + 'expert_' + environment_name + '_weights.h5f' checkpoint_weights_filename = model_saves + filename_append + "_" + datestr + "_" + 'expert_' + environment_name + '_weights{step}.h5f' log_filename = model_saves + filename_append + "_" + datestr + "_" + 'expert_' + environment_name + '_REWARD_DATA.txt' callbacks = [ TrainEpisodeLogger(log_filename), ModelIntervalCheckpoint(checkpoint_weights_filename, interval=1000000) ] if args.mode == 'train': dqn.fit(env, callbacks=callbacks, nb_steps=4250000, verbose=0, nb_max_episode_steps=1500) dqn.save_weights(weights_filename, overwrite=True)
def main(): env = PikaEnv() nb_actions = env.action_space.n model = Sequential() model.add(Flatten(input_shape=(4, ) + env.observation_space.shape)) model.add(Dense(512)) model.add(Activation("relu")) model.add(Dense(512)) model.add(Activation("relu")) model.add(Dense(512)) model.add(Activation("relu")) model.add(Dense(nb_actions)) model.add(Activation("linear")) print(model.summary()) memory = SequentialMemory(limit=1_000_000, window_length=4) policy = LinearAnnealedPolicy( EpsGreedyQPolicy(), attr="eps", value_max=1.0, value_min=0.05, value_test=0.05, nb_steps=nb_steps // 4, ) dqn = DQNAgent( model=model, nb_actions=nb_actions, policy=policy, memory=memory, enable_dueling_network=True, enable_double_dqn=True, ) dqn.compile(Adam(lr=0.00025), metrics=["mae"]) # dqn.load_weights(log_dir + "load.h5f") weights_filename = log_dir + "dqn_weights.h5f" checkpoint_weights_filename = log_dir + "dqn_weights_{step}.h5f" log_filename = log_dir + "dqn_log.json" callbacks = [ ModelIntervalCheckpoint(checkpoint_weights_filename, interval=10000) ] callbacks += [FileLogger(log_filename, interval=100)] tbCallBack = TensorBoard( log_dir=tb_dir, histogram_freq=0, write_graph=True, write_grads=True, write_images=True, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None, ) callbacks += [tbCallBack] dqn.fit( env, callbacks=callbacks, nb_steps=nb_steps, log_interval=10, visualize=True, verbose=2, ) # After training is done, we save the final weights one more time. dqn.save_weights(weights_filename, overwrite=True)
conc = concatenate([model_phase_encoded, model_vehicle_encoded]) hidden = Dense(128)(conc) hidden = LeakyReLU()(hidden) hidden = Dense(64)(hidden) hidden = LeakyReLU()(hidden) output = Dense(nb_actions, activation='linear')(hidden) model = Model(inputs=[model_phase_input, model_vehicle_input], outputs=output) model_path = "dqn_model.h5" try: model.load_weights(model_path) print(f"Successfully loaded previous weights from {model_path}") except BaseException as e: print(f"Did not load previous weights due to {e}, {model_path}") ### Policy, Memory & Agent set-up. policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.01, value_test=.01, nb_steps=100000) memory = SequentialMemory(limit=50000, window_length=1) dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, policy=policy, batch_size=64, gamma=.95, nb_steps_warmup=2000, target_model_update=.001) dqn.processor = MultiInputProcessor(2) dqn.compile(optimizer=Adam(lr=.001)) ### Fit. hist = dqn.fit(env, nb_steps=200, verbose=1, log_interval=10) dqn.save_weights(model_path, overwrite=True) print("Saved model to disk") test_env = CityFlowAgent(mode='predict', config_path=config_path) start_time = default_timer() dqn.test(test_env, nb_episodes=1, visualize=False) print(f"\n Done testing in {default_timer()-start_time} seconds")
· Better than greedy because it does not treat the options considered non-optimal all in the same way · In this way sub-optimal actions are ignored. Different learning rates were tested and the best one was chosen """ memory = SequentialMemory(limit=1000000, window_length=1) policy = BoltzmannQPolicy() dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, target_model_update=1e-2, policy=policy) dqn.compile(Adam(lr=0.0015), metrics=['mae']) """ Train for 150000 steps """ a = dqn.fit(env, nb_steps=150000, visualize=False, verbose=2) """ Load weights; careful, this discards the training you just ran """ weights_filename = 'dqn64_LunarLander-v2_weights.h5f' dqn.load_weights(weights_filename) """ Test for 20 episodes """ dqn.test(env, nb_episodes=20, visualize=False)
def training_game(): env = Environment( map_name="ForceField", visualize=True, game_steps_per_episode=150, agent_interface_format=features.AgentInterfaceFormat( feature_dimensions=features.Dimensions(screen=64, minimap=32))) input_shape = (_SIZE, _SIZE, 1) nb_actions = 12 # Number of actions model = neural_network_model(input_shape, nb_actions) memory = SequentialMemory(limit=5000, window_length=_WINDOW_LENGTH) processor = SC2Proc() # Policy policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr="eps", value_max=1, value_min=0.2, value_test=.0, nb_steps=1e2) # Agent dqn = DQNAgent( model=model, nb_actions=nb_actions, memory=memory, enable_double_dqn=True, enable_dueling_network=True, # 2019-07-12 GU Zhan (Sam) when value shape problem, reduce nb_steps_warmup: # nb_steps_warmup=300, target_model_update=1e-2, policy=policy, nb_steps_warmup=500, target_model_update=1e-2, policy=policy, batch_size=150, processor=processor, delta_clip=1) dqn.compile(Adam(lr=.001), metrics=["mae", "acc"]) # Tensorboard callback timestamp = f"{datetime.datetime.now():%Y-%m-%d %I:%M%p}" # 2019-07-12 GU Zhan (Sam) folder name for Lunux: # callbacks = keras.callbacks.TensorBoard(log_dir='./Graph/'+ timestamp, histogram_freq=0, # write_graph=True, write_images=False) # 2019-07-12 GU Zhan (Sam) folder name for Windows: callbacks = keras.callbacks.TensorBoard(log_dir='.\Graph\issgz', histogram_freq=0, write_graph=True, write_images=False) # Save the parameters and upload them when needed name = "agent" w_file = "dqn_{}_weights.h5f".format(name) check_w_file = "train_w" + name + "_weights.h5f" if SAVE_MODEL: check_w_file = "train_w" + name + "_weights_{step}.h5f" log_file = "training_w_{}_log.json".format(name) if LOAD_MODEL: dqn.load_weights(w_file) class Saver(Callback): def on_episode_end(self, episode, logs={}): if episode % 200 == 0: self.model.save_weights(w_file, overwrite=True) s = Saver() logs = FileLogger('DQN_Agent_log.csv', interval=1) dqn.fit(env, callbacks=[callbacks, s, logs], nb_steps=600, action_repetition=2, log_interval=1e4, verbose=2) dqn.save_weights(w_file, overwrite=True) dqn.test(env, action_repetition=2, nb_episodes=30, visualize=False)
model = keras.layers.Flatten()(model) model = keras.layers.Dense(512, activation='relu')(model) model = keras.layers.Dense(4, activation='linear')(model) model = keras.Model(inputs=input, outputs=model) model.summary() print(model.output) model.output._keras_shape = (None, 4) print(model.output._keras_shape) game = gym.make('Breakout-v0') agent = DQNAgent(model, policy, nb_actions=game.action_space.n, nb_steps_warmup=50000, memory=memory, processor=AtariProcessor(), train_interval=4, delta_clip=1.) agent.compile(keras.optimizers.Adam(lr=.00025), metrics=['mae']) callbacks = [rl.callbacks.ModelIntervalCheckpoint('ckpt.h5f', interval=250000)] callbacks += [FileLogger('log.json', interval=100)] if False: agent.load_weights('weights.h5f') agent.fit(game, nb_steps=1750000, visualize=False, log_interval=10000, callbacks=callbacks) agent.save_weights('weights.h5f', overwrite=True) game.reset() agent.test(game, nb_episodes=10, visualize=True)
value_test=MIN_EPSILON, nb_steps=int(TRAINING_STEPS*EPSILON_DECAY_PERIOD)) dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=WARMUP_STEPS, processor=processor, target_model_update=TARGET_MODEL_UPDATE, policy=policy, train_interval=WINDOW_LENGTH, enable_dueling_network=DUELING, delta_clip=DELTA_CLIP, ) dqn.compile(Adam(lr=LEARNING_RATE), metrics=['mae']) #dqn.load_weights('checkpoint-1.14-8000-.h5f') # Okay, now it's time to learn something! dqn.fit(env, nb_steps=TRAINING_STEPS, visualize=True, verbose=2, callbacks=[cp, tb_callback]) # After training is done, we save the final weights. #dqn.save_weights('dqn_{}_weights.h5f'.format('mario'), overwrite=True) # Finally, evaluate our algorithm for 5 episodes. dqn.test(env, nb_episodes=5, visualize=True, action_repetition=ACTION_REPETITION)
enable_dueling_network=False, batch_size=batch_size, train_interval=4, delta_clip=1.) import tensorflow as tf def get_optimizer(): if use_rnn: return Adam(1e-4) else: return Adam(1e-4) dqn.compile(get_optimizer(), metrics=['mae']) ''' if args.transfer_encoding: print("Transferring weights") if not args.weights: raise ValueError("If --transfer_encoding is used, weight file must be provided") model_file = args.model from keras.models import load_model # TODO: correct? old_model = load_model(weight_file) old_conv_layers = filter(lambda l: l.__class__.__name__ == 'Conv2D',old_model.layers) new_conv_layers = filter(lambda l:l.__class__.__name__ == 'Conv2D',model.layers) for old_l,new_l in zip(old_conv_layers,new_conv_layers): new_l.set_weights(old_l.get_weights()) new_l.trainable = False # freeze the new layer '''
# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and # even the metrics! # train_policy = BoltzmannQPolicy() train_policy = EpsGreedyQPolicy(eps=1.0) test_policy = GreedyQPolicy() # Compile the agent based on method specified. We use .upper() to convert to # upper case for comparison if METHOD.upper() == 'DUEL_DQN': memory = SequentialMemory(limit=NUM_STEPS, window_length=1) agent = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=100, enable_dueling_network=True, dueling_type='avg', target_model_update=1e-2, policy=train_policy, test_policy=test_policy) agent.compile(Adam(lr=1e-3, clipnorm=1.0), metrics=['mae']) elif METHOD.upper() == 'DQN': memory = SequentialMemory(limit=NUM_STEPS, window_length=1) agent = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=100, target_model_update=1e-2, policy=train_policy, test_policy=test_policy) agent.compile(Adam(lr=1e-3, clipnorm=1.0), metrics=['mae']) elif METHOD.upper() == 'SARSA': # SARSA does not require a memory. agent = SarsaAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=train_policy) agent.compile(Adam(lr=1e-3, clipnorm=1.0), metrics=['mae']) elif METHOD.upper() == 'CEM': memory = EpisodeParameterMemory(limit=1000, window_length=1) agent = CEMAgent(model=model, nb_actions=nb_actions, memory=memory,
def main(): """ Parses command line arguments, sets training environment parameters, creates deep Q-network and trains it on gym environment. """ parser = argparse.ArgumentParser( description="Simulation of drivers' behavior") parser.add_argument( '-f', '--fleet', help= 'Fleet sizes to simulate, formatted as comma-separated list (i.e. "-f 250,275,300")' ) parser.add_argument( '-m', '--multiplier', help= 'Surge multiplier, formatted as comma-separated list (i.e. "-m 1,1.5,2")' ) parser.add_argument('-b', '--bonus', type=int, help='Bonus') parser.add_argument('-d', '--demand', help='Percent false demand ') parser.add_argument( '-k', '--know', help= 'Percent knowing fare, formatted as comma-separated list (i.e. "-m 1,1.5,2") ' ) parser.add_argument( '-p', '--pro', help= 'Percent pro drivers, formatted as comma-separated list (i.e. "-p 1,1.5,2") ' ) parser.add_argument( '-av', '--av', help= 'Percent AV drivers, formatted as comma-separated list (i.e. "-av 1,1.5,2") ' ) parser.add_argument('-nb', '--nb', help='number of steps to train Rl ') args = parser.parse_args() if args.fleet: fleet_sizes = [int(x) for x in args.fleet.split(',')] # fleet_sizes = args.fleet else: fleet_sizes = FLEET_SIZE if args.multiplier: # surge = args.multiplier surges = [float(x) for x in args.multiplier.split(',')] else: surges = [SURGE_MULTIPLIER] if args.know: # surge = args.multiplier perc_know = [float(x) for x in args.know.split(',')] else: perc_know = [PERCE_KNOW] if args.bonus: bonus = args.bonus else: bonus = BONUS if args.pro: pro_share = [float(x) for x in args.pro.split(',')] else: pro_share = [PRO_SHARE] if args.demand: percent_false_demand = float(args.demand) else: percent_false_demand = PERCENT_FALSE_DEMAND if args.av: av_share = [float(x) for x in args.av.split(',')] else: av_share = [1] if args.nb: nb_steps = args.nb else: nb_steps = 300 for fleet_size in fleet_sizes: for surge in surges: for perc_k in perc_know: for pro_s in pro_share: m = Model(ZONE_IDS, DEMAND_SOURCE, WARMUP_TIME_HOUR, ANALYSIS_TIME_HOUR, fleet_size=fleet_size, pro_share=pro_s, surge_multiplier=surge, bonus=bonus, percent_false_demand=percent_false_demand, percentage_know_fare=perc_k) # make one veh to be AV veh = m.vehicles[-1] veh.is_AV = True # env = RebalancingEnv(m, penalty=-0) nb_actions = env.action_space.n input_shape = (1, ) + env.state.shape input_dim = env.input_dim model = Sequential() model.add(Flatten(input_shape=input_shape)) model.add(Dense(256, activation='relu')) model.add(Dense(nb_actions, activation='linear')) memory = SequentialMemory(limit=2000, window_length=1) policy = EpsGreedyQPolicy() dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=100, target_model_update=1e-2, policy=policy, gamma=.99) dqn.compile(Adam(lr=0.001, epsilon=0.05, decay=0.0), metrics=['mae']) dqn.fit(env, nb_steps=nb_steps, action_repetition=1, visualize=False, verbose=2) dqn.save_weights('new_dqn_weights_%s.h5f' % (nb_steps), overwrite=True)
# veh.is_AV = True # # env = RebalancingEnv(m, penalty=-10, config=config ) env = RebalancingEnv(penalty=-10, config=config ) nb_actions = env.action_space.n input_shape = (1,) + env.state.shape input_dim = env.input_dim model = Sequential() model.add(Flatten(input_shape=input_shape)) model.add(Dense(256, activation='relu')) model.add(Dense(nb_actions, activation='linear')) memory = SequentialMemory(limit=2000, window_length=1) policy = EpsGreedyQPolicy() dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=100, target_model_update=1e-2, policy=policy, gamma=0.99) dqn.compile(Adam(lr=0.001, epsilon=0.05, decay=0.0), metrics=['mae']) # history = dqn.fit(env, nb_steps=100, action_repetition=1, visualize=False, verbose=2) # dqn.save_weights('dqn_weights_%s.h5f' % (100), overwrite=True) dqn.load_weights('dqn_weights_%s.h5f' % (3000)) # for perc_av in percent_av: perc_av = 1 print('Fleet size is {f}'.format(f=fleet_size))
class Agent(object): name = 'DQN' def __init__(self, number_of_training_steps=1e5, gamma=0.999, load_weights=False, visualize=False, dueling_network=True, double_dqn=True, nn_type='mlp', **kwargs): """ Agent constructor :param window_size: int, number of lags to include in observation :param max_position: int, maximum number of positions able to be held in inventory :param fitting_file: str, file used for z-score fitting :param testing_file: str,file used for dqn experiment :param env: environment name :param seed: int, random seed number :param action_repeats: int, number of steps to take in environment between actions :param number_of_training_steps: int, number of steps to train agent for :param gamma: float, value between 0 and 1 used to discount future DQN returns :param format_3d: boolean, format observation as matrix or tensor :param train: boolean, train or test agent :param load_weights: boolean, import existing weights :param z_score: boolean, standardize observation space :param visualize: boolean, visualize environment :param dueling_network: boolean, use dueling network architecture :param double_dqn: boolean, use double DQN for Q-value approximation """ # Agent arguments # self.env_name = id self.neural_network_type = nn_type self.load_weights = load_weights self.number_of_training_steps = number_of_training_steps self.visualize = visualize # Create environment self.env = gym.make(**kwargs) self.env_name = self.env.env.id # Create agent # NOTE: 'Keras-RL' uses its own frame-stacker self.memory_frame_stack = 1 # Number of frames to stack e.g., 1. self.model = self.create_model(name=self.neural_network_type) self.memory = SequentialMemory(limit=10000, window_length=self.memory_frame_stack) self.train = self.env.env.training self.cwd = os.path.dirname(os.path.realpath(__file__)) # create the agent self.agent = DQNAgent(model=self.model, nb_actions=self.env.action_space.n, memory=self.memory, processor=None, nb_steps_warmup=500, enable_dueling_network=dueling_network, dueling_type='avg', enable_double_dqn=double_dqn, gamma=gamma, target_model_update=1000, delta_clip=1.0) self.agent.compile(Adam(lr=float("3e-4")), metrics=['mae']) def __str__(self): # msg = '\n' # return msg.join(['{}={}'.format(k, v) for k, v in self.__dict__.items()]) return 'Agent = {} | env = {} | number_of_training_steps = {}'.format( Agent.name, self.env_name, self.number_of_training_steps) def create_model(self, name: str = 'cnn') -> Sequential: """ Helper function get create and get the default MLP or CNN model. :param name: Neural network type ['mlp' or 'cnn'] :return: neural network """ LOGGER.info("creating model for {}".format(name)) if name == 'cnn': return self._create_cnn_model() elif name == 'mlp': return self._create_mlp_model() def _create_cnn_model(self) -> Sequential: """ Create a Convolutional neural network with dense layer at the end. 
:return: keras model """ features_shape = (self.memory_frame_stack, *self.env.observation_space.shape) model = Sequential() conv = Conv2D model.add( conv(input_shape=features_shape, filters=5, kernel_size=[10, 1], padding='same', activation='relu', strides=[5, 1], data_format='channels_first')) model.add( conv(filters=5, kernel_size=[5, 1], padding='same', activation='relu', strides=[2, 1], data_format='channels_first')) model.add( conv(filters=5, kernel_size=[4, 1], padding='same', activation='relu', strides=[2, 1], data_format='channels_first')) model.add(Flatten()) model.add(Dense(256, activation='relu')) model.add(Dense(self.env.action_space.n, activation='softmax')) LOGGER.info(model.summary()) return model def _create_mlp_model(self) -> Sequential: """ Create a DENSE neural network with dense layer at the end :return: keras model """ features_shape = (self.memory_frame_stack, *self.env.observation_space.shape) model = Sequential() model.add( Dense(units=256, input_shape=features_shape, activation='relu')) model.add(Dense(units=256, activation='relu')) model.add(Flatten()) model.add(Dense(self.env.action_space.n, activation='softmax')) LOGGER.info(model.summary()) return model def start(self) -> None: """ Entry point for agent training and testing :return: (void) """ output_directory = os.path.join(self.cwd, 'dqn_weights') if not os.path.exists(output_directory): LOGGER.info('{} does not exist. Creating Directory.'.format( output_directory)) os.mkdir(output_directory) weight_name = 'dqn_{}_{}_weights.h5f'.format(self.env_name, self.neural_network_type) weights_filename = os.path.join(output_directory, weight_name) LOGGER.info("weights_filename: {}".format(weights_filename)) if self.load_weights: LOGGER.info('...loading weights for {} from\n{}'.format( self.env_name, weights_filename)) self.agent.load_weights(weights_filename) if self.train: step_chkpt = '{step}.h5f' step_chkpt = 'dqn_{}_weights_{}'.format(self.env_name, step_chkpt) checkpoint_weights_filename = os.path.join(self.cwd, 'dqn_weights', step_chkpt) LOGGER.info("checkpoint_weights_filename: {}".format( checkpoint_weights_filename)) log_filename = os.path.join( self.cwd, 'dqn_weights', 'dqn_{}_log.json'.format(self.env_name)) LOGGER.info('log_filename: {}'.format(log_filename)) callbacks = [ ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000) ] callbacks += [FileLogger(log_filename, interval=100)] LOGGER.info('Starting training...') self.agent.fit(self.env, callbacks=callbacks, nb_steps=self.number_of_training_steps, log_interval=10000, verbose=0, visualize=self.visualize) LOGGER.info("training over.") LOGGER.info('Saving AGENT weights...') self.agent.save_weights(weights_filename, overwrite=True) LOGGER.info("AGENT weights saved.") else: LOGGER.info('Starting TEST...') self.agent.test(self.env, nb_episodes=2, visualize=self.visualize)
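# A hypothetical way to drive the Agent class above; the gym id passed through **kwargs is a
# placeholder, since the real environment ids and their arguments live elsewhere in the project:
agent = Agent(number_of_training_steps=100000, gamma=0.999, load_weights=False,
              visualize=False, dueling_network=True, double_dqn=True,
              nn_type='mlp', id='market-maker-v0')
print(agent)      # "Agent = DQN | env = ... | number_of_training_steps = 100000"
agent.start()     # trains or tests depending on the environment's training flag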
def main(shape=10, winsize=4, test=False, num_max_test=200): INPUT_SHAPE = (shape, shape) WINDOW_LENGTH = winsize class SnakeProcessor(Processor): def process_observation(self, observation): # assert observation.ndim == 1, str(observation.shape) # (height, width, channel) assert observation.shape == INPUT_SHAPE return observation.astype( 'uint8') # saves storage in experience memory def process_state_batch(self, batch): # We could perform this processing step in `process_observation`. In this case, however, # we would need to store a `float32` array instead, which is 4x more memory intensive than # an `uint8` array. This matters if we store 1M observations. processed_batch = batch.astype('float32') / 255. return processed_batch def process_reward(self, reward): return reward env = gym.make('snakenv-v0') np.random.seed(123) env.seed(123) input_shape = (WINDOW_LENGTH, ) + INPUT_SHAPE model = make_model(input_shape, 5) memory = SequentialMemory(limit=100000, window_length=WINDOW_LENGTH) processor = SnakeProcessor() # policy = LinearAnnealedPolicy( # EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, # value_test=0, nb_steps=500000) policy = BoltzmannQPolicy() interval = 20000 dqn = DQNAgent(model=model, nb_actions=5, policy=policy, memory=memory, processor=processor, nb_steps_warmup=20000, gamma=.99, target_model_update=interval, train_interval=4, delta_clip=1.) dqn.compile(Adam(lr=0.0005), metrics=['mae']) weights_filename = 'dqn_snake_weights.h5f' if not test: # Okay, now it's time to learn something! We capture the interrupt exception so that training # can be prematurely aborted. Notice that now you can use the built-in Keras callbacks! weights_filename = 'dqn_{}_weights.h5f'.format('snake') checkpoint_weights_filename = 'dqn_' + 'snake' + '_weights_{step}.h5f' log_filename = 'dqn_{}_log.json'.format('snake') callbacks = [ ModelIntervalCheckpoint(checkpoint_weights_filename, interval=interval) ] callbacks += [ ModelIntervalCheckpoint(weights_filename, interval=interval) ] callbacks += [FileLogger(log_filename, interval=500)] dqn.fit(env, callbacks=callbacks, nb_steps=10000000, log_interval=10000, visualize=False) # After training is done, we save the final weights one more time. # dqn.save_weights(weights_filename, overwrite=True) # Finally, evaluate our algorithm for 10 episodes. # dqn.test(env, nb_episodes=10, visualize=True, nb_max_episode_steps=100) else: while True: try: dqn.load_weights(weights_filename) except Exception: print("weights not found, waiting") dqn.test(env, nb_episodes=3, visualize=True, nb_max_episode_steps=num_max_test) time.sleep(5)
from keras.layers import Dense, Activation, Flatten from keras.optimizers import Adam from rl.agents.dqn import DQNAgent from rl.policy import EpsGreedyQPolicy from rl.memory import SequentialMemory env = "CartPole-v0" env = gym.make(env) np.random.seed(123) env.seed(123) n_action_space = env.action_space.n model = Sequential() model.add(Flatten(input_shape=(1, ) + env.observation_space.shape)) model.add(Dense(16, )) model.add(Activation('relu')) model.add(Dense(n_action_space)) model.add(Activation('linear')) policy = EpsGreedyQPolicy() memory = SequentialMemory(limit=50000, window_length=1) dql = DQNAgent(model=model, memory=memory, nb_actions=n_action_space, nb_steps_warmup=10, target_model_update=0.01, policy=policy) dql.compile(Adam(lr=0.001), metrics=['mae']) dql.fit(env, nb_steps=1000, visualize=True, verbose=True) dql.test(env, nb_episodes=105, visualize=True)
model.add(Dense(128, activation='relu')) model.add(Dense(64, activation='relu')) model.add(Dropout(0.3)) model.add(Dense(128, activation='relu')) model.add(Dense(64, activation='relu')) model.add(Dense(32, activation='relu')) model.add(Dropout(0.3)) model.add(Dense(128, activation='relu')) model.add(Dense(64, activation='relu')) model.add(Dense(32, activation='relu')) model.add( Dense(nb_actions, activation='softmax', kernel_initializer=he_normal())) print(model.summary()) from rl.agents.dqn import DQNAgent from rl.policy import BoltzmannQPolicy from rl.memory import SequentialMemory from keras.optimizers import Adam memory = SequentialMemory(limit=3000, window_length=window_length) policy = BoltzmannQPolicy() agent = DQNAgent(model=model, nb_actions=nb_actions, memory=memory) #nb_steps_warmup=10, target_model_update=1e-2, policy=policy, enable_double_dqn=True) agent.compile(Adam()) # Keep the History object returned by fit history = agent.fit(env, nb_steps=50_000, visualize=False, verbose=1) agent.test(env, nb_episodes=80000, visualize=True)
summary_ops_v2.graph(K.get_graph(), step=0) writer.close() agent = DQNAgent( model=model, nb_actions=n_actions, policy=policy, memory=memory, nb_steps_warmup=args.warmup_steps, gamma=.99, target_model_update=args.target_model_update, train_interval=args.train_interval, delta_clip=1., enable_dueling_network=True) agent.compile(Adam(lr=args.learning_rate), metrics=['mae']) if args.load_weights_from is not None: print(f"Loading Weights From: {args.load_weights_from}") weights_filename = f'{args.load_weights_from}/' + 'dqn_{}_weights.h5f'.format(env_name) agent.load_weights(weights_filename) if args.mode == 'train': import os current_directory = os.getcwd() model_weight_dir = os.path.join(current_directory, MODEL_NAME) if not os.path.exists(model_weight_dir): os.makedirs(model_weight_dir) weights_filename = f'{MODEL_NAME}/dqn_{env_name}_weights.h5f' checkpoint_weights_filename = f'{MODEL_NAME}/dqn_' + env_name + '_weights_{step}.h5f'
class DeepQTrading: #Class constructor #model: Keras model considered #explorations_iterations: a vector containing (i) probability of random predictions; (ii) how many iterations will be #run by the algorithm (we run the algorithm several times-several iterations) #outputFile: name of the file to print metrics of the training #ensembleFolderName: name of the file to print predictions #optimizer: optimizer to run def __init__(self, model, nbActions, explorations_iterations, outputFile, ensembleFolderName, optimizer="adamax"): self.ensembleFolderName = ensembleFolderName self.policy = EpsGreedyQPolicy() self.explorations_iterations = explorations_iterations self.nbActions = nbActions self.model = model #Define the memory self.memory = SequentialMemory(limit=10000, window_length=1) #Instantiate the agent with parameters received self.agent = DQNAgent(model=self.model, policy=self.policy, nb_actions=self.nbActions, memory=self.memory, nb_steps_warmup=200, target_model_update=1e-1, enable_double_dqn=True, enable_dueling_network=True) #Compile the agent with the optimizer given as parameter if optimizer == "adamax": self.agent.compile(Adamax(), metrics=['mae']) if optimizer == "adadelta": self.agent.compile(Adadelta(), metrics=['mae']) if optimizer == "sgd": self.agent.compile(SGD(), metrics=['mae']) if optimizer == "rmsprop": self.agent.compile(RMSprop(), metrics=['mae']) if optimizer == "nadam": self.agent.compile(Nadam(), metrics=['mae']) if optimizer == "adagrad": self.agent.compile(Adagrad(), metrics=['mae']) if optimizer == "adam": self.agent.compile(Adam(), metrics=['mae']) if optimizer == "radam": self.agent.compile(RAdam(total_steps=5000, warmup_proportion=0.1, min_lr=1e-5), metrics=['mae']) #Save the weights of the agents in the q.weights file #Save random weights self.agent.save_weights("q.weights", overwrite=True) #Load data self.train_data = pd.read_csv('./dataset/jpm/train_data.csv') self.validation_data = pd.read_csv('./dataset/jpm/train_data.csv') self.test_data = pd.read_csv('./dataset/jpm/test_data.csv') #Call the callback for training, validation and test in order to show results for each iteration self.trainer = ValidationCallback() self.validator = ValidationCallback() self.tester = ValidationCallback() self.outputFileName = outputFile def run(self): #Initiates the environments, trainEnv = validEnv = testEnv = " " if not os.path.exists(self.outputFileName): os.makedirs(self.outputFileName) file_name = self.outputFileName + "/results-agent-training.csv" self.outputFile = open(file_name, "w+") #write the first row of the csv self.outputFile.write("Iteration," + "trainAccuracy," + "trainCoverage," + "trainReward," + "trainLong%," + "trainShort%," + "trainLongAcc," + "trainShortAcc," + "trainLongPrec," + "trainShortPrec," + "validationAccuracy," + "validationCoverage," + "validationReward," + "validationLong%," + "validationShort%," + "validationLongAcc," + "validationShortAcc," + "validLongPrec," + "validShortPrec," + "testAccuracy," + "testCoverage," + "testReward," + "testLong%," + "testShort%," + "testLongAcc," + "testShortAcc," + "testLongPrec," + "testShortPrec\n") #Prepare the training and validation files for saving them later ensambleValid = pd.DataFrame( index=self.validation_data[:].ix[:, 'date_time'].drop_duplicates( ).tolist()) ensambleTest = pd.DataFrame( index=self.test_data[:].ix[:, 'date_time'].drop_duplicates().tolist()) #Put the name of the index for validation and testing ensambleValid.index.name = 'date_time' ensambleTest.index.name = 'date_time' 
#Explorations are epochs considered, or how many times the agent will play the game. for eps in self.explorations_iterations: #policy will use eps[0] (explorations), so the randomness of predictions (actions) will happen with eps[0] of probability self.policy.eps = eps[0] #there will be 25 iterations or eps[1] in explorations_iterations) for i in range(0, eps[1]): del (trainEnv) #Define the training, validation and testing environments with their respective callbacks trainEnv = SpEnv(data=self.train_data, callback=self.trainer) del (validEnv) validEnv = SpEnv(data=self.validation_data, ensamble=ensambleValid, callback=self.validator, columnName="iteration" + str(i)) del (testEnv) testEnv = SpEnv(data=self.test_data, callback=self.tester, ensamble=ensambleTest, columnName="iteration" + str(i)) #Reset the callback self.trainer.reset() self.validator.reset() self.tester.reset() #Reset the training environment trainEnv.resetEnv() #Train the agent #The agent receives as input one environment self.agent.fit(trainEnv, nb_steps=len(self.train_data), visualize=False, verbose=0) #Get the info from the train callback (_, trainCoverage, trainAccuracy, trainReward, trainLongPerc, trainShortPerc, trainLongAcc, trainShortAcc, trainLongPrec, trainShortPrec) = self.trainer.getInfo() print("Iteration " + str(i + 1) + " TRAIN: accuracy: " + str(trainAccuracy) + " coverage: " + str(trainCoverage) + " reward: " + str(trainReward)) #Reset the validation environment validEnv.resetEnv() #Test the agent on validation data self.agent.test(validEnv, nb_episodes=len(self.validation_data), visualize=False, verbose=0) #Get the info from the validation callback (_, validCoverage, validAccuracy, validReward, validLongPerc, validShortPerc, validLongAcc, validShortAcc, validLongPrec, validShortPrec) = self.validator.getInfo() #Print callback values on the screen print("Iteration " + str(i + 1) + " VALIDATION: accuracy: " + str(validAccuracy) + " coverage: " + str(validCoverage) + " reward: " + str(validReward)) #Reset the testing environment testEnv.resetEnv() #Test the agent on testing data self.agent.test(testEnv, nb_episodes=len(self.test_data), visualize=False, verbose=0) #Get the info from the testing callback (_, testCoverage, testAccuracy, testReward, testLongPerc, testShortPerc, testLongAcc, testShortAcc, testLongPrec, testShortPrec) = self.tester.getInfo() #Print callback values on the screen print("Iteration " + str(i + 1) + " TEST: acc: " + str(testAccuracy) + " cov: " + str(testCoverage) + " rew: " + str(testReward)) print(" ") #write the metrics in a text file self.outputFile.write( str(i) + "," + str(trainAccuracy) + "," + str(trainCoverage) + "," + str(trainReward) + "," + str(trainLongPerc) + "," + str(trainShortPerc) + "," + str(trainLongAcc) + "," + str(trainShortAcc) + "," + str(trainLongPrec) + "," + str(trainShortPrec) + "," + str(validAccuracy) + "," + str(validCoverage) + "," + str(validReward) + "," + str(validLongPerc) + "," + str(validShortPerc) + "," + str(validLongAcc) + "," + str(validShortAcc) + "," + str(validLongPrec) + "," + str(validShortPrec) + "," + str(testAccuracy) + "," + str(testCoverage) + "," + str(testReward) + "," + str(testLongPerc) + "," + str(testShortPerc) + "," + str(testLongAcc) + "," + str(testShortAcc) + "," + str(testLongPrec) + "," + str(testShortPrec) + "\n") #Close the file self.outputFile.close() if not os.path.exists("./Output/ensemble/" + self.ensembleFolderName): os.makedirs("./Output/ensemble/" + self.ensembleFolderName) ensambleValid.to_csv("./Output/ensemble/" 
+ self.ensembleFolderName + "/ensemble_valid.csv") ensambleTest.to_csv("./Output/ensemble/" + self.ensembleFolderName + "/ensemble_test.csv") #Function to end the Agent def end(self): print("FINISHED")
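# A hedged usage sketch for DeepQTrading as defined above; the network architecture, the input
# width, and the (epsilon, iterations) pairs are illustrative assumptions, not project values:
model = Sequential()
model.add(Flatten(input_shape=(1, 68)))    # assumed observation width
model.add(Dense(35, activation='relu'))
model.add(Dense(3, activation='linear'))   # one output per action (e.g. hold / long / short)

trader = DeepQTrading(model=model, nbActions=3,
                      explorations_iterations=[(0.2, 25)],
                      outputFile='./Output/csv/demo',
                      ensembleFolderName='demo',
                      optimizer='adam')
trader.run()
trader.end()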
def training_game(): env = Environment() input_shape = (FLAGS.screen_size, FLAGS.screen_size, 1) nb_actions = 12 # Number of actions model = neural_network_model(input_shape, nb_actions) memory = SequentialMemory(limit=3500, window_length=_WINDOW_LENGTH) processor = SC2Proc() # Policy policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr="eps", value_max=1, value_min=0.7, value_test=.0, nb_steps=GLOBAL_STEPS) # Agent dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, enable_double_dqn=False, nb_steps_warmup=GLOBAL_STEPS_WARMUP, target_model_update=1e-2, policy=policy, batch_size=150, processor=processor) dqn.compile(Adam(lr=.001), metrics=["mae"]) # Tensorboard callback callbacks = keras.callbacks.TensorBoard(log_dir='./Graph', histogram_freq=0, write_graph=True, write_images=False) # Save the parameters and upload them when needed name = FLAGS.mini_game w_file = "dqn_{}_weights.h5f".format(name) check_w_file = "train_w" + name + "_weights.h5f" if SAVE_MODEL: check_w_file = "train_w" + name + "_weights_{step}.h5f" log_file = "training_w_{}_log.json".format(name) if LOAD_MODEL: dqn.load_weights(w_file) #dqn.fit(env, callbacks=callbacks, nb_steps=GLOBAL_STEPS, action_repetition=2, log_interval=1e4, verbose=2) dqn.fit(env, nb_steps=GLOBAL_STEPS, action_repetition=2, log_interval=1000, verbose=2) dqn.save_weights(w_file, overwrite=True) dqn.test(env, action_repetition=2, nb_episodes=30, visualize=False)
model.add(layers.Flatten()) for i in range(3): model.add(layers.Dense(1024, swish)) model.add(layers.Dense(nb_actions)) print(model.summary()) env.render(mode='cv2') cv2.waitKey(3000) cv2.destroyAllWindows() dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=SequentialMemory(limit=50000, window_length=1), target_model_update=1e-2, policy=BoltzmannQPolicy()) dqn.compile(Adam(1e-3), metrics=['mse', 'mae', 'logcosh']) dqn.fit(env, nb_steps=10000000, visualize=False, verbose=2) dqn.save_weights(f'dqn_snake_weights.h5f', overwrite=True) env.draw() env.render(mode='cv2') cv2.waitKey(3000) cv2.destroyAllWindows() dqn.test(env, nb_episodes=5, visualize=True) cv2.waitKey(5000) cv2.destroyAllWindows()
merged = Dense(nb_neuron_input, activation='tanh')(merged) merged = Dense(nb_neuron_output, activation='softmax')(merged) model = Model(inputs=[inputs], outputs=[merged]) model.summary() model.compile(Adam(), loss='mean_squared_error') memory = SequentialMemory(limit=50000, window_length=1) policy = MaxBoltzmannQPolicy() dqn = DQNAgent(model=model, nb_actions=nb_neuron_output, memory=memory, nb_steps_warmup=10, target_model_update=1e-2, policy=policy) dqn.compile(Adam(lr=1e-3), metrics=['mae', 'accuracy']) metrics = Metrics(dqn, env) #fileName = '1D_advanced_Sequential1000_BoltzmannQ_10000steps(7)' #fileName = '1D_advanced_Sequential1000_EpsGreedyQ_10000steps(7)' #fileName = '1D_advanced_Sequential1000_MaxBoltzmannQ_10000steps(7)' #fileName = '1D_advanced_Sequential50000_BoltzmannQPolicy_10000steps(7)' #fileName = '1D_advanced_Sequential50000_MaxBoltzmannQ_1000000steps(0)' fileName = '1D__Sequential50000_BoltzmannQ_1000000steps(0)' dqn.load_weights('./output/' + fileName + '.h5f') dqn.test(env, nb_episodes=1, visualize=False, callbacks=[metrics]) metrics.export_figs(fileName) cumulated_reward = metrics.cumulated_reward()
# the agent initially explores the environment (high eps) and then gradually sticks to what it knows # (low eps). We also set a dedicated eps value that is used during testing. Note that we set it to 0.05 # so that the agent still performs some random actions. This ensures that the agent cannot get stuck. policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=.05, nb_steps=1000000) # The trade-off between exploration and exploitation is difficult and an on-going research topic. # If you want, you can experiment with the parameters or use a different policy. Another popular one # is Boltzmann-style exploration: # policy = BoltzmannQPolicy(tau=1.) # Feel free to give it a try! dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, window_length=WINDOW_LENGTH, memory=memory, processor=processor, nb_steps_warmup=50000, gamma=.99, delta_range=(-1., 1.), reward_range=(-1., 1.), target_model_update=10000, train_interval=4) dqn.compile(Adam(lr=.00025), metrics=['mae']) if args.mode == 'train': # Okay, now it's time to learn something! We capture the interrupt exception so that training # can be prematurely aborted. Notice that you can use the built-in Keras callbacks! weights_filename = 'dqn_{}_weights.h5f'.format(args.env_name) checkpoint_weights_filename = 'dqn_' + args.env_name + '_weights_{step}.h5f' log_filename = 'dqn_{}_log.json'.format(args.env_name) callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000)] callbacks += [FileLogger(log_filename, interval=100)] dqn.fit(env, callbacks=callbacks, nb_steps=1750000, log_interval=10000) # After training is done, we save the final weights one more time. dqn.save_weights(weights_filename, overwrite=True) # Finally, evaluate our algorithm for 10 episodes.
def train(): # Get the environment and extract the number of actions. env = gym.make(ENV_NAME) np.random.seed(123) env.seed(123) nb_actions = env.action_space.n config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) from keras import backend as K K.set_session(sess) # Next, we build a very simple model. model = Sequential() model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) model.add(Dense(16)) model.add(Activation('relu')) model.add(Dense(16)) model.add(Activation('relu')) model.add(Dense(16)) model.add(Activation('relu')) model.add(Dense(nb_actions)) model.add(Activation('linear')) model.summary() # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and # even the metrics! memory = SequentialMemory(limit=50000, window_length=1) policy = BoltzmannQPolicy() # Okay, now it's time to learn something! We visualize the training here for show, but this # slows down training quite a lot. You can always safely abort the training prematurely using # Ctrl + C. if REWARD == "normal": dqn_normal = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, target_model_update=1e-2, policy=policy) dqn_normal.compile(Adam(lr=1e-3), metrics=['mae']) history_normal = dqn_normal.fit(env, nb_steps=10000, visualize=False, verbose=2) dqn_normal.save_weights(os.path.join(LOG_DIR, 'dqn_normal_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) dqn_normal.test(env, nb_episodes=10, visualize=False, verbose=2) pandas.DataFrame(history_normal.history).to_csv(os.path.join(LOG_DIR, "normal.csv")) elif REWARD == "noisy": if not SMOOTH: processor_noisy = CartpoleProcessor(e_=ERR_N, e=ERR_P, surrogate=False) else: processor_noisy = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=True, surrogate=False) # processor_noisy = CartpoleSurrogateProcessor(e_=ERR_N, e=ERR_P, surrogate=False) dqn_noisy = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, target_model_update=1e-2, policy=policy, processor=processor_noisy) dqn_noisy.compile(Adam(lr=1e-3), metrics=['mae']) history_noisy = dqn_noisy.fit(env, nb_steps=10000, visualize=False, verbose=2) if not SMOOTH: dqn_noisy.save_weights(os.path.join(LOG_DIR, 'dqn_noisy_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy.csv")) else: dqn_noisy.save_weights(os.path.join(LOG_DIR, 'dqn_noisy_smooth_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy_smooth.csv")) dqn_noisy.test(env, nb_episodes=10, visualize=False, verbose=2) elif REWARD == "surrogate": if not SMOOTH: processor_surrogate = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=False, surrogate=True) else: processor_surrogate = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=True, surrogate=True) # processor_surrogate = CartpoleSurrogateProcessor(e_=ERR_N, e=ERR_P, surrogate=True) dqn_surrogate = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, target_model_update=1e-2, policy=policy, processor=processor_surrogate) dqn_surrogate.compile(Adam(lr=1e-3), metrics=['mae']) history_surrogate = dqn_surrogate.fit(env, nb_steps=10000, visualize=False, verbose=2) if not SMOOTH: dqn_surrogate.save_weights(os.path.join(LOG_DIR, 'dqn_surrogate_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) pandas.DataFrame(history_surrogate.history).to_csv(os.path.join(LOG_DIR, "surrogate.csv")) else: 
dqn_surrogate.save_weights(os.path.join(LOG_DIR, 'dqn_surrogate_smooth_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) pandas.DataFrame(history_surrogate.history).to_csv(os.path.join(LOG_DIR, "surrogate_smooth.csv")) dqn_surrogate.test(env, nb_episodes=10, visualize=False, verbose=2) else: raise NotImplementedError
class DDQN: def __init__( self, env, name, memory_limit=10000, nb_eps=10000, nb_warmup=100, dueling=True, double=True, ): # Set fixed seet for the environment self.env = env self.env.seed(123) np.random.seed(123) random.seed(123) self.name = name self.log_filename = "./logs/{}_log.json".format(self.name) self.weights_filename = "./results/{}_weights.h5f".format(self.name) self.result_filename = "./results/{}_result.csv".format(self.name) # Extract the number of actions form the environment nb_action = self.env.action_space.spaces[0].n nb_actions = nb_action ** len(self.env.action_space.spaces) nb_states = self.env.observation_space.shape # Next, we build a very simple model. model = self._build_nn(nb_states, nb_actions) # Next, we define the replay memorey memory = SequentialMemory(limit=memory_limit, window_length=1) policy = LinearAnnealedPolicy( EpsGreedyQPolicy(), attr="eps", nb_steps=nb_eps, value_max=1.0, # Start with full random value_min=0.1, # After nb_steps arrivate at 10% random value_test=0.0, # (Don't) pick random action when testing ) # Configure and compile our agent: # You can use every built-in Keras optimizer and even the metrics! self.dqn = DQNAgent( model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=nb_warmup, enable_dueling_network=dueling, # Enable dueling dueling_type="avg", enable_double_dqn=double, # Enable double dqn target_model_update=1e-2, policy=policy, ) self.dqn.compile(Adam(lr=1e-2), metrics=["mae"]) def _build_nn(self, nb_states, nb_actions): model = Sequential() model.add(Flatten(input_shape=(1,) + nb_states)) model.add(Dense(16)) model.add(Activation("relu")) model.add(Dense(16)) model.add(Activation("relu")) model.add(Dense(16)) model.add(Activation("relu")) model.add(Dense(nb_actions)) model.add(Activation("linear")) return model def run(self, steps): callbacks = [FileLogger(self.log_filename)] self.dqn.fit( self.env, callbacks=callbacks, nb_steps=steps, visualize=False, verbose=1, log_interval=10000, ) # After training is done, we save the final weights. self.dqn.save_weights(self.weights_filename, overwrite=True) def test(self): self.dqn.load_weights(self.weights_filename) self.dqn.test(self.env, nb_episodes=1, visualize=False) self.env.save_results(self.result_filename)
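# A hypothetical driver for the DDQN wrapper above; SomeTupleActionEnv is a placeholder for
# whatever gym environment (with a Tuple action space and a save_results method) the project uses:
env = SomeTupleActionEnv()   # assumed environment, not part of the original snippet
ddqn = DDQN(env, name="ddqn_demo", memory_limit=10000, nb_eps=10000, nb_warmup=100)
ddqn.run(steps=50000)        # train, then write ./results/ddqn_demo_weights.h5f
ddqn.test()                  # reload the weights, run one episode, save the results csv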
#!/usr/bin/env python3 from PIL import Image import gym from rl.agents.dqn import DQNAgent from rl.memory import SequentialMemory import tensorflow.keras as K INPUT_SHAPE = (84, 84) WINDOW_LENGTH = 4 build_model = __import__('train').build_model AtariProcessor = __import__('train').AtariProcessor if __name__ == '__main__': env = gym.make("Breakout-v0") env.reset() num_actions = env.action_space.n model = build_model(num_actions) # deep conv net memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH) processor = AtariProcessor() dqn = DQNAgent(model=model, nb_actions=num_actions, processor=processor, memory=memory) dqn.compile(K.optimizers.Adam(lr=.00025), metrics=['mae']) # load weights. dqn.load_weights('policy.h5') # evaluate algorithm for 10 episodes. dqn.test(env, nb_episodes=10, visualize=True)
logbook().record_hyperparameter('Memory Type', str(type(memory))) logbook().record_hyperparameter('Memory Limit', memory.limit) logbook().record_hyperparameter('Memory Window Length', memory.window_length) logbook().record_hyperparameter('nb_steps_warmup', dqn.nb_steps_warmup) #info on this parameter here: https://datascience.stackexchange.com/questions/46056/in-keras-library-what-is-the-meaning-of-nb-steps-warmup-in-the-dqnagent-objec logbook().record_hyperparameter('target_model_update', dqn.target_model_update) #info on this parameter here: https://github.com/keras-rl/keras-rl/issues/55 logbook().record_hyperparameter('nb_actions', nb_actions) logbook().record_hyperparameter('batch_size', dqn.batch_size) #defaults to 32. Info here: https://radiopaedia.org/articles/batch-size-machine-learning logbook().record_hyperparameter('gamma', dqn.gamma) #defaults to 0.99. 'Discount rate' according to Advanced Deep Learning with Keras # dqn.compile(Adam(lr=1e-3), metrics=['mae']) # Needs general tuning, usually model-specific - https://machinelearningmastery.com/learning-rate-for-deep-learning-neural-networks/ # learning_rate = 1e-6 # learning_rate = 1e-3 learning_rate = 1e-1 dqn.compile(Adam(lr=learning_rate), metrics=['mae']) logbook().record_hyperparameter('Learning Rate', learning_rate) # Okay, now it's time to learn something! We visualize the training here for show, but this # slows down training quite a lot. You can always safely abort the training prematurely using # Ctrl + C. # dqn.fit(env, nb_steps=50000, visualize=False, verbose=2) # nb_steps = 500000 # nb_steps = 50000 # nb_steps = 25000 nb_steps = 5000 # nb_steps = 50 dqn.fit(env, nb_steps=nb_steps, visualize=False, verbose=2) logbook().record_hyperparameter('nb_steps', nb_steps) # After training is done, we save the final weights.
# is Boltzmann-style exploration: # policy = BoltzmannQPolicy(tau=1.) # Feel free to give it a try! dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory, processor=processor, nb_steps_warmup=50000, gamma=.99, target_model_update=10000, train_interval=4, delta_clip=1.) dqn.compile(Adam(lr=.00025)) if args.mode == 'train' and args.weights != None: dqn.load_weights(args.weights) print('Model weights from file {} successfully charged'.format( args.weights)) date = datetime.now().strftime("%Y-%m-%d") if args.mode == 'train': # Okay, now it's time to learn something! We capture the interrupt exception so that training # can be prematurely aborted. Notice that now you can use the built-in Keras callbacks! weights_filename = 'dqn_{}_weights.h5f'.format(args.env_name) checkpoint_weights_filename = 'dqn_' + args.env_name + '_weights_{step}.h5f' log_filename = 'dqn_{}_log.json'.format(args.env_name) callbacks = [ ModelIntervalCheckpoint(checkpoint_weights_filename,
) # Defining our DQN dqn = DQNAgent( model=model, nb_actions=18, policy=policy, memory=memory, nb_steps_warmup=1000, gamma=0.5, target_model_update=1, delta_clip=0.01, enable_double_dqn=True, ) dqn.compile(Adam(lr=0.00025), metrics=["mae"]) # Training env_player.play_against( env_algorithm=dqn_training, opponent=opponent, env_algorithm_kwargs={ "dqn": dqn, "nb_steps": NB_TRAINING_STEPS }, ) model.save("model_%d" % NB_TRAINING_STEPS) # Evaluation print("Results against random player:") env_player.play_against(
def main(): try: # env = gym.make("AirRaid-v0") # env = gym.make("slitherio-v0") env = gym.make("slitherio-v0", headless=False, width=500, height=500) model_callbacks = [ ModelIntervalCheckpoint(SAVED_MODEL_NAME, interval=MODEL_SAVE_STEP_INTERVAL, verbose=0) ] # model = conv_model(env) # model = lstm_conv_model(env) model = full_combined_conv_lstm_model(env) model.load_weights(SAVED_MODEL_NAME) # model = enhanced_conv_lstm_model(env) # print(model.summary()) major_rounds = int(NSTEPS / 1000) max_total_eps = 1.0 min_total_eps = 0.1 eps_range = max_total_eps - min_total_eps eps_step = eps_range / major_rounds for major_step in range(major_rounds): print("Major step", major_step, "of", major_rounds) max_eps = max_total_eps - eps_step * major_step min_eps = max_eps - eps_step policy = LinearAnnealedPolicy( EpsGreedyQPolicy(eps=0.1), attr="eps", value_max=max_eps, value_min=min_eps, value_test=0.1, nb_steps=1000, ) memory = SequentialMemory(limit=DQN_MEMORY_SIZE, window_length=1) dqn = DQNAgent( model=model, nb_actions=env.action_space.n, memory=memory, target_model_update=1e-2, policy=policy, enable_double_dqn=True, processor=LSTMProcessor(), ) dqn.compile(Adam(lr=1e-3), metrics=["mae"]) dqn.fit( env, nb_steps=1000, visualize=False, verbose=1, callbacks=model_callbacks, log_interval=1000, # TODO bruh this fixes the 10000 issue! ) env.reset() dqn.test(env, nb_episodes=5, visualize=True) env.close() except WebDriverException as e: print(e) except Exception as e: env.close() print(e)
nb_actions = env.action_space.n # Next, we build a neural network model model = Sequential() model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) model.add(Dense(3, input_dim=1, activation='tanh')) model.add(Dense(nb_actions)) model.add(Activation('sigmoid')) policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=-1, value_test=.05, nb_steps=1000000) memory = SequentialMemory(limit=10000000, window_length=1) dqn2 = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=50, target_model_update=1e-2, policy=policy, enable_double_dqn=True, enable_dueling_network=False) dqn2.compile(Adam(lr=1e-3), metrics=['mae', 'acc']) import os.path file_path = 'Double_DQN_Taxi.h5f' if os.path.exists(file_path): dqn2.load_weights(file_path) class Saver(Callback): def on_episode_end(self, episode, logs={}): print('episode callback') if episode % 1 == 0: self.model.save_weights('Double_DQN_Taxi.h5f', overwrite=True)
from keras.optimizers import Adam import gym from rl.agents.dqn import DQNAgent from rl.policy import EpsGreedyQPolicy from rl.memory import SequentialMemory env = gym.make('MountainCar-v0') nb_actions = env.action_space.n model = Sequential() model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) model.add(Dense(16)) model.add(Activation('relu')) model.add(Dense(16)) model.add(Activation('relu')) model.add(Dense(16)) model.add(Activation('relu')) model.add(Dense(nb_actions)) model.add(Activation('linear')) memory = SequentialMemory(limit=30000, window_length=1) policy = EpsGreedyQPolicy(eps=0.001) dqn = DQNAgent(model=model, nb_actions=nb_actions,gamma=0.99, memory=memory, nb_steps_warmup=10, target_model_update=1e-2, policy=policy) dqn.compile(Adam(lr=1e-3), metrics=['mae']) history = dqn.fit(env, nb_steps=30000, visualize=False, verbose=2) dqn.test(env, nb_episodes=1, visualize=True)
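# The fit call above returns a History object; assuming a recent keras-rl version, which logs
# 'episode_reward' per episode, the learning curve can be inspected like this (matplotlib is
# not imported in the original snippet):
import matplotlib.pyplot as plt

plt.plot(history.history['episode_reward'])
plt.xlabel('episode')
plt.ylabel('episode reward')
plt.show()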
# If you want, you can experiment with the parameters or use a different policy. Another popular one # is Boltzmann-style exploration: #policy = BoltzmannQPolicy(tau=1.) # Feel free to give it a try! dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory, processor=processor, nb_steps_warmup=50000, gamma=.99, target_model_update=10000, train_interval=4, delta_clip=1.) dqn.compile(Adam(lr=.00025), metrics=['mae']) agents.append(dqn) mdqn = IndieMultiAgent(agents) if args.mode == 'train': # Okay, now it's time to learn something! We capture the interrupt exception so that training # can be prematurely aborted. Notice that you can use the built-in Keras callbacks! weights_filename = 'dqn_{}_weights.h5f'.format(args.env_name) checkpoint_weights_filename = 'dqn_' + args.env_name + '_weights_{step}.h5f' log_filename = 'dqn_{}_log.json'.format(args.env_name) callbacks = [ ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000) ] tensorboard = TensorBoard(log_dir="logs/{}_ESP_Greedy_{}".format( args.env_name, strftime("%Y-%m-%d %H:%M:%S", gmtime()))) callbacks += [FileLogger(log_filename, interval=100), tensorboard]