class DQN(BaseAgent): def __init__(self, model, processor, policy, test_policy, num_actions): # Replay memory memory = SequentialMemory(limit=opt.dqn_replay_memory_size, window_length=opt.dqn_window_length) self.agent = DQNAgent(model=model, nb_actions=num_actions, policy=policy, test_policy=test_policy, memory=memory, processor=processor, batch_size=opt.dqn_batch_size, nb_steps_warmup=opt.dqn_nb_steps_warmup, gamma=opt.dqn_gamma, target_model_update=opt.dqn_target_model_update, enable_double_dqn=opt.enable_double_dqn, enable_dueling_network=opt.enable_dueling_network, train_interval=opt.dqn_train_interval, delta_clip=opt.dqn_delta_clip) self.agent.compile(optimizer=keras.optimizers.Adam(lr=opt.dqn_learning_rate), metrics=['mae']) def fit(self, env, num_steps, weights_path=None, visualize=False): callbacks = [] if weights_path is not None: callbacks += [ModelIntervalCheckpoint(weights_path, interval=50000, verbose=1)] self.agent.fit(env=env, nb_steps=num_steps, action_repetition=opt.dqn_action_repetition, callbacks=callbacks, log_interval=opt.log_interval, test_interval=opt.test_interval, test_nb_episodes=opt.test_nb_episodes, test_action_repetition=opt.dqn_action_repetition, visualize=visualize, test_visualize=visualize, verbose=1) def test(self, env, num_episodes, visualize=False): self.agent.test(env=env, nb_episodes=num_episodes, action_repetition=opt.dqn_action_repetition, verbose=2, visualize=visualize) def save(self, out_dir): self.agent.save_weights(out_dir, overwrite=True) def load(self, out_dir): self.agent.load_weights(out_dir)
def train_dqn_model(layers, rounds=10000, run_test=False, use_score=False):
    ENV_NAME = 'malware-score-v0' if use_score else 'malware-v0'
    env = gym.make(ENV_NAME)
    env.seed(123)
    nb_actions = env.action_space.n
    window_length = 1  # an "experience" consists of where we were and where we are now

    # generate a policy model
    model = generate_dense_model((window_length,) + env.observation_space.shape, layers, nb_actions)

    # configure and compile our agent
    # BoltzmannQPolicy selects an action stochastically, with probabilities obtained by soft-maxing the Q values
    policy = BoltzmannQPolicy()

    # memory can help a model during training
    # here we only consider a single malware sample (window_length=1) for each "experience"
    memory = SequentialMemory(limit=32, ignore_episode_boundaries=False, window_length=window_length)

    # DQN agent as described in Mnih et al. (2013), with Double DQN (van Hasselt et al., 2015)
    # http://arxiv.org/pdf/1312.5602.pdf
    # http://arxiv.org/abs/1509.06461
    agent = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=16,
                     enable_double_dqn=True, enable_dueling_network=True, dueling_type='avg',
                     target_model_update=1e-2, policy=policy, batch_size=16)

    # keras-rl allows one to use any built-in Keras optimizer
    agent.compile(RMSprop(lr=1e-3), metrics=['mae'])

    # play the game. learn something!
    agent.fit(env, nb_steps=rounds, visualize=False, verbose=2)
    history_train = env.history
    history_test = None

    if run_test:
        # Set up the testing environment
        TEST_NAME = 'malware-score-test-v0' if use_score else 'malware-test-v0'
        test_env = gym.make(TEST_NAME)

        # evaluate the agent on a few episodes, drawing randomly from the test samples
        agent.test(test_env, nb_episodes=100, visualize=False)
        history_test = test_env.history

    return agent, model, history_train, history_test
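# --- Illustration (added for clarity, not part of the snippets above) ---
# A minimal sketch of what BoltzmannQPolicy does conceptually: actions are sampled
# with probabilities given by a softmax over the Q-values. The names `q_values` and
# `tau` below are illustrative; keras-rl's own implementation additionally clips the
# exponent for numerical stability.
import numpy as np

def boltzmann_select(q_values, tau=1.0):
    """Sample an action index with probability proportional to exp(Q / tau)."""
    q = np.asarray(q_values, dtype=np.float64)
    exp_q = np.exp((q - q.max()) / tau)   # subtract the max for numerical stability
    probs = exp_q / exp_q.sum()
    return np.random.choice(len(q), p=probs)

# Example: as tau -> 0 this approaches greedy selection; larger tau explores more.
# boltzmann_select([1.0, 2.0, 0.5], tau=0.5)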
               delta_clip=1.)
dqn.compile(Adam(lr=.00025), metrics=['mae'])

if args.mode == 'train':
    # Okay, now it's time to learn something! We capture the interrupt exception so that training
    # can be prematurely aborted. Notice that you can use the built-in Keras callbacks!
    weights_filename = 'dqn_{}_weights.h5f'.format(args.env_name)
    checkpoint_weights_filename = 'dqn_' + args.env_name + '_weights_{step}.h5f'
    log_filename = 'dqn_{}_log.json'.format(args.env_name)
    callbacks = [
        ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000)
    ]
    callbacks += [FileLogger(log_filename, interval=100)]
    dqn.fit(env, callbacks=callbacks, nb_steps=1750000, log_interval=10000, visualize=True)

    # After training is done, we save the final weights one more time.
    dqn.save_weights(weights_filename, overwrite=True)

    # Finally, evaluate our algorithm for 10 episodes.
    dqn.test(env, nb_episodes=10, visualize=False)
elif args.mode == 'test':
    weights_filename = 'dqn_{}_weights.h5f'.format(args.env_name)
    if args.weights:
        weights_filename = args.weights
    dqn.load_weights(weights_filename)
    dqn.test(env, nb_episodes=10, visualize=True)
outputFile = open("2105.csv", "w+") outputFile.write( "iteration,trainAccuracy,trainCoverage,trainReward,validationAccuracy,validationCoverage,validationReward\n" ) iteration = 0 for i in range(0, 100): dqn.fit(trainEnv, nb_steps=3000, visualize=False, callbacks=[trainer], verbose=0) (episodes, trainCoverage, trainAccuracy, trainReward) = trainer.getInfo() dqn.test(validationEnv, nb_episodes=300, verbose=0, callbacks=[validator], visualize=False) (episodes, validCoverage, validAccuracy, validReward) = validator.getInfo() outputFile.write( str(iteration) + "," + str(trainAccuracy) + "," + str(trainCoverage) + "," + str(trainReward) + "," + str(validAccuracy) + "," + str(validCoverage) + "," + str(validReward) + "\n") print( str(iteration) + " TRAIN: acc: " + str(trainAccuracy) + " cov: " + str(trainCoverage) + " rew: " + str(trainReward) + " VALID: acc: " + str(validAccuracy) + " cov: " + str(validCoverage) + " rew: " + str(validReward)) iteration += 1 validator.reset() trainer.reset()
class Agent(object): name = 'DQN' def __init__(self, number_of_training_steps=1e5, gamma=0.999, load_weights=False, visualize=False, dueling_network=True, double_dqn=True, nn_type='mlp', **kwargs): """ Agent constructor :param step_size: int, number of steps to take in env for a given simulation step :param window_size: int, number of lags to include in observation :param max_position: int, maximum number of positions able to be held in inventory :param fitting_file: str, file used for z-score fitting :param testing_file: str,file used for dqn experiment :param env: environment name :param seed: int, random seed number :param action_repeats: int, number of steps to take in environment between actions :param number_of_training_steps: int, number of steps to train agent for :param gamma: float, value between 0 and 1 used to discount future DQN returns :param format_3d: boolean, format observation as matrix or tensor :param train: boolean, train or test agent :param load_weights: boolean, import existing weights :param z_score: boolean, standardize observation space :param visualize: boolean, visiualize environment :param dueling_network: boolean, use dueling network architecture :param double_dqn: boolean, use double DQN for Q-value approximation """ # Agent arguments # self.env_name = id self.neural_network_type = nn_type self.load_weights = load_weights self.number_of_training_steps = number_of_training_steps self.visualize = visualize # Create environment self.env = gym.make(**kwargs) self.env_name = self.env.env.id # Create agent # NOTE: 'Keras-RL' uses its own frame-stacker self.memory_frame_stack = 1 # Number of frames to stack e.g., 1. self.model = self.create_model(name=self.neural_network_type) self.memory = SequentialMemory(limit=10000, window_length=self.memory_frame_stack) self.train = self.env.env.training self.cwd = os.path.dirname(os.path.realpath(__file__)) # create the agent self.agent = DQNAgent(model=self.model, nb_actions=self.env.action_space.n, memory=self.memory, processor=None, nb_steps_warmup=500, enable_dueling_network=dueling_network, dueling_type='avg', enable_double_dqn=double_dqn, gamma=gamma, target_model_update=1000, delta_clip=1.0) self.agent.compile(Adam(lr=float("3e-4")), metrics=['mae']) def __str__(self): # msg = '\n' # return msg.join(['{}={}'.format(k, v) for k, v in self.__dict__.items()]) return 'Agent = {} | env = {} | number_of_training_steps = {}'.format( Agent.name, self.env_name, self.number_of_training_steps) def create_model(self, name='cnn'): print("creating model for {}".format(name)) if name == 'cnn': return self._create_cnn_model() elif name == 'mlp': return self._create_mlp_model() def _create_cnn_model(self): """ Create a Convolutional neural network with dense layer at the end :return: keras model """ features_shape = (self.memory_frame_stack, *self.env.observation_space.shape) model = Sequential() conv = Conv2D model.add( conv(input_shape=features_shape, filters=16, kernel_size=[10, 1], padding='same', activation='relu', strides=[5, 1], data_format='channels_first')) model.add( conv(filters=16, kernel_size=[6, 1], padding='same', activation='relu', strides=[3, 1], data_format='channels_first')) model.add( conv(filters=16, kernel_size=[4, 1], padding='same', activation='relu', strides=[2, 1], data_format='channels_first')) model.add(Flatten()) model.add(Dense(256, activation='relu')) model.add(Dense(self.env.action_space.n, activation='softmax')) print(model.summary()) return model def _create_mlp_model(self): """ Create a DENSE neural 
network with dense layer at the end :return: keras model """ features_shape = (self.memory_frame_stack, *self.env.observation_space.shape) model = Sequential() model.add( Dense(units=256, input_shape=features_shape, activation='relu')) model.add(Dense(units=256, activation='relu')) model.add(Flatten()) model.add(Dense(self.env.action_space.n, activation='softmax')) print(model.summary()) return model def start(self): """ Entry point for agent training and testing :return: (void) """ output_directory = os.path.join(self.cwd, 'dqn_weights') if not os.path.exists(output_directory): print('{} does not exist. Creating Directory.'.format( output_directory)) os.mkdir(output_directory) weight_name = 'dqn_{}_{}_weights.h5f'.format(self.env_name, self.neural_network_type) weights_filename = os.path.join(output_directory, weight_name) print("weights_filename: {}".format(weights_filename)) if self.load_weights: print('...loading weights for {} from\n{}'.format( self.env_name, weights_filename)) self.agent.load_weights(weights_filename) if self.train: step_chkpt = '{step}.h5f' step_chkpt = 'dqn_{}_weights_{}'.format(self.env_name, step_chkpt) checkpoint_weights_filename = os.path.join(self.cwd, 'dqn_weights', step_chkpt) print("checkpoint_weights_filename: {}".format( checkpoint_weights_filename)) log_filename = os.path.join( self.cwd, 'dqn_weights', 'dqn_{}_log.json'.format(self.env_name)) print('log_filename: {}'.format(log_filename)) callbacks = [ ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000) ] callbacks += [FileLogger(log_filename, interval=100)] print('Starting training...') self.agent.fit(self.env, callbacks=callbacks, nb_steps=self.number_of_training_steps, log_interval=10000, verbose=0, visualize=self.visualize) print("training over.") print('Saving AGENT weights...') self.agent.save_weights(weights_filename, overwrite=True) print("AGENT weights saved.") else: print('Starting TEST...') self.agent.test(self.env, nb_episodes=2, visualize=self.visualize)
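# --- Illustration (added for clarity, not part of the snippet above) ---
# A small sketch, under the usual keras-rl semantics, of how target_model_update is
# interpreted: a value >= 1 means "hard-copy the online weights into the target network
# every N steps" (e.g. 1000 above), while a value < 1 is a soft-update coefficient tau
# applied at every training step (e.g. 1e-2 in several of the other snippets).
# The function below is illustrative only and assumes lists of numpy arrays.
import numpy as np

def update_target_weights(online_w, target_w, target_model_update):
    if target_model_update >= 1:
        # the caller is expected to invoke this only every int(target_model_update) steps
        return [w.copy() for w in online_w]
    tau = target_model_update
    return [tau * ow + (1.0 - tau) * tw for ow, tw in zip(online_w, target_w)]

# e.g. update_target_weights([np.ones(3)], [np.zeros(3)], 1e-2) -> [array([0.01, 0.01, 0.01])]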
def main(): ml_variables = FXU.getMLVariables() sqlEngine = FXU.getSQLEngine() reinforce_test_tablename = "reinforcetests" actions_table_details = { 'name': 'metaactions', 'col': ['Action', 'Time'], 'type': ['VARCHAR(20)', 'datetime'], 'null': [False, False] } ### Clear the actions table FXU.execute_query_db("DELETE FROM metaactions", sqlEngine) env = ForexEnv(type="train", inputSymbol="EURUSD", show_trade=True) env_test = ForexEnv(type="test", inputSymbol="EURUSD", show_trade=True) n_actions = env.action_space.n print("Number of actions : ", n_actions) model = create_model(shape=env.observation_space.shape, n_actions=n_actions) print(model.summary()) #### Configuring the agent memory = SequentialMemory(limit=100000, window_length=env.window_size) policy = EpsGreedyQPolicy() # enable the dueling network # you can specify the dueling_type to one of {'avg','max','naive'} dqn = DQNAgent(model=model, nb_actions=n_actions, memory=memory, nb_steps_warmup=100, enable_dueling_network=True, dueling_type='avg', target_model_update=1e-2, policy=policy) dqn.compile(Adam(lr=1e-4), metrics=['mae']) minPortfolioThreshold = 0.4 training_episodes_n = int(ml_variables['TrainingEpisodesNumber']) ##### Load weights if available to resume previous learning if ml_variables['LoadWeights'] != 'no': model_file_name = "model\\dqnTrainingWeights_{0}.h5f".format( env_test.symbol) if ospath.isfile(model_file_name): print( "Weights for the previous session exist, so Going to load the weights" ) dqn.load_weights(model_file_name) while True: ####### Load the best weights if available #################### """ if ml_variables['LoadWeights'] != 'no': ##### Get from DB the best Profit ######################### rs = FXU.getTableRows_db( "SELECT * FROM {0} WHERE Symbol = '{1}' AND MinPortfolio > {2} ORDER BY TotalProfit DESC".format(reinforce_test_tablename, env_test.symbol, ( minPortfolioThreshold * env_test.starting_balance))) firstRow = -1 for row in rs: firstRow = row break if firstRow != -1: print("Best value : ", firstRow['TotalProfit']) model_file_name = "model\\duel_dqn_reward_{0}_{1}.h5f".format(env_test.symbol, int(firstRow['TotalProfit'])) if ospath.isfile(model_file_name): print("Weights for the best profit : {0} exist, so Going to load the weights".format(int(firstRow['TotalProfit']))) dqn.load_weights(model_file_name) """ # Train : dqn.fit(env, nb_steps=(env.split_point * training_episodes_n), nb_max_episode_steps=60000, visualize=False, verbose=2) dqn.save_weights('./model/dqnTrainingWeights_{0}.h5f'.format( env.symbol), overwrite=True) try: info = dqn.test(env_test, nb_episodes=1, visualize=False) #reward = info.history['episode_reward'] reward = env_test.portfolio - env_test.starting_balance print("Total Profit : ", reward) now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') # if reward > int(max_reward) and int(reward) != 0 and env_test.minPortfolio > (minPortfolioThreshold * env_test.starting_balance): # max_reward = int(reward) #np.array([info.history]).dump('./info/duel_dqn_reward_{0}_{1}.info'.format(env_test.symbol, max_reward)) if reward > 500 and env_test.minPortfolio > ( minPortfolioThreshold * env_test.starting_balance): dqn.save_weights('./model/duel_dqn_reward_{0}_{1}.h5f'.format( env_test.symbol, int(reward)), overwrite=True) #print("Info of testing : ",info.history) FXU.execute_query_db( "INSERT INTO {0}(Symbol,StartingBalance,TotalProfit,Time,MinPortfolio) VALUES('{1}','{2}','{3}','{4}','{5}')" .format(reinforce_test_tablename, env_test.symbol, env_test.starting_balance, reward, now, 
env_test.minPortfolio), sqlEngine) #n_buys, n_lostBuys, n_sells, n_lostSells, portfolio = info['buys'], info['lostBuys'], info['sells'], info['lostBuys'] #np.array([info]).dump('./info/duel_dqn_{0}_weights_{1}LS_{2}_{3}.info'.format(env_test.symbol, portfolio, n_buys, n_sells)) except KeyboardInterrupt: return
def main(): ml_variables = FXU.getMLVariables() actions_table_details = { 'name': 'metaactions', 'col': ['Action', 'Time'], 'type': ['VARCHAR(20)', 'datetime'], 'null': [False, False] } ### Clear the actions table FXU.execute_query_db("DELETE FROM metaactions") env = ForexEnv(type="train", inputSymbol="EURUSD", show_trade=True) env_test = ForexEnv(type="test", inputSymbol="EURUSD", show_trade=True) n_actions = env.action_space.n print("Number of actions : ", n_actions) model = create_model(shape=env.observation_space.shape, n_actions=n_actions) print(model.summary()) #### Configuring the agent memory = SequentialMemory(limit=100000, window_length=env.window_size) policy = EpsGreedyQPolicy() # enable the dueling network # you can specify the dueling_type to one of {'avg','max','naive'} dqn = DQNAgent(model=model, nb_actions=n_actions, memory=memory, nb_steps_warmup=100, enable_dueling_network=True, dueling_type='naive', target_model_update=1e-2, policy=policy) dqn.compile(Adam(lr=1e-4), metrics=['mse']) if ml_variables['LoadWeights'] != 'no': path = 'model\\' + ml_variables['LoadWeights'] + ".h5f" if ospath.isfile(path): print("Weights exist, so Going to load the weights") dqn.load_weights(path) max_reward = -1000000 while True: # Train : dqn.fit(env, nb_steps=env.split_point, nb_max_episode_steps=60000, visualize=False, verbose=2) try: info = dqn.test(env_test, nb_episodes=1, visualize=False) #reward = info.history['episode_reward'] reward = env_test.balance - env_test.starting_balance print("reward : ", reward) if reward > int(max_reward) and int(reward) != 0: max_reward = int(reward) np.array([info.history ]).dump('./info/duel_dqn_reward_{0}_{1}.info'.format( env_test.symbol, max_reward)) dqn.save_weights('./model/duel_dqn_reward_{0}_{1}.h5f'.format( env_test.symbol, max_reward)) #print("Info of testing : ",info.history) #n_buys, n_lostBuys, n_sells, n_lostSells, portfolio = info['buys'], info['lostBuys'], info['sells'], info['lostBuys'] #np.array([info]).dump('./info/duel_dqn_{0}_weights_{1}LS_{2}_{3}.info'.format(env_test.symbol, portfolio, n_buys, n_sells)) except KeyboardInterrupt: return ##### Saving weights after each fitting to resume afterwards ############### if ml_variables['LoadWeights'] != 'no': dqn.save_weights(filepath='model\\' + ml_variables['LoadWeights'] + ".h5f", overwrite=True)
else:
    raise ValueError('Please select DQN, DUEL_DQN, SARSA, or CEM for your method type.')

callbacks = []
# callbacks += [ModelIntervalCheckpoint(CHECKPOINT_WEIGHTS_FILENAME, interval=10000)]
callbacks += [FileLogger(LOG_FILENAME, interval=100)]
# callbacks += [TensorBoard(log_dir='logs', histogram_freq=0, write_graph=True, write_images=False)]
callbacks += [ExploreExploit()]

# Optionally, we can reload a previous model's weights and continue training from there
# LOAD_WEIGHTS_FILENAME = 'weights/duel_dqn_planar_crane-v0_weights_1024_4_50000_2017-07-12_160853.h5f'
#
# # Load the model weights
# agent.load_weights(LOAD_WEIGHTS_FILENAME)

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
agent.fit(env, nb_steps=NUM_STEPS, callbacks=callbacks, action_repetition=5, visualize=False,
          verbose=1, log_interval=LOG_INTERVAL, nb_max_episode_steps=500)

# After training is done, we save the final weights.
agent.save_weights(WEIGHT_FILENAME, overwrite=True)

# We'll also save a simply-named version to make running a test immediately
# following training easier.
filename = 'weights/{}_{}_weights.h5f'.format(METHOD, ENV_NAME)
agent.save_weights(filename, overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
agent.test(env, nb_episodes=5, visualize=True, action_repetition=5)  # nb_max_episode_steps=500,
model.add(Activation('relu')) model.add(Dense(nb_actions, activation='linear')) print(model.summary()) # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and # even the metrics! memory = SequentialMemory(limit=50000, window_length=1) policy = BoltzmannQPolicy() # enable the dueling network # you can specify the dueling_type to one of {'avg','max','naive'} dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, enable_dueling_network=True, dueling_type='avg', target_model_update=1e-2, policy=policy) dqn.compile(Adam(lr=1e-3), metrics=['mae']) # Okay, now it's time to learn something! We visualize the training here for show, but this # slows down training quite a lot. You can always safely abort the training prematurely using # Ctrl + C. dqn.fit(env, nb_steps=50000, visualize=False, verbose=2) # After training is done, we save the final weights. dqn.save_weights('duel_dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True) # Finally, evaluate our algorithm for 5 episodes. dqn.test(env, nb_episodes=5, visualize=False)
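# --- Illustration (added for clarity, not part of the snippet above) ---
# A rough sketch, assuming the standard dueling-DQN formulation, of how the three
# dueling_type options combine the state value V(s) and advantages A(s, a) into
# Q-values. `v` and `advantages` are illustrative inputs, not values from the code above.
import numpy as np

def dueling_q(v, advantages, dueling_type='avg'):
    a = np.asarray(advantages, dtype=np.float64)
    if dueling_type == 'avg':      # Q = V + (A - mean(A)), the variant used above
        return v + (a - a.mean())
    elif dueling_type == 'max':    # Q = V + (A - max(A))
        return v + (a - a.max())
    elif dueling_type == 'naive':  # Q = V + A (no identifiability correction)
        return v + a
    raise ValueError('dueling_type must be avg, max, or naive')

# e.g. dueling_q(1.0, [0.2, -0.1, 0.4], dueling_type='avg')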
dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory, processor=processor, enable_double_dqn=True, enable_dueling_network=True, nb_steps_warmup=1000, gamma=.99, target_model_update=10000, train_interval=4, delta_clip=1.) #Prioritized Memories typically use lower learning rates dqn.compile(Adam(lr=.00025/4), metrics=['mae']) folder_path = './' mode = 'train' if mode == 'train': weights_filename = folder_path + 'pdd_dqn_{}_weights.h5f'.format(env_name) checkpoint_weights_filename = folder_path + 'pdd_dqn_' + env_name + '_weights_{step}.h5f' log_filename = folder_path + 'pdd_dqn_' + env_name + '_REWARD_DATA.txt' callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=500000)] callbacks += [TrainEpisodeLogger()] dqn.fit(env, callbacks=callbacks, nb_steps=10000000, verbose=0, nb_max_episode_steps=20000) elif mode == 'test': weights_filename = folder_path + 'pdd_dqn_MsPacmanDeterministic-v4_weights_10000000.h5f' dqn.load_weights(weights_filename) dqn.test(env, nb_episodes=10, visualize=True, nb_max_start_steps=80)
def attempt(lr, numTrainSteps, fnamePrefix, activation, exportVid, visualize): # Get the environment and extract the number of actions. env = gym.make(ENV_NAME) np.random.seed(123) env.seed(123) nb_actions = env.action_space.n print("env.observation_space.shape: " + str(env.observation_space.shape)) # Next, we build a very simple model. model = Sequential() model.add(Flatten(input_shape=(1, ) + env.observation_space.shape)) model.add(Dense(16)) model.add(Activation(activation)) model.add(Dense(13)) model.add(Activation(activation)) model.add(Dense(10)) model.add(Activation(activation)) model.add(Dense(nb_actions)) model.add(Activation('linear')) print(model.summary()) # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and # even the metrics! memory = SequentialMemory(limit=100000, window_length=1) policy = BoltzmannQPolicy() dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=100, target_model_update=1e-2, policy=policy) dqn.compile(Adam(lr=lr), metrics=['mae']) if not os.path.exists(fnamePrefix): os.makedirs(fnamePrefix) weights_fname = '%s/weights.h5f' % fnamePrefix if os.path.isfile(weights_fname): print("Loading weights from before") print("Skipping training") dqn.load_weights(weights_fname) else: # Okay, now it's time to learn something! We visualize the training here for show, but this # slows down training quite a lot. You can always safely abort the training prematurely using # Ctrl + C. dqn.fit(env, nb_steps=numTrainSteps, visualize=False, verbose=1) # After training is done, we save the final weights. dqn.save_weights(weights_fname, overwrite=True) # Finally, evaluate our algorithm for 5 episodes. env.reset() env.close() if exportVid: if not visualize: # print to stderr, since trainAll redirects stdout eprint( "Error: I don't think the video export works unless you choose visualize=True" ) videoFname = fnamePrefix + '/videos/' + str(time()) if not os.path.exists(videoFname): os.makedirs(videoFname) env = wrappers.Monitor(env, videoFname, force=True) result = dqn.test(env, nb_episodes=1, visualize=visualize) if exportVid: print("Video saved to %s" % videoFname) means = {'reward': mean(result.history['episode_reward'])} json_fname = fnamePrefix + '/result.json' with open(json_fname, "w") as f: json.dump(result.history, f) return (means)
    checkpoint_weights_filename = 'dqn_' + Snake_env.name + '_weights_{step}.h5f'
    log_filename = 'dqn_{}_log.json'.format(Snake_env.name)
    callbacks = [
        ModelIntervalCheckpoint(checkpoint_weights_filename, interval=1000)
    ]
    callbacks += [FileLogger(log_filename, interval=1000)]
    weights = "dqn_" + Snake_env.name + "_weights_" + str(step) + ".h5f"
    # if weights:
    #     weights_filename_1 = weights
    # dqn.load_weights(weights_filename_1)

    # start training
    dqn.fit(Snake_env, callbacks=callbacks, nb_steps=step, log_interval=1000, verbose=1)

    # save the weights
    dqn.save_weights(weights_filename, overwrite=True)
elif mode == 'test':
    # load the weights
    weights = "dqn_" + Snake_env.name + "_weights_" + str(step) + ".h5f"
    if weights:
        weights_filename = weights
    dqn.load_weights(weights_filename)
    dqn.test(Snake_env, nb_episodes=10, visualize=True)
class Agent(object): name = 'DQN' def __init__( self, step_size=1, window_size=20, max_position=5, fitting_file='ETH-USD_2018-12-31.xz', testing_file='ETH-USD_2018-01-01.xz', env='market-maker-v0', seed=1, action_repeats=4, number_of_training_steps=1e5, gamma=0.999, format_3d=False, # add 3rd dimension for CNNs train=True, weights=True, z_score=True, visualize=False, dueling_network=True, double_dqn=True): """ Agent constructor :param step_size: int, number of steps to take in env for a given simulation step :param window_size: int, number of lags to include in observation :param max_position: int, maximum number of positions able to be held in inventory :param fitting_file: str, file used for z-score fitting :param testing_file: str,file used for dqn experiment :param env: environment name :param seed: int, random seed number :param action_repeats: int, number of steps to take in environment between actions :param number_of_training_steps: int, number of steps to train agent for :param gamma: float, value between 0 and 1 used to discount future DQN returns :param format_3d: boolean, format observation as matrix or tensor :param train: boolean, train or test agent :param weights: boolean, import existing weights :param z_score: boolean, standardize observation space :param visualize: boolean, visiualize environment :param dueling_network: boolean, use dueling network architecture :param double_dqn: boolean, use double DQN for Q-value approximation """ self.env_name = env self.env = gym.make(self.env_name, fitting_file=fitting_file, testing_file=testing_file, step_size=step_size, max_position=max_position, window_size=window_size, seed=seed, action_repeats=action_repeats, training=train, z_score=z_score, format_3d=format_3d) # Number of frames to stack e.g., 1. 
# NOTE: 'Keras-RL' uses its own frame-stacker self.memory_frame_stack = 1 self.model = self.create_model() self.memory = SequentialMemory(limit=10000, window_length=self.memory_frame_stack) self.train = train self.number_of_training_steps = number_of_training_steps self.weights = weights self.cwd = os.path.dirname(os.path.realpath(__file__)) self.visualize = visualize # create the agent self.agent = DQNAgent(model=self.model, nb_actions=self.env.action_space.n, memory=self.memory, processor=None, nb_steps_warmup=500, enable_dueling_network=dueling_network, dueling_type='avg', enable_double_dqn=double_dqn, gamma=gamma, target_model_update=1000, delta_clip=1.0) self.agent.compile(Adam(lr=float("3e-4")), metrics=['mae']) def __str__(self): # msg = '\n' # return msg.join(['{}={}'.format(k, v) for k, v in self.__dict__.items()]) return 'Agent = {} | env = {} | number_of_training_steps = {}'.format( Agent.name, self.env_name, self.number_of_training_steps) def create_model(self): """ Create a Convolutional neural network with dense layer at the end :return: keras model """ features_shape = (self.memory_frame_stack, *self.env.observation_space.shape) model = Sequential() conv = Conv2D model.add( conv(input_shape=features_shape, filters=16, kernel_size=[10, 1], padding='same', activation='relu', strides=[5, 1], data_format='channels_first')) model.add( conv(filters=16, kernel_size=[6, 1], padding='same', activation='relu', strides=[3, 1], data_format='channels_first')) model.add( conv(filters=16, kernel_size=[4, 1], padding='same', activation='relu', strides=[2, 1], data_format='channels_first')) model.add(Flatten()) model.add(Dense(512)) model.add(Activation('linear')) model.add(Dense(self.env.action_space.n)) model.add(Activation('softmax')) print(model.summary()) return model def start(self): """ Entry point for agent training and testing :return: (void) """ weights_filename = '{}/dqn_weights/dqn_{}_weights.h5f'.format( self.cwd, self.env_name) if self.weights: self.agent.load_weights(weights_filename) print('...loading weights for {}'.format(self.env_name)) if self.train: checkpoint_weights_filename = 'dqn_{}'.format(self.env_name) + \ '_weights_{step}.h5f' checkpoint_weights_filename = '{}/dqn_weights/'.format(self.cwd) + \ checkpoint_weights_filename log_filename = '{}/dqn_weights/dqn_{}_log.json'.format( self.cwd, self.env_name) print('FileLogger: {}'.format(log_filename)) callbacks = [ ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000) ] callbacks += [FileLogger(log_filename, interval=100)] print('Starting training...') self.agent.fit(self.env, callbacks=callbacks, nb_steps=self.number_of_training_steps, log_interval=10000, verbose=0, visualize=self.visualize) print('Saving AGENT weights...') self.agent.save_weights(weights_filename, overwrite=True) else: print('Starting TEST...') self.agent.test(self.env, nb_episodes=2, visualize=self.visualize)
memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
processor = AtariProcessor()
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1,
                              value_test=.05, nb_steps=1250000)

dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory,
               processor=processor, enable_double_dqn=False, enable_dueling_network=False,
               nb_steps_warmup=1000, gamma=.99, target_model_update=10000,
               train_interval=4, delta_clip=1.)
dqn.compile(Adam(lr=.00025), metrics=['mae'])

folder_path = './machine-learning/pacman/'
mode = 'test'

if mode == 'train':
    weights_filename = folder_path + 'dqn_{}_weights.h5f'.format(env_name)
    checkpoint_weights_filename = folder_path + 'dqn_' + env_name + '_weights_{step}.h5f'
    log_filename = folder_path + 'dqn_' + env_name + '_REWARD_DATA.txt'
    callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=500000)]
    callbacks += [TrainEpisodeLogger()]
    dqn.fit(env, callbacks=callbacks, nb_steps=10000000, verbose=1, nb_max_episode_steps=20000)
elif mode == 'test':
    weights_filename = folder_path + 'dqn_MsPacman-v0_weights_10000000.h5f'
    dqn.load_weights(weights_filename)
    dqn.test(env, nb_episodes=20, visualize=False, nb_max_start_steps=80)
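# --- Illustration (added for clarity, not part of the snippet above) ---
# A minimal sketch, under the usual keras-rl semantics, of what LinearAnnealedPolicy does
# to the wrapped EpsGreedyQPolicy's `eps` attribute: it is interpolated linearly from
# value_max down to value_min over nb_steps of training and then held constant
# (value_test is used during evaluation instead). The function below is illustrative only.
def annealed_eps(step, value_max=1.0, value_min=0.1, nb_steps=1250000):
    frac = min(float(step) / float(nb_steps), 1.0)
    return value_max + frac * (value_min - value_max)

# e.g. annealed_eps(0) == 1.0, annealed_eps(1250000) == 0.1, and it stays at 0.1 afterwards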
class BaseAgent: dqn: DQNAgent def __init__(self, stock: str): self.env = gym.make('stockenv-v0', df=read_daily_data(stock)) print(self.env) print(self.env.action_space) print(self.env.observation_space) self.env.seed(123) self.stock = stock memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH) policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=.05, nb_steps=1000000) processor = StockProcessor(stock) model = self.create_model(30) print("output:", model.output.shape) print("output2:", self.env.action_space.shape) print(list(model.output.shape)) print(list((None, self.env.action_space.shape))) self.dqn = DQNAgent(model=model, nb_actions=self.env.action_space.n, policy=policy, memory=memory, processor=processor, nb_steps_warmup=50000, gamma=.99, target_model_update=10000, train_interval=4, delta_clip=1.) self.dqn.compile(Adam(lr=.00025), metrics=['mae']) def train(self): # Okay, now it's time to learn something! We capture the interrupt exception so that training # can be prematurely aborted. Notice that now you can use the built-in Keras callbacks! weights_filename = self.get_weight_path(self.stock) checkpoint_weights_filename = self.get_weight_path( self.stock) + '_{step}.h5f' log_filename = 'dqn_{}_log.json'.format(self.stock) callbacks = [ ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000) ] callbacks += [FileLogger(log_filename, interval=100)] # callbacks += [WandbLogger( # project = "stock-bot-v0" # )] self.dqn.fit(self.env, callbacks=callbacks, nb_steps=1750000, log_interval=10000) # After training is done, we save the final weights one more time. self.dqn.save_weights(weights_filename, overwrite=True) # Finally, evaluate our algorithm for 10 episodes. self.dqn.test(self.env, nb_episodes=10, visualize=False) def test(self): weights_filename = self.get_weight_path(self.stock) self.dqn.load_weights(weights_filename) self.dqn.test(self.env, nb_episodes=10, visualize=True) def get_weight_path(self, name: str) -> str: """Get weight path""" pass def create_model(self, input_size: int): """abstract""" pass
model.add(Activation('linear'))
print(model.summary())

# Export the model architecture to JSON
model_json_str = model.to_json()
open('dqn_{}_model.json'.format(ENV_NAME), 'w').write(model_json_str)

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=100,
               target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
dqn.fit(env, nb_steps=50000, visualize=True, verbose=2)

# After training is done, we save the final weights.
dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
dqn.test(env, nb_episodes=5, visualize=True)

# Note: DQNAgent.compute_q_values() expects a state (a window of observations),
# e.g. dqn.compute_q_values([observation]); calling it with no arguments raises a TypeError.
# dqn.compute_q_values()
nb_steps_warmup=50000, gamma=.99, target_model_update=10000, train_interval=4, delta_clip=1.) dqn.compile(RMSprop(lr=.00025), metrics=['mae']) if args.mode == 'train': # Okay, now it's time to learn something! We capture the interrupt exception so that training # can be prematurely aborted. Notice that now you can use the built-in tensorflow.keras callbacks! weights_filename = 'dqn_{}_weights.h5f'.format(args.env_name) checkpoint_weights_filename = 'dqn_' + args.env_name + '_weights_{step}.h5f' log_filename = 'dqn_{}_log.json'.format(args.env_name) callbacks = [ ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000) ] callbacks += [FileLogger(log_filename, interval=100)] dqn.fit(env, callbacks=callbacks, nb_steps=20000000, log_interval=10000) # After training is done, we save the final weights one more time. dqn.save_weights(weights_filename, overwrite=True) # Finally, evaluate our algorithm for 10 episodes. dqn.test(env, nb_episodes=10, visualize=False) elif args.mode == 'test': weights_filename = 'dqn_{}_weights.h5f'.format(args.env_name) if args.weights: weights_filename = args.weights dqn.load_weights(weights_filename) dqn.test(env, nb_episodes=50, visualize=False, nb_max_episode_steps=100)
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=100, enable_dueling_network=True, dueling_type='avg', target_model_update=1e-2, policy=train_policy, test_policy=test_policy) filename = 'weights/duel_dqn_{}_weights_{}_{}_{}_{}.h5f'.format(ENV_NAME, LAYER_SIZE, NUM_HIDDEN_LAYERS, NUM_STEPS, TRIAL_ID) else: dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=100, target_model_update=1e-2, policy=train_policy, test_policy=test_policy) filename = 'weights/dqn_{}_weights_{}_{}_{}_{}.h5f'.format(ENV_NAME, LAYER_SIZE, NUM_HIDDEN_LAYERS, NUM_STEPS, TRIAL_ID) dqn.compile(Adam(lr=1e-3), metrics=['mae']) # Optionally, we can reload a previous model's weights and continue training from there # FILENAME = 'weights/duel_dqn_variable_pendulum-v0_weights_4096_4_50000_2017-07-11_140316.h5f' # Load the model weights # dqn.load_weights(FILENAME) # Okay, now it's time to learn something! We visualize the training here for show, but this # slows down training quite a lot. You can always safely abort the training prematurely using # Ctrl + C. dqn.fit(env, nb_steps=NUM_STEPS, visualize=False, verbose=2, nb_max_episode_steps=500) # After training is done, we save the final weights. dqn.save_weights(filename, overwrite=True) # Finally, evaluate our algorithm for 5 episodes. dqn.test(env, nb_episodes=5, nb_max_episode_steps=500, visualize=True)
def dqn_rt(nonLinearLayers=3, neuronsPerLayer=4, epsilon=0.3, tau=1, exploration="tau", gamma=0.5): ENV_NAME = 'RtSimulationEnv-v0' # Get the environment and extract the number of actions. env = gym.make(ENV_NAME) np.random.seed(123) env.seed(123) nb_actions = env.action_space.n # Next, we build a very simple model. model = Sequential() model.add(Flatten(input_shape=(1, ) + env.observation_space.shape)) for x in range(0, nonLinearLayers): model.add(Activation('relu')) model.add(Dense(neuronsPerLayer)) model.add(Dense(nb_actions)) model.add(Activation('linear')) print(model.summary()) # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and # even the metrics! memory = SequentialMemory(limit=50000, window_length=1) if (exploration == "epsilon"): policy = EpsGreedyQPolicy(eps=epsilon) else: policy = BoltzmannQPolicy(tau=tau) dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, target_model_update=1e-2, policy=policy, gamma=gamma) #dqn = SarsaAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy) #dqn = DDPGAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,target_model_update=1e-2, policy=policy) dqn.compile(Adam(lr=1e-3), metrics=['mae']) # Okay, now it's time to learn something! If we visualize the training here for show, this # slows down training quite a lot. We can also always safely abort the training prematurely using # Ctrl + C. env.testing = False foo = dqn.fit(env, nb_steps=1000, visualize=False, verbose=3) d = datetime.utcnow() unixtime = calendar.timegm(d.utctimetuple()) ###save env.unwrapped.totalStates and env.unwrapped.actions as: [state,action] pairs with open('simulatedRTs_' + str(unixtime) + '.csv', 'wb') as f: writer = csv.writer(f) writer.writerows([ env.unwrapped.totalStates, env.unwrapped.rewards, env.unwrapped.actions ]) # After training is done, we save the final weights. # dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True) # Finally, evaluate our algorithm for 10 episodes. env.testing = True dqn.test(env, nb_episodes=5, visualize=False)
log_interval=1000) # Do early stopping fit test_scores = [] train_scores = [] no_improvement = 0 f = open('log', 'w') f.write('best'.rjust(10) + 'test'.rjust(10) + 'train'.rjust(10) + '\n') f.flush() # Find test score agent.test(test_env, nb_episodes=100, nb_max_episode_steps=n_steps, visualize=False) test_score = np.mean(test_env.value_history[-100:]) test_scores.append(test_score) # Find train score agent.test(train_env, nb_episodes=100, nb_max_episode_steps=n_steps, visualize=False) train_score = np.mean(train_env.value_history[-100:]) train_scores.append(train_score) best_score = min(test_score, train_score) f.write('{0:.5f}'.format(best_score).rjust(10) +
ENV_NAME = 'FrozenLake-v0' env = gym.make(ENV_NAME) np.random.seed(1) env.seed(1) Actions = env.action_space.n model = Sequential() model.add(Embedding(16, 4, input_length=1)) model.add(Reshape((4,))) print(model.summary()) memory = SequentialMemory(limit=10000, window_length=1) policy = BoltzmannQPolicy() Dqn = DQNAgent(model=model, nb_actions=Actions, memory=memory, nb_steps_warmup=500, target_model_update=1e-2, policy=policy, enable_double_dqn=False, batch_size=512 ) Dqn.compile(Adam()) Dqn.fit(env, nb_steps=1e5, visualize=False, verbose=1, log_interval=10000) Dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True) Dqn.test(env, nb_episodes=20, visualize=False)
def training_game(): env = Environment( map_name="ForceField", visualize=True, game_steps_per_episode=150, agent_interface_format=features.AgentInterfaceFormat( feature_dimensions=features.Dimensions(screen=64, minimap=32))) input_shape = (_SIZE, _SIZE, 1) nb_actions = 12 # Number of actions model = neural_network_model(input_shape, nb_actions) memory = SequentialMemory(limit=5000, window_length=_WINDOW_LENGTH) processor = SC2Proc() # Policy policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr="eps", value_max=1, value_min=0.2, value_test=.0, nb_steps=1e2) # Agent dqn = DQNAgent( model=model, nb_actions=nb_actions, memory=memory, enable_double_dqn=True, enable_dueling_network=True, # 2019-07-12 GU Zhan (Sam) # nb_steps_warmup=500, target_model_update=1e-2, policy=policy, nb_steps_warmup=1500, target_model_update=1e-2, policy=policy, batch_size=150, processor=processor, delta_clip=1) dqn.compile(Adam(lr=.001), metrics=["mae", "acc"]) # Tensorboard callback timestamp = f"{datetime.datetime.now():%Y-%m-%d %I:%M%p}" # 2019-07-12 GU Zhan (Sam) folder name for Lunux: # callbacks = keras.callbacks.TensorBoard(log_dir='./Graph/'+ timestamp, histogram_freq=0, # write_graph=True, write_images=False) # 2019-07-12 GU Zhan (Sam) folder name for Windows: callbacks = keras.callbacks.TensorBoard(log_dir='.\Graph\issgz', histogram_freq=0, write_graph=True, write_images=False) # Save the parameters and upload them when needed name = "agent" w_file = "dqn_{}_weights.h5f".format(name) check_w_file = "train_w" + name + "_weights.h5f" if SAVE_MODEL: check_w_file = "train_w" + name + "_weights_{step}.h5f" log_file = "training_w_{}_log.json".format(name) if LOAD_MODEL: dqn.load_weights(w_file) class Saver(Callback): def on_episode_end(self, episode, logs={}): if episode % 200 == 0: self.model.save_weights(w_file, overwrite=True) s = Saver() logs = FileLogger('DQN_Agent_log.csv', interval=1) # dqn.fit(env, callbacks=[callbacks,s,logs], nb_steps=600, action_repetition=2, dqn.fit(env, callbacks=[callbacks, s, logs], nb_steps=3000, action_repetition=2, log_interval=1e4, verbose=2) dqn.save_weights(w_file, overwrite=True) dqn.test(env, action_repetition=2, nb_episodes=30, visualize=False)
import rl.callbacks class EpisodeLogger(rl.callbacks.Callback): def __init__(self): self.observations = {} self.rewards = {} self.actions = {} def on_episode_begin(self, episode, logs): self.observations[episode] = [] self.rewards[episode] = [] self.actions[episode] = [] def on_step_end(self, step, logs): episode = logs['episode'] self.observations[episode].append(logs['observation']) self.rewards[episode].append(logs['reward']) self.actions[episode].append(logs['action']) cb_ep = EpisodeLogger() dqn.test(env, nb_episodes=10, visualize=False, callbacks=[cb_ep]) import matplotlib.pyplot as plt for obs in cb_ep.observations.values(): plt.plot([o[0] for o in obs]) plt.xlabel("step") plt.ylabel("pos")
class DistopiaDQN: def __init__(self, env_name='distopia-initial4-v0', in_path=None, out_path=None, terminate_on_fail=False, reconstruct=False): self.ENV_NAME = env_name self.filename = self.ENV_NAME self.init_paths(in_path, out_path) self.init_env(terminate_on_fail) self.init_model(reconstruct) self.compile_agent() def init_paths(self, in_path, out_path): self.in_path = in_path #if self.in_path != None else './' self.out_path = out_path if out_path != None else './' self.log_path = "./logs/{}".format(time.time()) os.mkdir(self.log_path) def init_env(self, terminate_on_fail): self.env = gym.make(self.ENV_NAME) self.env.terminate_on_fail = terminate_on_fail self.env.record_path = "{}/ep_".format(self.log_path) self.env = gym.wrappers.Monitor(self.env, "recording", force=True) np.random.seed(234) self.env.seed(234) self.nb_actions = np.sum(self.env.action_space.nvec) self.num_actions = self.env.NUM_DIRECTIONS self.num_blocks = self.env.NUM_DISTRICTS * self.env.BLOCKS_PER_DISTRICT def init_model(self, reconstruct=False): if self.in_path != None: if reconstruct == True: self.construct_model() else: yaml_file = open( "{}/{}.yaml".format(self.in_path, self.filename), 'r') model_yaml = yaml_file.read() yaml_file.close() self.model = model_from_yaml(model_yaml) self.model.load_weights("{}/{}.h5".format(self.in_path, self.filename)) else: # Next, we build a very simple model. self.construct_model() self.save_model() print(self.model.summary()) def construct_model(self): self.model = Sequential() self.model.add( Flatten(input_shape=(1, ) + self.env.observation_space.shape)) self.model.add(Dense(64)) self.model.add(Activation('relu')) self.model.add(Dense(64)) self.model.add(Activation('relu')) # self.model.add(Dense(16)) # self.model.add(Activation('relu')) self.model.add(Dense(self.nb_actions)) self.model.add(Activation('linear')) def save_model(self): if self.out_path != None: with open(self.filename + ".yaml", 'w+') as yaml_file: yaml_file.write(self.model.to_yaml()) self.model.save_weights('{}/{}.h5'.format(self.out_path, self.ENV_NAME)) def compile_agent(self): # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and # even the metrics! processor = DistopiaProcessor(self.num_blocks, self.num_actions) memory = SequentialMemory(limit=50000, window_length=1) #policy = PatchedBoltzmannQPolicy(num_actions = self.num_actions, num_blocks = self.num_blocks) #test_policy = PatchedGreedyQPolicy(num_actions = self.num_actions, num_blocks = self.num_blocks) policy = RandomPolicy() test_policy = GreedyQPolicy() self.dqn = DQNAgent(model=self.model, processor=processor, nb_actions=self.nb_actions, memory=memory, nb_steps_warmup=1000, enable_double_dqn=True, target_model_update=1e-2, policy=policy, test_policy=test_policy, gamma=0.9) self.dqn.compile(Adam(lr=1e-3), metrics=['mae']) def train(self, max_steps=100, episodes=100): # Okay, now it's time to learn something! We visualize the training here for show, but this # slows down training quite a lot. You can always safely abort the training prematurely using # Ctrl + C. self.env._max_steps = max_steps #for i in range(episodes): self.env.current_step = 0 n_steps = max_steps * episodes logger = FileLogger( filepath='{}/{}.json'.format(self.out_path, self.ENV_NAME)) self.dqn.fit(self.env, nb_steps=n_steps, nb_max_episode_steps=max_steps, visualize=False, verbose=1, callbacks=[logger]) #self.env.reset() # After episode is done, we save the final weights. 
self.dqn.save_weights('{}/{}.h5'.format(self.out_path, self.ENV_NAME), overwrite=True) def test(self): # Finally, evaluate our algorithm for 5 episodes. self.dqn.test(self.env, nb_episodes=5, nb_max_start_steps=0, visualize=True)
print('connector shape', connector.shape) ## Environment parameters observation_shape = market.observation_space.shape nb_actions = market.action_space.n print('state =', observation_shape, '| actions =', nb_actions) ## Init ML-model for agent model = simple_model(observation_shape, nb_actions) ## Init RL-metod parameters memory = SequentialMemory(limit=10000, window_length=1) policy = BoltzmannQPolicy() ## Init RL agent agent = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=1000, target_model_update=1e-2, policy=policy, # enable_dueling_network=True, dueling_type='avg' ) agent.compile(Adam(lr=1e-3), metrics=['mae']) ## Train and evaluation # agent.load_weights('dqn_{}_weights.h5f'.format(ENV_NAME)) agent.fit(market, nb_steps=100000, visualize=False, verbose=2) agent.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True) agent.test(market, nb_episodes=5, visualize=False)
target_model_update=10000, train_interval=4, delta_clip=1.) #Prioritized Memories typically use lower learning rates dqn.compile(Adam(lr=.00025 / 4), metrics=['mae']) folder_path = '../model_saves/PDD/' if args.mode == 'train': weights_filename = folder_path + 'pdd_dqn_{}_weights.h5f'.format( args.env_name) checkpoint_weights_filename = folder_path + 'pdd_dqn_' + args.env_name + '_weights_{step}.h5f' log_filename = folder_path + 'pdd_dqn_' + args.env_name + '_REWARD_DATA.txt' callbacks = [ ModelIntervalCheckpoint(checkpoint_weights_filename, interval=500000) ] callbacks += [TrainEpisodeLogger(log_filename)] dqn.fit(env, callbacks=callbacks, nb_steps=10000000, verbose=0, nb_max_episode_steps=20000) elif args.mode == 'test': weights_filename = folder_path + 'pdd_dqn_MsPacmanDeterministic-v4_weights_10000000.h5f' if args.weights: weights_filename = args.weights dqn.load_weights(weights_filename) dqn.test(env, nb_episodes=10, visualize=True, nb_max_start_steps=80)
# policy = BoltzmannQPolicy()
# The dueling head must be requested at construction time; setting the attribute after
# the agent is built has no effect on the network that was already wired up.
dqn = DQNAgent(model=model, nb_actions=num_actions, memory=memory, nb_steps_warmup=window * 3,
               policy=policy, enable_dueling_network=True)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

if model_exist:
    dqn.load_weights(model_path)
    dqn.policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=.5, value_min=.1,
                                      value_test=.05, nb_steps=5000)

env.set_data_interval(train_start, train_end)
train_history = dqn.fit(env, nb_steps=5000, visualize=False, verbose=2, action_repetition=5)

env.set_data_interval(train_start, test_end)
print('Whole')
train_history = dqn.test(env, nb_episodes=2)
dqn.save_weights(model_path, overwrite=True)
env.save_action_plot('action_validate.csv')
plt.axvline(x=train_end - train_start)
plt.show()
model.add(Dense(16)) model.add(Activation('relu')) model.add(Dense(16)) model.add(Activation('relu')) model.add(Dense(nb_actions)) model.add(Activation('linear')) print(model.summary()) # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and # even the metrics! memory = SequentialMemory(limit=5000, window_length=1) policy = BoltzmannQPolicy() dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, target_model_update=1e-2, policy=policy) dqn.compile(Adam(lr=1e-3), metrics=['mae']) # Okay, now it's time to learn something! We visualize the training here for show, but this # slows down training quite a lot. You can always safely abort the training prematurely using # Ctrl + C. dqn.fit(env, nb_steps=2500, visualize=True, verbose=2) # After training is done, we save the final weights. dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True) # Finally, evaluate our algorithm for 5 episodes. dqn.test(Monitor(env, '.'), nb_episodes=5, visualize=True)
# Okay, now it's time to learn something! #We visualize the training here for show, but this # slows down training quite a lot. #You can always safely abort the training prematurely using Ctrl + C. history_0 = dqn.fit(env, nb_steps=175000, visualize=False, verbose=2, nb_max_episode_steps=10000) # After training is done, we save the final weights. dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True) # Finally, evaluate our algorithm for 5 episodes. history_1 = dqn.test(env, nb_episodes=10, visualize=False) scipy.io.savemat('history_0.mat', history_0.history, appendmat=True, format='5', long_field_names=False, do_compression=False, oned_as='row') scipy.io.savemat('history_1.mat', history_1.history, appendmat=True, format='5', long_field_names=False, do_compression=False, oned_as='row')
class DQNKeras(AbstractAgent): def __init__(self, env, callbacks=None, timesteps_per_episode=60, batch_size=32): super().__init__(env, timesteps_per_episode) self.action_size = env.action_space.n self.state_size = env.num_states self.callbacks = callbacks np.random.seed(123) if hasattr(env, '_seed'): env._seed(123) # Build networks self.model = self._build_compile_model() memory = SequentialMemory(limit=50000, window_length=1) policy = EpsGreedyQPolicy() self.dqn_only_embedding = DQNAgent(model=self.model, nb_actions=self.action_size, memory=memory, nb_steps_warmup=500, target_model_update=1e-2, policy=policy) def _build_compile_model(self): model = Sequential() model.add(Embedding(self.state_size, 10, input_length=1)) # 600000 model.add(Reshape((10, ))) # model.add(Flatten()) model.add(Dense(50, activation='relu')) model.add(Dense(50, activation='relu')) model.add(Dense(50, activation='relu')) model.add(Dense(self.action_size, activation='linear')) # print(model.summary()) return model def run(self) -> {str: float}: """ The agent's training method. Returns: a dictionary - {"episode_reward_mean": __, "episode_reward_min": __, "episode_reward_max": __, "episode_len_mean": __} """ self.dqn_only_embedding.compile(Adam(lr=1e-3), metrics=['mae']) history = self.dqn_only_embedding.fit(self.env, nb_steps=ITER_NUM, visualize=False, verbose=1, nb_max_episode_steps=ITER_NUM, log_interval=10000) # history = self.dqn_only_embedding.fit(self.env, nb_steps=2000, visualize=False, verbose=1, # nb_max_episode_steps=60, log_interval=2000) result = { EPISODE_REWARD_MEAN: np.array(history.history["episode_reward"]), EPISODE_STEP_NUM_MEAN: np.array(history.history["nb_episode_steps"]), EPISODE_REWARD_MIN: np.empty([]), EPISODE_REWARD_MAX: np.empty([]), EPISODE_VARIANCE: np.empty([]) } return result def compute_action(self, state) -> int: """ Computes the best action from a given state. Returns: a int that represents the best action. """ # self.epsilon *= self.epsilon_decay # self.epsilon = max(self.epsilon_min, self.epsilon) # if np.random.random() < self.epsilon: # return self.env.action_space.sample() state = np.array([[state]]) return int(np.argmax(self.model.predict(state))) def stop_episode(self): pass def episode_callback(self, state, action, reward, next_state, terminated): pass def evaluate(self, visualize=True): self.dqn_only_embedding.test(self.env, nb_episodes=1, visualize=visualize, nb_max_episode_steps=70) def load_existing_agent(self, dir_path): self.model.load_weights(dir_path) self.dqn_only_embedding.compile(Adam(lr=1e-3), metrics=['mae']) return self
# class TestCallback(Callback): # def on_epoch_end(self, epoch, logs=None): # test_env = gym.make(args.env_name) # test_env.setMapSize(MAP_X,MAP_Y) # dqn.test(test_env, nb_episodes=1, visualize=True, nb_max_start_steps=100) # test_env.win1.destroy() # test_env.close() # del(test_env) # callbacks += [TestCallback()] # if args.loadmodel: # dqn.model.load(args.loadmodel) if args.weights: dqn.load_weights(args.weights) dqn.fit(env, callbacks=callbacks, nb_steps=1750000, log_interval=10000) # After training is done, we save the final weights one more time. dqn.save_weights(weights_filename, overwrite=True) # dqn.save_model(model_filename) # Finally, evaluate our algorithm for 10 episodes. dqn.test(env, nb_episodes=10, visualize=True) # gtk.main() elif args.mode == 'test': weights_filename = 'dqn_{}_weights.h5f'.format(args.env_name) if args.weights: weights_filename = args.weights dqn.load_weights(weights_filename) dqn.test(env, nb_episodes=10, visualize=True, nb_max_start_steps=00)
class Agent(object): name = 'DQN' def __init__( self, step_size=1, window_size=20, train=True, max_position=5, weights=True, fitting_file='ETH-USD_2018-12-31.xz', testing_file='ETH-USD_2018-01-01.xz', format_3d=False, # add 3rd dimension for CNNs env='market-maker-v0', seed=1, action_repeats=4, number_of_training_steps=1e5, visualize=False): self.env_name = env self.env = gym.make(self.env_name, training=train, fitting_file=fitting_file, testing_file=testing_file, step_size=step_size, max_position=max_position, window_size=window_size, seed=seed, action_repeats=action_repeats, format_3d=format_3d) # Number of frames to stack e.g., 1; Keras-RL uses its own stacker self.memory_frame_stack = 1 self.model = self.create_model() self.memory = SequentialMemory(limit=10000, window_length=self.memory_frame_stack) self.train = train self.number_of_training_steps = number_of_training_steps self.weights = weights self.cwd = os.path.dirname(os.path.realpath(__file__)) self.visualize = visualize # create the agent self.agent = DQNAgent(model=self.model, nb_actions=self.env.action_space.n, memory=self.memory, processor=None, nb_steps_warmup=500, enable_dueling_network=True, dueling_type='avg', enable_double_dqn=True, gamma=0.999, target_model_update=1000, delta_clip=1.0) self.agent.compile(RMSprop(lr=0.00048), metrics=['mae']) def __str__(self): # msg = '\n' # return msg.join(['{}={}'.format(k, v) for k, v in self.__dict__.items()]) return 'Agent = {} | env = {} | number_of_training_steps = {}'.format( Agent.name, self.env_name, self.number_of_training_steps) def create_model(self): features_shape = (self.memory_frame_stack, *self.env.observation_space.shape) model = Sequential() conv = Conv2D model.add( conv(input_shape=features_shape, filters=16, kernel_size=8, padding='same', activation='relu', strides=4, data_format='channels_first')) model.add( conv(filters=32, kernel_size=4, padding='same', activation='relu', strides=2, data_format='channels_first')) model.add( conv(filters=32, kernel_size=2, padding='same', activation='relu', strides=1, data_format='channels_first')) model.add(Flatten()) model.add(Dense(256)) model.add(Activation('linear')) model.add(Dense(self.env.action_space.n)) model.add(Activation('softmax')) print(model.summary()) return model def start(self): weights_filename = '{}/dqn_weights/dqn_{}_weights.h5f'.format( self.cwd, self.env_name) if self.weights: self.agent.load_weights(weights_filename) print('...loading weights for {}'.format(self.env_name)) if self.train: checkpoint_weights_filename = 'dqn_' + self.env_name + \ '_weights_{step}.h5f' checkpoint_weights_filename = '{}/dqn_weights/'.format(self.cwd) + \ checkpoint_weights_filename log_filename = '{}/dqn_weights/dqn_{}_log.json'.format( self.cwd, self.env_name) print('FileLogger: {}'.format(log_filename)) callbacks = [ ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000) ] callbacks += [FileLogger(log_filename, interval=100)] print('Starting training...') self.agent.fit(self.env, callbacks=callbacks, nb_steps=self.number_of_training_steps, log_interval=10000, verbose=0, visualize=self.visualize) print('Saving AGENT weights...') self.agent.save_weights(weights_filename, overwrite=True) else: print('Starting TEST...') self.agent.test(self.env, nb_episodes=2, visualize=self.visualize)
class DeepQTrading:
    # Class constructor
    # model: Keras model considered
    # explorations: a vector containing the exploration rate (probability of random actions)
    #     plus how many epochs will be run by the algorithm (we run the algorithm several times - several iterations)
    # trainSize: size of the training set
    # validationSize: size of the validation set
    # testSize: size of the testing set
    # outputFile: name of the file to print results
    # begin: initial date
    # end: final date
    # nbActions: number of decisions (0-Hold, 1-Long, 2-Short)
    # nOutput is the number of walks. Tonio put 20, but in reality there are 5 walks.
    # operationCost: price for the transaction
    # telegramToken: token used for the bot that will send messages
    # telegramChatID: ID of the message receiver in Telegram
    # ensemble.py runs the ensemble
    def __init__(self, model, explorations, trainSize, validationSize, testSize, outputFile,
                 begin, end, nbActions, nOutput=1, operationCost=0,
                 telegramToken="", telegramChatID=""):

        # If the telegram token and the chat ID of the receiver are provided, set up the bot
        # (printing an error if it cannot be created); otherwise disable Telegram output
        if telegramToken != "" and telegramChatID != "":
            self.chatID = telegramChatID
            self.telegramOutput = True
            try:
                self.bot = telegram.Bot(token=telegramToken)
            except:
                print("Error with Telegram Bot")
        else:
            # no bot configured; disable Telegram output
            self.telegramOutput = False

        # Define the policy, explorations, actions and model as received by parameters
        self.policy = EpsGreedyQPolicy()
        self.explorations = explorations
        self.nbActions = nbActions
        self.model = model

        # Define the memory
        self.memory = SequentialMemory(limit=10000, window_length=1)

        # Instantiate the agent with the parameters received
        self.agent = DQNAgent(model=self.model, policy=self.policy, nb_actions=self.nbActions,
                              memory=self.memory, nb_steps_warmup=200, target_model_update=1e-1,
                              enable_double_dqn=True, enable_dueling_network=True)

        # Compile the agent with the Adam optimizer and the mean absolute error metric
        self.agent.compile(Adam(lr=1e-3), metrics=['mae'])

        # Save the (random, untrained) weights of the agent in the q.weights file
        self.agent.save_weights("q.weights", overwrite=True)

        # Define the current starting point as the initial date
        self.currentStartingPoint = begin

        # Define the training, validation and testing sizes as informed by the call
        # Train: five years; Validation: 6 months; Test: 6 months
        self.trainSize = trainSize
        self.validationSize = validationSize
        self.testSize = testSize
        # The walk size is simply the sum of the train, validation and test sizes
        self.walkSize = trainSize + validationSize + testSize
        # Define the ending point as the final date (January 1st of 2010)
        self.endingPoint = end

        # Read the hourly dataset (we join data from different files)
        self.dates = pd.read_csv('./dataset/' + MK + 'Hour.csv')

        # Read the hourly dataset again to build the datetime index
        self.sp = pd.read_csv('./dataset/' + MK + 'Hour.csv')
        # Convert the pandas columns to a datetime format
        self.sp['Datetime'] = pd.to_datetime(self.sp['Date'] + ' ' + self.sp['Time'])
        # Set Datetime as the index of the loaded dataset; records will be addressed through this value
        self.sp = self.sp.set_index('Datetime')
        # Drop Time and Date from the dataset
        self.sp = self.sp.drop(['Time', 'Date'], axis=1)
        # Only the index matters, because date and time are used to define the train,
        # validation and test sets for each walk
        self.sp = self.sp.index

        # Operation cost is the cost for long and short transactions; it is defined as zero
        self.operationCost = operationCost

        # Callbacks for training, validation and test, used to show the results for each episode
        self.trainer = ValidationCallback()
        self.validator = ValidationCallback()
        self.tester = ValidationCallback()

        # Initiate the output files and write the header fields
        self.outputFile = []
        for i in range(0, nOutput):
            self.outputFile.append(open(outputFile + str(i + 1) + ".csv", "w+"))
            self.outputFile[i].write(
                "Iteration," +
                "trainAccuracy," +
                "trainCoverage," +
                "trainReward," +
                "trainLong%," +
                "trainShort%," +
                "trainLongAcc," +
                "trainShortAcc," +
                "trainLongPrec," +
                "trainShortPrec," +
                "validationAccuracy," +
                "validationCoverage," +
                "validationReward," +
                "validationLong%," +
                "validationShort%," +
                "validationLongAcc," +
                "validationShortAcc," +
                "validLongPrec," +
                "validShortPrec," +
                "testAccuracy," +
                "testCoverage," +
                "testReward," +
                "testLong%," +
                "testShort%," +
                "testLongAcc," +
                "testShortAcc," +
                "testLongPrec," +
                "testShortPrec\n")

    def run(self):
        # Initiate the training
        trainEnv = validEnv = testEnv = " "
        iteration = -1

        # While we have not passed through all the dates (i.e., while not all walks are finished).
        # The walk size is train + validation + test size; currentStartingPoint begins at the initial date
        while self.currentStartingPoint + self.walkSize <= self.endingPoint:

            # Each iteration is one walk
            iteration += 1

            # Send the current walk to the receiver
            if self.telegramOutput:
                self.bot.send_message(chat_id=self.chatID, text="Walk " + str(iteration + 1) + " started.")

            # Empty the memory and agent
            del self.memory
            del self.agent

            # Define the memory and agent
            # Memory is sequential
            self.memory = SequentialMemory(limit=10000, window_length=1)

            # Agent is initiated as passed through parameters
            self.agent = DQNAgent(model=self.model, policy=self.policy, nb_actions=self.nbActions,
                                  memory=self.memory, nb_steps_warmup=200, target_model_update=1e-1,
                                  enable_double_dqn=True, enable_dueling_network=True)

            # Compile the agent with the Adam optimizer
            self.agent.compile(Adam(lr=1e-3), metrics=['mae'])

            # Load the random weights saved before, so every walk starts from the same initialization
            self.agent.load_weights("q.weights")

            ######################################## TRAINING STAGE ########################################
            # trainMinLimit is the position of the initial date.
            # If the initial date cannot be found, add 1 hour and try again
            trainMinLimit = None
            while trainMinLimit is None:
                try:
                    trainMinLimit = self.sp.get_loc(self.currentStartingPoint)
                except:
                    self.currentStartingPoint += datetime.timedelta(0, 0, 0, 0, 0, 1, 0)  # add one hour

            # trainMaxLimit is the position of the initial date plus the training size.
            # If that date cannot be found, add 1 hour and try again
            trainMaxLimit = None
            while trainMaxLimit is None:
                try:
                    trainMaxLimit = self.sp.get_loc(self.currentStartingPoint + self.trainSize)
                except:
                    self.currentStartingPoint += datetime.timedelta(0, 0, 0, 0, 0, 1, 0)  # add one hour

            ######################################## VALIDATION STAGE ########################################
            # validMinLimit starts right after trainMaxLimit
            validMinLimit = trainMaxLimit + 1

            # validMaxLimit is the position of begin + train size + validation size.
            # If that date cannot be found, add 1 hour and try again
            validMaxLimit = None
            while validMaxLimit is None:
                try:
                    validMaxLimit = self.sp.get_loc(self.currentStartingPoint + self.trainSize + self.validationSize)
                except:
                    self.currentStartingPoint += datetime.timedelta(0, 0, 0, 0, 0, 1, 0)  # add one hour

            ######################################## TESTING STAGE ########################################
            # testMinLimit starts right after validMaxLimit
            testMinLimit = validMaxLimit + 1

            # testMaxLimit is the position of begin + train size + validation size + test size.
            # If that date cannot be found, add 1 hour and try again
            testMaxLimit = None
            while testMaxLimit is None:
                try:
                    testMaxLimit = self.sp.get_loc(self.currentStartingPoint + self.trainSize + self.validationSize + self.testSize)
                except:
                    self.currentStartingPoint += datetime.timedelta(0, 0, 0, 0, 0, 1, 0)  # add one hour

            # Separate the validation and testing data according to the limits found above and
            # prepare the validation and test frames for saving later
            # (select the Date column by label and keep only unique dates)
            ensambleValid = pd.DataFrame(
                index=self.dates[validMinLimit:validMaxLimit].loc[:, 'Date'].drop_duplicates().tolist())
            ensambleTest = pd.DataFrame(
                index=self.dates[testMinLimit:testMaxLimit].loc[:, 'Date'].drop_duplicates().tolist())

            # Name the index for validation and testing
            ensambleValid.index.name = 'Date'
            ensambleTest.index.name = 'Date'

            # Explorations are epochs
            for eps in self.explorations:

                # eps[0] is the exploration rate, e.g. 0.2, so random actions happen with 20% probability
                self.policy.eps = eps[0]

                # There will be eps[1] iterations (e.g. 100)
                for i in range(0, eps[1]):

                    # Define the training, validation and testing environments with their respective callbacks
                    del trainEnv
                    trainEnv = SpEnv(operationCost=self.operationCost, minLimit=trainMinLimit,
                                     maxLimit=trainMaxLimit, callback=self.trainer)
                    del validEnv
                    validEnv = SpEnv(operationCost=self.operationCost, minLimit=validMinLimit,
                                     maxLimit=validMaxLimit, callback=self.validator,
                                     ensamble=ensambleValid, columnName="iteration" + str(i))
                    del testEnv
                    testEnv = SpEnv(operationCost=self.operationCost, minLimit=testMinLimit,
                                    maxLimit=testMaxLimit, callback=self.tester,
                                    ensamble=ensambleTest, columnName="iteration" + str(i))

                    # Reset the callbacks
                    self.trainer.reset()
                    self.validator.reset()
                    self.tester.reset()

                    # Reset the training environment
                    trainEnv.resetEnv()

                    # Train the agent
                    self.agent.fit(trainEnv, nb_steps=floor(self.trainSize.days - self.trainSize.days * 0.2),
                                   visualize=False, verbose=0)

                    # Get the info from the train callback
                    (_, trainCoverage, trainAccuracy, trainReward, trainLongPerc, trainShortPerc,
                     trainLongAcc, trainShortAcc, trainLongPrec, trainShortPrec) = self.trainer.getInfo()

                    # Print callback values on the screen
                    print(str(i) + " TRAIN: acc: " + str(trainAccuracy) + " cov: " + str(trainCoverage) + " rew: " + str(trainReward))

                    # Reset the validation environment
                    validEnv.resetEnv()

                    # Test the agent on validation data
                    self.agent.test(validEnv, nb_episodes=floor(self.validationSize.days - self.validationSize.days * 0.2),
                                    visualize=False, verbose=0)

                    # Get the info from the validation callback
                    (_, validCoverage, validAccuracy, validReward, validLongPerc, validShortPerc,
                     validLongAcc, validShortAcc, validLongPrec, validShortPrec) = self.validator.getInfo()

                    # Print callback values on the screen
                    print(str(i) + " VALID: acc: " + str(validAccuracy) + " cov: " + str(validCoverage) + " rew: " + str(validReward))

                    # Reset the testing environment
                    testEnv.resetEnv()

                    # Test the agent on testing data
                    self.agent.test(testEnv, nb_episodes=floor(self.validationSize.days - self.validationSize.days * 0.2),
                                    visualize=False, verbose=0)

                    # Get the info from the testing callback
                    (_, testCoverage, testAccuracy, testReward, testLongPerc, testShortPerc,
                     testLongAcc, testShortAcc, testLongPrec, testShortPrec) = self.tester.getInfo()

                    # Print callback values on the screen
                    print(str(i) + " TEST: acc: " + str(testAccuracy) + " cov: " + str(testCoverage) + " rew: " + str(testReward))
                    print(" ")

                    # Write the walk data to the output file
                    self.outputFile[iteration].write(
                        str(i) + "," +
                        str(trainAccuracy) + "," +
                        str(trainCoverage) + "," +
                        str(trainReward) + "," +
                        str(trainLongPerc) + "," +
                        str(trainShortPerc) + "," +
                        str(trainLongAcc) + "," +
                        str(trainShortAcc) + "," +
                        str(trainLongPrec) + "," +
                        str(trainShortPrec) + "," +
                        str(validAccuracy) + "," +
                        str(validCoverage) + "," +
                        str(validReward) + "," +
                        str(validLongPerc) + "," +
                        str(validShortPerc) + "," +
                        str(validLongAcc) + "," +
                        str(validShortAcc) + "," +
                        str(validLongPrec) + "," +
                        str(validShortPrec) + "," +
                        str(testAccuracy) + "," +
                        str(testCoverage) + "," +
                        str(testReward) + "," +
                        str(testLongPerc) + "," +
                        str(testShortPerc) + "," +
                        str(testLongAcc) + "," +
                        str(testShortAcc) + "," +
                        str(testLongPrec) + "," +
                        str(testShortPrec) + "\n")

            # Close the file for this walk
            self.outputFile[iteration].close()

            # For the next walk, the current starting point will be the current starting point plus the test size.
            # This means that, for the next walk, the training data starts 6 months after the training data
            # of the previous walk
            self.currentStartingPoint += self.testSize

            # Write the validation and testing data into files for later processing with the ensemble
            ensambleValid.to_csv("./Output/ensemble/walk" + str(iteration) + "ensemble_valid.csv")
            ensambleTest.to_csv("./Output/ensemble/walk" + str(iteration) + "ensemble_test.csv")

    # End the agent: close the files where the results were written
    def end(self):
        for outputFile in self.outputFile:
            outputFile.close()
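# Hypothetical driver for the walk-forward procedure above. The Keras model, date ranges and
# exploration schedule are illustrative assumptions (the input width of 68 features is a
# placeholder), not values taken from the original experiments.
import datetime
from keras.models import Sequential
from keras.layers import Dense, Flatten

model = Sequential()
model.add(Flatten(input_shape=(1, 68)))   # placeholder feature count
model.add(Dense(35, activation='relu'))
model.add(Dense(3, activation='linear'))  # 0-Hold, 1-Long, 2-Short

dqt = DeepQTrading(model=model,
                   explorations=[(0.2, 100)],  # 20% random actions, 100 iterations per walk
                   trainSize=datetime.timedelta(days=5 * 365),
                   validationSize=datetime.timedelta(days=180),
                   testSize=datetime.timedelta(days=180),
                   outputFile="./Output/csv/walk",
                   begin=datetime.datetime(2004, 1, 1),
                   end=datetime.datetime(2010, 1, 1),
                   nbActions=3,
                   nOutput=5)
dqt.run()
dqt.end()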
policy = LinearAnnealedPolicy(policy, attr='eps', value_max=eps, value_min=0,
                              value_test=0, nb_steps=nb_steps)
test_policy = GreedyQPolicy()

####################################################################
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
               nb_steps_warmup=window_length + batch_size,
               target_model_update=0.02, policy=policy, test_policy=test_policy,
               batch_size=batch_size, train_interval=train_interval, gamma=gamma)
dqn.compile(Adam(lr=0.00025), metrics=['mae'])
dqn.load_weights('dqn_{}_weights.h5f'.format('lunar'))
####################################################################

nb_episodes = 10
history = dqn.test(env, nb_episodes=nb_episodes)
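# The object returned by dqn.test is a Keras History; assuming keras-rl logs 'episode_reward'
# per episode (as it does during fit), the evaluation above can be summarized like this:
import numpy as np

rewards = history.history['episode_reward']
print('episodes: {}'.format(len(rewards)))
print('mean reward: {:.2f} +/- {:.2f}'.format(np.mean(rewards), np.std(rewards)))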
def build_train_test(args, timesteps):
    # Get the environment and extract the number of actions.
    env = gym.make(args.env_name)
    np.random.seed(123)
    env.seed(123)
    nb_actions = env.action_space.n

    # Next, we build our model. We use the same model that was described by Mnih et al. (2015).
    input_shape = (WINDOW_LENGTH,) + INPUT_SHAPE
    model = Sequential()
    if K.image_dim_ordering() == 'tf':
        # (width, height, channels)
        model.add(Permute((2, 3, 1), input_shape=input_shape))
    elif K.image_dim_ordering() == 'th':
        # (channels, width, height)
        model.add(Permute((1, 2, 3), input_shape=input_shape))
    else:
        raise RuntimeError('Unknown image_dim_ordering.')
    model.add(Convolution2D(32, (8, 8), strides=(4, 4)))
    model.add(Activation('relu'))
    model.add(Convolution2D(64, (4, 4), strides=(2, 2)))
    model.add(Activation('relu'))
    model.add(Convolution2D(64, (3, 3), strides=(1, 1)))
    model.add(Activation('relu'))
    model.add(Flatten())
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))
    print(model.summary())

    # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
    # even the metrics!
    memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
    processor = AtariProcessor()

    # Select a policy. We use eps-greedy action selection, which means that a random action is selected
    # with probability eps. We anneal eps from 1.0 to 0.1 over the course of 1M steps. This is done so that
    # the agent initially explores the environment (high eps) and then gradually sticks to what it knows
    # (low eps). We also set a dedicated eps value that is used during testing. Note that we set it to 0.05
    # so that the agent still performs some random actions. This ensures that the agent cannot get stuck.
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1,
                                  value_test=.05, nb_steps=1000000)

    # The trade-off between exploration and exploitation is difficult and an on-going research topic.
    # If you want, you can experiment with the parameters or use a different policy. Another popular one
    # is Boltzmann-style exploration:
    # policy = BoltzmannQPolicy(tau=1.)
    # Feel free to give it a try!

    dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory,
                   processor=processor, nb_steps_warmup=50000, gamma=.99,
                   target_model_update=10000, train_interval=4, delta_clip=1.)
    dqn.compile(Adam(lr=.00025), metrics=['mae'])

    if args.mode == 'train':
        # Okay, now it's time to learn something! We capture the interrupt exception so that training
        # can be prematurely aborted. Notice that now you can use the built-in Keras callbacks!
        weights_filename = 'dqn_{}_weights.h5f'.format(args.env_name)
        checkpoint_weights_filename = 'dqn_' + args.env_name + '_weights_{step}.h5f'
        log_filename = 'dqn_{}_log.json'.format(args.env_name)
        callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000)]
        callbacks += [FileLogger(log_filename, interval=100)]
        dqn.fit(env, callbacks=callbacks, nb_steps=timesteps, log_interval=10000)

        # After training is done, we save the final weights one more time.
        dqn.save_weights(weights_filename, overwrite=True)

        # Finally, evaluate our algorithm for 10 episodes.
        result = dqn.test(env, nb_episodes=10, visualize=False)
    elif args.mode == 'test':
        weights_filename = 'dqn_{}_weights.h5f'.format(args.env_name)
        if args.weights:
            weights_filename = args.weights
        dqn.load_weights(weights_filename)
        result = dqn.test(env, nb_episodes=10, visualize=False)

    return result
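# A possible command-line entry point around build_train_test. The argument names mirror the
# attributes the function reads (env_name, mode, weights); the default environment and the
# 1.75M-step budget are assumptions, not taken from the original script.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=['train', 'test'], default='train')
    parser.add_argument('--env-name', dest='env_name', type=str, default='BreakoutDeterministic-v4')
    parser.add_argument('--weights', type=str, default=None)
    args = parser.parse_args()

    # Trains for the given number of steps (or only evaluates, if --mode test).
    result = build_train_test(args, timesteps=1750000)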
# is Boltzmann-style exploration:
# policy = BoltzmannQPolicy(tau=1.)
# Feel free to give it a try!

# Note: this snippet targets an older keras-rl API in which window_length, delta_range and
# reward_range were accepted directly as DQNAgent arguments.
dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, window_length=WINDOW_LENGTH,
               memory=memory, processor=processor, nb_steps_warmup=50000, gamma=.99,
               delta_range=(-1., 1.), reward_range=(-1., 1.), target_model_update=10000,
               train_interval=4)
dqn.compile(Adam(lr=.00025), metrics=['mae'])

if args.mode == 'train':
    # Okay, now it's time to learn something! We capture the interrupt exception so that training
    # can be prematurely aborted. Notice that you can use the built-in Keras callbacks!
    weights_filename = 'dqn_{}_weights.h5f'.format(args.env_name)
    checkpoint_weights_filename = 'dqn_' + args.env_name + '_weights_{step}.h5f'
    log_filename = 'dqn_{}_log.json'.format(args.env_name)
    callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000)]
    callbacks += [FileLogger(log_filename, interval=100)]
    dqn.fit(env, callbacks=callbacks, nb_steps=1750000, log_interval=10000)

    # After training is done, we save the final weights one more time.
    dqn.save_weights(weights_filename, overwrite=True)

    # Finally, evaluate our algorithm for 10 episodes.
    dqn.test(env, nb_episodes=10, visualize=False)
elif args.mode == 'test':
    weights_filename = 'dqn_{}_weights.h5f'.format(args.env_name)
    if args.weights:
        weights_filename = args.weights
    dqn.load_weights(weights_filename)
    dqn.test(env, nb_episodes=10, visualize=True)
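# For reference, a rough equivalent of the agent above under the later keras-rl API, where the
# window length is configured on the memory and delta_range/reward_range give way to delta_clip
# (reward clipping is then typically handled in the processor). Carrying over the exact
# hyperparameters is an assumption.
memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory,
               processor=processor, nb_steps_warmup=50000, gamma=.99,
               target_model_update=10000, train_interval=4, delta_clip=1.)
dqn.compile(Adam(lr=.00025), metrics=['mae'])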
                   nb_actions=nb_actions, memory=memory, nb_steps_warmup=100,
                   enable_dueling_network=True, dueling_type='avg',
                   target_model_update=1e-2, policy=train_policy, test_policy=test_policy)
    filename = 'weights/duel_dqn_{}_weights_{}_{}_{}.h5f'.format(
        ENV_NAME, LAYER_SIZE, NUM_STEPS, TRIAL_ID)
else:
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=100,
                   target_model_update=1e-2, policy=train_policy, test_policy=test_policy)
    filename = 'weights/dqn_{}_weights_{}_{}_{}.h5f'.format(
        ENV_NAME, LAYER_SIZE, NUM_STEPS, TRIAL_ID)

dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Load the model weights
dqn.load_weights(FILENAME)

# Finally, evaluate our algorithm for 1 episode.
dqn.test(env, nb_episodes=1, visualize=True, nb_max_episode_steps=500)
# model/layer imports (missing in the original snippet)
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam
import gym

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

env = gym.make('MountainCar-v0')
nb_actions = env.action_space.n

model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))

memory = SequentialMemory(limit=30000, window_length=1)
policy = EpsGreedyQPolicy(eps=0.001)
dqn = DQNAgent(model=model, nb_actions=nb_actions, gamma=0.99, memory=memory,
               nb_steps_warmup=10, target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

history = dqn.fit(env, nb_steps=30000, visualize=False, verbose=2)
dqn.test(env, nb_episodes=1, visualize=True)
dqn.fit(env, nb_steps=50000, visualize=False, verbose=2, nb_max_episode_steps=300)

import rl.callbacks

class EpisodeLogger(rl.callbacks.Callback):
    def __init__(self):
        self.observations = {}
        self.rewards = {}
        self.actions = {}

    def on_episode_begin(self, episode, logs):
        self.observations[episode] = []
        self.rewards[episode] = []
        self.actions[episode] = []

    def on_step_end(self, step, logs):
        episode = logs['episode']
        self.observations[episode].append(logs['observation'])
        self.rewards[episode].append(logs['reward'])
        self.actions[episode].append(logs['action'])

cb_ep = EpisodeLogger()
dqn.test(env, nb_episodes=10, visualize=False, callbacks=[cb_ep])

import matplotlib.pyplot as plt

# Plot the first observation dimension (the position) over each test episode
for obs in cb_ep.observations.values():
    plt.plot([o[0] for o in obs])
plt.xlabel("step")
plt.ylabel("pos")
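# The same logger also records the actions taken; assuming a discrete action space, a quick
# histogram shows how often each action was chosen across the test episodes.
import numpy as np

all_actions = np.concatenate([np.asarray(a) for a in cb_ep.actions.values()])
plt.figure()
plt.hist(all_actions, bins=np.arange(all_actions.max() + 2) - 0.5)
plt.xlabel("action")
plt.ylabel("count")
plt.show()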