def main():
    model = Sequential()
    model.add(Flatten(input_shape=(1, 7)))
    model.add(Dense(units=20, activation='relu'))
    model.add(Dense(units=20, activation='relu'))
    model.add(Dense(units=6, activation='linear'))
    logger.info(model.summary())

    steps = int(1e9)
    interval = steps // 100
    # policy = MyPolicy()
    policy = BoltzmannQPolicy()
    agent = SARSAAgent(model=model, nb_actions=6, policy=policy,
                       train_interval=10, nb_steps_warmup=10)
    adam = Adam()
    sgd = SGD(lr=1e-3, momentum=0, decay=0, nesterov=False)
    agent.compile(optimizer=adam, metrics=['mse'])

    env = MyEnv()
    agent.fit(env, steps, verbose=2, visualize=True)

    fp = Path(__file__).resolve().parent / 'sarsa_weights.h5f'
    agent.save_weights(fp, overwrite=True)
    logger.info('Done')
def run_sarsa():
    global N_NODE_NETWORK
    env = SnakeGymDiscrete()
    nb_actions = env.action_space.n

    # Initialize randomness.
    np.random.seed(123)
    env.seed(123)

    # Create the model.
    model = Sequential()
    model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
    model.add(Dense(N_NODE_NETWORK))
    model.add(Activation('relu'))
    model.add(Dense(N_NODE_NETWORK))
    model.add(Activation('relu'))
    model.add(Dense(N_NODE_NETWORK))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))

    # SARSA does not require a memory.
    policy = BoltzmannQPolicy()
    sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10,
                       policy=policy)
    sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

    sarsa.fit(env, nb_steps=50000, visualize=False, verbose=2)
    sarsa.save_weights('sarsa_SnakeGymDiscrete_weights.h5f', overwrite=True)
    sarsa.test(env, nb_episodes=5, visualize=True)
def main():
    # binance = DataReader()
    env = BinanceEnv()
    # binance.get_recent_trades()
    # env.next_observation()
    # binance_market = BinanceMarket()
    # binance_market.long()
    # time.sleep(3)
    # binance_market.close_long()
    # time.sleep(3)
    # binance_market.short()
    # time.sleep(3)
    # binance_market.close_short()
    # binance_market.update_positions()
    # print(binance_market.balance)

    # episodes = 10
    # for episode in range(1, episodes + 1):
    #     # At each beginning reset the game
    #     state = env.reset()
    #     # set done to False
    #     done = False
    #     # set score to 0
    #     score = 0
    #     # while the game is not finished
    #     while not done:
    #         # visualize each step
    #         env.render()
    #         # choose a random action
    #         action = random.randint(0, 5)
    #         # execute the action
    #         n_state, reward, done, info = env.step(action)
    #         # keep track of rewards
    #         score += reward
    #     print('episode {} score {}'.format(episode, score))

    model = agent(env.observation_space.shape[0], env.action_space.n)
    policy = EpsGreedyQPolicy()
    sarsa = SARSAAgent(model=model, policy=policy, nb_actions=env.action_space.n)
    sarsa.compile('adam', metrics=['mse', 'accuracy'])
    # sarsa.load_weights('sarsa_weights_bnb_07.h5f')

    env.is_testing = False
    sarsa.fit(env, nb_steps=100000, visualize=False, verbose=1)
    sarsa.save_weights('sarsa_weights_bnb_07_1.h5f', overwrite=True)
    # sarsa.load_weights('sarsa_weights_bnb_07_1.h5f')
    # env.simulator = False

    env.is_testing = True
    scores = sarsa.test(env, nb_episodes=1, visualize=False)
    print('Average score over {} test games: {}'.format(
        len(scores.history['episode_reward']),
        np.mean(scores.history['episode_reward'])))
    _ = sarsa.test(env, nb_episodes=10, visualize=True)

    # Greedy rollout with the trained network: pick the argmax Q-value action.
    # This assumes the model expects a (1, 1, obs_dim) window, as built by agent().
    obs = env.reset()
    for i in range(2000):
        q_values = model.predict(np.expand_dims(np.expand_dims(obs, 0), 0))[0]
        action = int(np.argmax(q_values))
        obs, rewards, done, info = env.step(action)
        env.render()
def run_sarsa_agent(driver, queries, candidate_indices, tuning_config):
    # Get the environment and extract the number of actions.
    env = gym.make("udo_optimization-v0", driver=driver, queries=queries,
                   candidate_indices=candidate_indices, config=tuning_config)
    env.horizon = tuning_config['horizon']
    nb_actions = env.action_space.n
    logging.info(f"nr action: {nb_actions}")
    logging.info(f"observation space: {env.observation_space.shape}")

    # Next, we build a very simple model.
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    model.add(Dense(52))
    model.add(Activation('relu'))
    model.add(Dense(252))
    model.add(Activation('relu'))
    model.add(Dense(526))
    model.add(Activation('relu'))
    model.add(Dense(252))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))
    logging.info(model.summary())

    # SARSA does not require a memory.
    policy = BoltzmannQPolicy()
    # policy.select_action()
    sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10,
                       policy=policy)
    sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

    # Okay, now it's time to learn something! We visualize the training here for show, but this
    # slows down training quite a lot. You can always safely abort the training prematurely using
    # Ctrl + C.
    sarsa.fit(env, nb_steps=500, visualize=False, verbose=2)

    # After training is done, we save the final weights.
    # sarsa.save_weights('sarsa_{}_weights.h5f'.format('udo_optimization-v0'), overwrite=True)

    # Finally, evaluate our algorithm for 5 episodes.
    sarsa.test(env, nb_episodes=5, visualize=False)
    env.print_state_summary(env.best_state)
def test_sarsa():
    env = TwoRoundDeterministicRewardEnv()
    np.random.seed(123)
    env.seed(123)
    random.seed(123)
    nb_actions = env.action_space.n

    # Next, we build a very simple model.
    model = Sequential()
    model.add(Dense(16, input_shape=(1,)))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions, activation='linear'))

    policy = EpsGreedyQPolicy(eps=.1)
    sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=50,
                       policy=policy)
    sarsa.compile(Adam(lr=1e-3))
    sarsa.fit(env, nb_steps=20000, visualize=False, verbose=0)

    policy.eps = 0.
    h = sarsa.test(env, nb_episodes=20, visualize=False)
    assert_allclose(np.mean(h.history['episode_reward']), 3.)
class DistopiaSARSA:
    def __init__(self, env_name='distopia-initial4-v0', in_path=None, out_path=None,
                 terminate_on_fail=False, reconstruct=False):
        self.ENV_NAME = env_name
        self.filename = self.ENV_NAME
        self.init_paths(in_path, out_path)
        self.init_env(terminate_on_fail)
        self.init_model(reconstruct)
        self.compile_agent()

    def init_paths(self, in_path, out_path):
        self.in_path = in_path  # if self.in_path != None else './'
        self.out_path = out_path if out_path is not None else './'
        self.log_path = "./logs/{}".format(time.time())
        os.mkdir(self.log_path)

    def init_env(self, terminate_on_fail):
        self.env = gym.make(self.ENV_NAME)
        self.env.terminate_on_fail = terminate_on_fail
        self.env.record_path = "{}/ep_".format(self.log_path)
        self.env = gym.wrappers.Monitor(self.env, "recording", force=True)
        np.random.seed(234)
        self.env.seed(234)
        self.nb_actions = np.sum(self.env.action_space.nvec)
        self.num_actions = self.env.NUM_DIRECTIONS
        self.num_blocks = self.env.NUM_DISTRICTS * self.env.BLOCKS_PER_DISTRICT

    def init_model(self, reconstruct=False):
        if self.in_path is not None:
            if reconstruct:
                self.construct_model()
            else:
                yaml_file = open("{}/{}.yaml".format(self.in_path, self.filename), 'r')
                model_yaml = yaml_file.read()
                yaml_file.close()
                self.model = model_from_yaml(model_yaml)
                self.model.load_weights("{}/{}.h5".format(self.in_path, self.filename))
        else:
            # Next, we build a very simple model.
            self.construct_model()
        self.save_model()
        print(self.model.summary())

    def construct_model(self):
        self.model = Sequential()
        self.model.add(Flatten(input_shape=(1, ) + self.env.observation_space.shape))
        self.model.add(Dense(64))
        self.model.add(Activation('relu'))
        self.model.add(Dense(64))
        self.model.add(Activation('relu'))
        # self.model.add(Dense(16))
        # self.model.add(Activation('relu'))
        self.model.add(Dense(self.nb_actions))
        self.model.add(Activation('linear'))

    def save_model(self):
        if self.out_path is not None:
            with open(self.filename + ".yaml", 'w+') as yaml_file:
                yaml_file.write(self.model.to_yaml())
            self.model.save_weights('{}/{}.h5'.format(self.out_path, self.ENV_NAME))

    def compile_agent(self):
        # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
        # even the metrics!
        processor = DistopiaProcessor(self.num_blocks, self.num_actions)
        # memory = SequentialMemory(limit=50000, window_length=1)
        # policy = PatchedBoltzmannQPolicy(num_actions=self.num_actions, num_blocks=self.num_blocks)
        # test_policy = PatchedGreedyQPolicy(num_actions=self.num_actions, num_blocks=self.num_blocks)
        policy = BoltzmannQPolicy()
        test_policy = GreedyQPolicy()
        self.sarsa = SARSAAgent(model=self.model, processor=processor,
                                nb_actions=self.nb_actions, nb_steps_warmup=1000,
                                policy=policy, test_policy=test_policy, gamma=0.9)
        self.sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

    def train(self, max_steps=100, episodes=100):
        # Okay, now it's time to learn something! We visualize the training here for show, but this
        # slows down training quite a lot. You can always safely abort the training prematurely using
        # Ctrl + C.
        self.env._max_steps = max_steps
        # for i in range(episodes):
        self.env.current_step = 0
        n_steps = max_steps * episodes
        logger = FileLogger(filepath='{}/{}.json'.format(self.out_path, self.ENV_NAME))
        self.sarsa.fit(self.env, nb_steps=n_steps, nb_max_episode_steps=max_steps,
                       visualize=False, verbose=1, callbacks=[logger])
        # self.env.reset()
        # After the episode is done, we save the final weights.
        self.sarsa.save_weights('{}/{}.h5'.format(self.out_path, self.ENV_NAME),
                                overwrite=True)

    def test(self):
        # Finally, evaluate our algorithm for 5 episodes.
        self.sarsa.test(self.env, nb_episodes=5, nb_max_start_steps=0, visualize=True)
# Here we can change the number of neural network layers
model = Sequential()
model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# Configuration and compilation of the agent
policy = BoltzmannQPolicy()
sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

# Agent training
sarsa.fit(env, nb_steps=50000, visualize=False, verbose=2, callbacks=[WandbLogger()])

# Saving of the final weights
sarsa.save_weights('sarsa_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Testing the algorithm for 5 episodes
sarsa.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=200)
policy = BoltzmannQPolicy()
# policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1,
#                               value_test=.05, nb_steps=10000)

if args.use_sarsa:
    # SARSA does not require a memory.
    agent = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
else:
    memory = SequentialMemory(limit=50000, window_length=1)
    agent = DQNAgent(model=model, memory=memory, nb_actions=nb_actions,
                     nb_steps_warmup=50, policy=policy)
agent.compile(Adam(lr=args.learning_rate), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
agent.fit(env, nb_steps=args.n_steps, visualize=False, verbose=2)

# After training is done, we save the final weights.
# sarsa.save_weights('sarsa_osc_weights.h5f', overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
# sarsa.test(env, nb_episodes=5, visualize=True)
def train():
    # Get the environment and extract the number of actions.
    env = gym.make(ENV_NAME)
    np.random.seed(123)
    env.seed(123)
    nb_actions = env.action_space.n

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    # Next, we build a very simple model.
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))
    print(model.summary())

    # SARSA does not require a memory.
    policy = BoltzmannQPolicy()

    # processor_noisy = CartpoleSurrogateProcessor(e_=ERR_N, e=ERR_P, surrogate=False)
    # processor_surrogate = CartpoleSurrogateProcessor(e_=ERR_N, e=ERR_P, surrogate=True)
    if not SMOOTH:
        processor_noisy = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=False, surrogate=False)
        processor_surrogate = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=False, surrogate=True)
    else:
        processor_noisy = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=True, surrogate=False)
        processor_surrogate = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=True, surrogate=True)

    if REWARD == "normal":
        sarsa_normal = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10,
                                  policy=policy)
        sarsa_normal.compile(Adam(lr=1e-3), metrics=['mae'])
        history_normal = sarsa_normal.fit(env, nb_steps=50000, visualize=False, verbose=2)
        sarsa_normal.save_weights(
            os.path.join(LOG_DIR, 'sarsa_normal_{}_weights.h5f'.format(ENV_NAME)),
            overwrite=True)
        sarsa_normal.test(env, nb_episodes=10, visualize=False, verbose=2)
        pandas.DataFrame(history_normal.history).to_csv(os.path.join(LOG_DIR, "normal.csv"))
    elif REWARD == "noisy":
        sarsa_noisy = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10,
                                 policy=policy, processor=processor_noisy)
        sarsa_noisy.compile(Adam(lr=1e-3), metrics=['mae'])
        history_noisy = sarsa_noisy.fit(env, nb_steps=50000, visualize=False, verbose=2)
        if not SMOOTH:
            sarsa_noisy.save_weights(
                os.path.join(LOG_DIR, 'sarsa_noisy_{}_weights.h5f'.format(ENV_NAME)),
                overwrite=True)
            pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy.csv"))
        else:
            sarsa_noisy.save_weights(
                os.path.join(LOG_DIR, 'sarsa_noisy_smooth_{}_weights.h5f'.format(ENV_NAME)),
                overwrite=True)
            pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy_smooth.csv"))
        sarsa_noisy.test(env, nb_episodes=10, visualize=False)
    elif REWARD == "surrogate":
        sarsa_surrogate = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10,
                                     policy=policy, processor=processor_surrogate)
        sarsa_surrogate.compile(Adam(lr=1e-3), metrics=['mae'])
        history_surrogate = sarsa_surrogate.fit(env, nb_steps=50000, visualize=False, verbose=2)
        if not SMOOTH:
            sarsa_surrogate.save_weights(
                os.path.join(LOG_DIR, 'sarsa_surrogate_{}_weights.h5f'.format(ENV_NAME)),
                overwrite=True)
            pandas.DataFrame(history_surrogate.history).to_csv(os.path.join(LOG_DIR, "surrogate.csv"))
        else:
            sarsa_surrogate.save_weights(
                os.path.join(LOG_DIR, 'sarsa_surrogate_smooth_{}_weights.h5f'.format(ENV_NAME)),
                overwrite=True)
            pandas.DataFrame(history_surrogate.history).to_csv(os.path.join(LOG_DIR, "surrogate_smooth.csv"))
        sarsa_surrogate.test(env, nb_episodes=10, visualize=False)
y = Dense(24)(y)
y = Activation('relu')(y)
y = Dense(24)(y)
y = Activation('relu')(y)
y = Dense(nb_actions)(y)
y = Activation('linear')(y)
model = Model(x, y)

policy = EpsGreedyQPolicy()
sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10000,
                   policy=policy, gamma=.85)
sarsa.compile(Adam(lr=.3, decay=.001), metrics=['mae'])

rewards = []
hist = sarsa.fit(env, nb_steps=100000, visualize=False, verbose=2)
rewards.extend(hist.history.get('episode_reward'))
plt.plot(rewards)

sarsa.test(env, nb_episodes=5, visualize=True)

state = env.reset()
action = env.action_space.sample()
print(action)
for i in range(300):
    # action = np.argmax(sarsa.model.predict(np.expand_dims(np.expand_dims(state, 0), 0))[0])
    state, reward, done, _ = env.step(action)
    env.render()
env.render(close=True)
model_folder = './models/' + app_name + '/'
model_file = model_folder + app_name + '.h5'

try:
    # Load the model if it already exists.
    print('Loading existing model...')
    model = load_model(model_file)
    print('Model loaded.')
except OSError:
    # Build it from scratch if it doesn't.
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    model.add(Dense(64, activation='relu', use_bias=True, name='dense1'))
    model.add(Dense(64, activation='relu', use_bias=True, name='dense2'))
    model.add(Dense(64, activation='relu', use_bias=True, name='dense3'))
    model.add(Dense(nb_actions, activation='linear', name='readout'))
print(model.summary())

policy = BoltzmannQPolicy()
sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

if fit:
    sarsa.fit(env, nb_steps=50000, visualize=visualize_fit, verbose=2)
    pathlib.Path(model_folder).mkdir(parents=True, exist_ok=True)
    model.save(model_file)

if test:
    sarsa.test(env, nb_episodes=5, visualize=visualize_test)
model = Sequential()
model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# SARSA does not require a memory.
policy = BoltzmannQPolicy()
sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
sarsa.fit(env, nb_steps=50000, visualize=False, verbose=2)

# After training is done, we save the final weights.
sarsa.save_weights('sarsa_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 10 episodes.
sarsa.test(env, nb_episodes=10, visualize=True)
class DQN:
    def __init__(self,
                 env="CartPole-v1",
                 emulateOculus=True,
                 visualize=True,
                 teachingFilesPath=None,
                 policyValues={
                     "inner_policy": EpsGreedyQPolicy(),
                     "attr": "eps",
                     "value_max": 0.75,
                     "value_min": .01,
                     "value_test": .0,
                     "nb_steps": 50000
                 },
                 dobotEmulation=False):
        self.policyValues = policyValues
        os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
        physical_devices = tf.config.experimental.list_physical_devices('GPU')
        print("physical_devices-------------", len(physical_devices))
        tf.config.experimental.set_memory_growth(physical_devices[0], True)
        self.episodeLength = 25

        if env == "CartPole-v1":
            self.env = gym.make('CartPole-v1')
            self.states = self.env.observation_space.shape[0]
            self.actions = self.env.action_space.n
            self.saveFileName = 'sarsa_weights.h5f'
            logdir = "logs/CartPoleV1/" + datetime.now().strftime("%Y%m%d-%H%M%S")
            self.tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)
            self.visualize = True
        elif env == "Dobot":
            self.env = dobotGym.dobotGym(emulateOculus=emulateOculus,
                                         episodeLength=self.episodeLength,
                                         visualize=visualize,
                                         teachingFilesPath=teachingFilesPath,
                                         dobotEmulation=dobotEmulation)
            self.states = self.env.observation_space.shape[0]
            self.actions = self.env.action_space.shape[0]
            self.saveFileName = 'sarsa_weights_dobot.h5f'
            logdir = "logs/Dobot/" + datetime.now().strftime("%Y%m%d-%H%M%S")
            self.tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)
            self.visualize = True
        else:
            raise TypeError("Wrong env")

        # To get an idea about the number of variables affecting the environment.
        print('States', self.states)
        # To get an idea about the number of possible actions, e.g. [right, left].
        print('Actions', self.actions)

        # episodes = 10
        # for episode in range(1, episodes + 1):
        #     # At each beginning reset the game
        #     state = self.env.reset()
        #     # set done to False
        #     done = False
        #     # set score to 0
        #     score = 0
        #     # while the game is not finished
        #     while not done:
        #         # visualize each step
        #         self.env.render()
        #         # choose a random action
        #         action = random.choice([0, 1])
        #         # execute the action
        #         n_state, reward, done, info = self.env.step(action)
        #         # keep track of rewards
        #         score += reward
        #     print('episode {} score {}'.format(episode, score))

        # not working :(
        # self.agent = self.agentDDP(self.states, self.actions)
        # self.agent = self.NAFAgent(self.states, self.actions)
        # self.policy = EpsGreedyQPolicy()

        self.savingFreq = 100
        self.actualSaving = 0
        self.model = self.agentSarsa(self.states, self.actions)
        self.policy = LinearAnnealedPolicy(
            inner_policy=self.policyValues["inner_policy"],
            attr=self.policyValues["attr"],
            value_max=self.policyValues["value_max"],
            value_min=self.policyValues["value_min"],
            value_test=self.policyValues["value_test"],
            nb_steps=self.policyValues["nb_steps"])
        self.agent = SARSAAgent(model=self.model, policy=self.policy,
                                nb_actions=self.actions)
        self.agent._is_graph_network = True

        def t():
            return False

        self.agent._in_multi_worker_mode = t
        self.agent.save = self.saveAgentWeights

        def lenmeh():
            return self.actions

        # self.agent.__len__ = lenmeh

    def saveAgentWeights(self, path, overwrite=True):
        if self.actualSaving < self.savingFreq:
            self.actualSaving += 1
            return None
        else:
            self.actualSaving = 0
            path = ('model/checkpoint/' + datetime.now().strftime("%Y%m%d-%H%M%S")
                    + self.saveFileName)
            self.agent.save_weights(path, overwrite)

    def agentSarsa(self, states, actions):
        self.model = Sequential()
        self.model.add(LSTM(42, activation='sigmoid', input_shape=(1, states)))
        self.model.add(Dense(42, activation='sigmoid'))
        self.model.add(Dense(42, activation='sigmoid'))
        self.model.add(Dense(24, activation='sigmoid'))
        self.model.add(Dense(12, activation='sigmoid'))
        self.model.add(Dense(actions, activation='linear'))
        self.path = fileOperation.saveToFolder(self.model.to_json(),
                                               name='modelShape',
                                               folder="model\\checkpoint")
        # With stateful=False, states are reset together after each batch.
        # model.add(Flatten(input_shape=(1, states)))
        # dot_img_file = '/model_1.png'
        # keras.utils.plot_model(self.model, to_file=dot_img_file, show_shapes=True)
        # model.reset_states()
        return self.model

    def load(self):
        path = fileOperation.openDialogFunction(".h5f")
        self.agent.compile('adam', metrics=['mse'])
        self.agent.load_weights(path)
        self.agent.compile('adam', metrics=['mse'])

    def test(self, nb_episodes=2):
        _ = self.agent.test(self.env, nb_episodes=nb_episodes, visualize=self.visualize)

    def fit(self, visualize=False):
        checkpoint_filepath = 'model/checkpoint/'
        model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath=checkpoint_filepath,
            save_weights_only=False,
            save_freq=25)
        self.agent.compile('adam', metrics=['mse'])
        self.agent.fit(
            self.env,
            nb_steps=self.policyValues["nb_steps"],
            log_interval=self.episodeLength,
            visualize=visualize,
            verbose=1,
            nb_max_start_steps=1,
            start_step_policy=self.model.reset_states,
            # callbacks=[PlotLossesKeras()])
            callbacks=[self.tensorboard_callback, model_checkpoint_callback],
        )
        scores = self.agent.test(self.env, nb_episodes=5, visualize=visualize)
        print('Average score over 5 test games: {}'.format(
            np.mean(scores.history['episode_reward'])))
class KerasSarsaAgent(AbstractAgent):
    def __init__(self, env, timesteps_per_episode=10001):
        super().__init__(env, timesteps_per_episode)
        self.num_episodes = 400
        self.evaluating = False
        self.action_size = env.action_space.n
        self.state_size = env.num_states
        self.model = self._build_compile_model()
        self.agent = SARSAAgent(model=self.model, nb_actions=self.action_size,
                                policy=EpsGreedyQPolicy())

    def run(self) -> {str: float}:
        """
        The agent's training method.
        Returns: a dictionary - {"episode_reward_mean": __, "episode_reward_min": __,
                                 "episode_reward_max": __, "episode_len_mean": __}
        """
        self.agent.compile(Adam(lr=0.001), metrics=["mse"])
        history = self.agent.fit(self.env, nb_steps=ITER_NUM, visualize=False, verbose=1)
        if len(history.history) > 0:
            episode_reward = history.history["episode_reward"]
            nb_episode_steps = history.history["nb_episode_steps"]
        else:
            episode_reward, nb_episode_steps = [0], [0]  # TODO - placeholder
        result = {
            EPISODE_REWARD_MEAN: np.array(episode_reward),
            EPISODE_STEP_NUM_MEAN: np.array(nb_episode_steps),
            EPISODE_REWARD_MIN: np.empty([]),
            EPISODE_REWARD_MAX: np.empty([]),
            EPISODE_VARIANCE: np.empty([])
        }
        return result

    def _build_compile_model(self):
        model = Sequential()
        # model.add(Flatten(input_shape=(1, self.action_size)))
        model.add(Embedding(self.state_size, 10, input_length=1))  # 600000
        model.add(Reshape((10, )))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(16, activation='relu'))
        model.add(Dense(16, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        return model

    def compute_action(self, state) -> int:
        """
        Computes the best action from a given state.
        Returns: an int that represents the best action.
        """
        state = np.array([[state]])
        return int(np.argmax(self.model.predict(state)))

    def stop_episode(self):
        pass

    def episode_callback(self, state, action, reward, next_state, terminated):
        pass

    def evaluate(self, visualize=False):
        self.agent.test(self.env, nb_episodes=5, visualize=visualize,
                        nb_max_episode_steps=60)

    def replay_experiences(self):
        pass
# TensorBoard callback for logging training metrics.
tb = TensorBoard(log_dir='./logs/log_{}'.format(filename),
                 histogram_freq=0,
                 batch_size=100,
                 write_graph=True,
                 write_grads=False,
                 write_images=False,
                 embeddings_freq=0,
                 embeddings_layer_names=None,
                 embeddings_metadata=None,
                 embeddings_data=None,
                 update_freq='batch')

hist = sarsa.fit(env, nb_steps=100000, visualize=False, verbose=2,
                 nb_max_episode_steps=500, callbacks=[tb])  # 20s episodes

# Print history.
print("history contents : ", hist.history.keys())
# episode_reward, nb_episode_steps, nb_steps

# Summarize history for accuracy.
import matplotlib.pyplot as plt
plt.plot(hist.history['episode_reward'])
plt.plot(hist.history['nb_episode_steps'])
plt.title('learning')
plt.xlabel('episode')
plt.legend(['episode_reward', 'nb_episode_steps'], loc='upper left')
plt.show()
#%%
from rl.agents import SARSAAgent
from rl.policy import EpsGreedyQPolicy

policy = EpsGreedyQPolicy()
sarsa = SARSAAgent(
    model=model,
    policy=policy,
    nb_actions=env.action_space.n  # from env!
)
sarsa.compile('adam', metrics=['mse'])  # just model.compile(...)
sarsa.fit(env, nb_steps=50000, visualize=False, verbose=1)

#%%
scores = sarsa.test(env, nb_episodes=100, visualize=False)
mean_score = np.mean(scores.history['episode_reward'])
print('Average score over 100 test games: {}'.format(mean_score))

#%%
sarsa.save_weights('sarsa_weights.h5f', overwrite=True)

#%%
sarsa.load_weights('sarsa_weights.h5f')

#%%
# how the trained agent works
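#%%
# The cell above ends at "how the trained agent works" without a body. A minimal
# sketch of a manual greedy rollout is given here, assuming (as in the other
# snippets in this collection) that `env` follows the classic Gym step API and
# that the Q-network was built with Flatten(input_shape=(1,) + env.observation_space.shape),
# i.e. it expects a batch of single-observation windows of shape (1, 1, obs_dim).
obs = env.reset()
done = False
total_reward = 0.0
while not done:
    # Greedy action: argmax over the Q-values predicted by the trained network.
    q_values = sarsa.model.predict(np.expand_dims(np.expand_dims(obs, 0), 0))[0]
    action = int(np.argmax(q_values))
    obs, reward, done, info = env.step(action)
    total_reward += reward
    env.render()
print('Greedy rollout reward: {}'.format(total_reward))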
nb_actions = env.action_space.n

# Next, we build a very simple model.
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# SARSA does not require a memory.
policy = BoltzmannQPolicy()
sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
sarsa.fit(env, nb_steps=50000, visualize=False, verbose=2)

# After training is done, we save the final weights.
sarsa.save_weights('sarsa_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
sarsa.test(env, nb_episodes=5, visualize=True)
nb_actions=nb_actions, nb_steps_warmup=1000, policy=policy)
sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

# compiling the model
sarsa.compile(Adam(lr=lrn_rate), metrics=['mae'])

# setting up callbacks for result collection and realtime visualization of the results through tensorboard
tensorboard = TensorBoard(log_dir="logs/{}".format(time()))
tpl = TrainEpisodeLogger()

# finally perform the training ----- visualize=False enables training without visualizing
# the game, which speeds up the training process
sarsa.fit(env, nb_steps=nb_steps, visualize=False, verbose=2,
          callbacks=[tensorboard, tpl], nb_max_episode_steps=nb_max_episode_steps)

# save the model weights
sarsa.save_weights('sarsa_%d_%s_weights.h5f' % (scale, ENV_NAME), overwrite=True)

# save the training results
metrics = []


def dict_to_list(dc):
    re = []
    for key in dc:
        re.append(dc[key])
model = Sequential()
model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# SARSA does not require a memory.
policy = BoltzmannQPolicy()
sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
sarsa.fit(env, nb_steps=args.EPISODES, visualize=False, verbose=2)

# After training is done, we save the final weights.
sarsa.save_weights('sarsa_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
sarsa.test(env, nb_episodes=5, visualize=True)
kernel_initializer=weight_initializer)(hiddenLayer)
outputLayer = Dense(nb_actions, activation='linear')(hiddenLayer)
model = Model(inputLayer, outputLayer)
print(model.summary())

# SARSA does not require a memory.
policy = BoltzmannQPolicy()
sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

if loadFromExisting:
    sarsa.load_weights(file_path)
else:
    startTime = time.time()
    sarsa.fit(env, nb_steps=nSteps, visualize=True, verbose=1)
    endTime = time.time()
    # After training is done, we save the final weights.
    sarsa.save_weights(file_path, overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
sarsa.test(env, nb_episodes=5, visualize=True)

if not loadFromExisting:
    print("Time taken to train: {0}".format(endTime - startTime))
nb_steps_warmup=10, policy=policy, test_policy=policy)
sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

# Load weights
try:
    # dqn.load_weights(weights_filename)
    sarsa.load_weights(weights_filename)
except OSError:
    print("no saved weights found")

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
# dqn.fit(env, nb_steps=5000000, visualize=False, verbose=2)
sarsa.fit(env, nb_steps=50000, visualize=False, verbose=1, callbacks=[WandbCallback()])

# After training is done, we save the final weights.
# dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
# dqn.test(env, nb_episodes=5, visualize=True)
sarsa.test(env, nb_episodes=5, visualize=True)

# Save weights
# dqn.save_weights(weights_filename, overwrite=True)
sarsa.save_weights(weights_filename, overwrite=True)