def create(env):
    np.random.seed(config.current.domain_seed)
    env.seed(config.current.domain_seed)
    nb_actions = env.action_space.n

    # Next, we build a very simple model.
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    model.add(Dense(config.current.agent_vfn_complexity))
    model.add(Activation('relu'))
    model.add(Dense(config.current.agent_vfn_complexity))
    model.add(Activation('relu'))
    model.add(Dense(config.current.agent_vfn_complexity))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))

    global graph
    graph = tf.get_default_graph()

    # SARSA does not require a memory.
    policy = BoltzmannQPolicy()
    sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
    sarsa.compile(Adam(lr=1e-3), metrics=['mae'])
    return sarsa
def init_sarsa(env, nb_actions, lr=1e-3):
    """
    Initialize the SARSA agent using the keras-rl package.

    :param env: the environment to be played, required to determine the input size
    :param nb_actions: number of actions
    :param lr: learning rate
    :return: SARSA agent
    """
    # Next, we build a very simple model.
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dense(256))
    model.add(Activation('relu'))
    model.add(Dense(64))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))

    # SARSA does not require a memory.
    policy = BoltzmannQPolicy()
    sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
    sarsa.model_name = "SARSA"
    sarsa.compile(Adam(lr=lr), metrics=['mae'])
    return sarsa
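# A minimal usage sketch for init_sarsa above. It assumes a discrete-action gym
# environment (CartPole-v1 here is only illustrative) and the usual keras-rl imports;
# the step and episode counts are arbitrary and not taken from the original code.
import gym

env = gym.make('CartPole-v1')
sarsa = init_sarsa(env, nb_actions=env.action_space.n, lr=1e-3)
sarsa.fit(env, nb_steps=50000, visualize=False, verbose=2)
sarsa.test(env, nb_episodes=5, visualize=False)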
def main():
    model = Sequential()
    model.add(Flatten(input_shape=(1, 7)))
    model.add(Dense(units=20, activation='relu'))
    model.add(Dense(units=20, activation='relu'))
    model.add(Dense(units=6, activation='linear'))
    logger.info(model.summary())

    steps = int(1e9)
    interval = steps // 100

    # policy = MyPolicy()
    policy = BoltzmannQPolicy()
    agent = SARSAAgent(model=model, nb_actions=6, policy=policy, train_interval=10, nb_steps_warmup=10)
    adam = Adam()
    sgd = SGD(lr=1e-3, momentum=0, decay=0, nesterov=False)
    agent.compile(optimizer=adam, metrics=['mse'])

    env = MyEnv()
    agent.fit(env, steps, verbose=2, visualize=True)

    fp = Path(__file__).resolve().parent / 'sarsa_weights.h5f'
    agent.save_weights(fp, overwrite=True)
    logger.info('Done')
def run_sarsa():
    global N_NODE_NETWORK

    env = SnakeGymDiscrete()
    nb_actions = env.action_space.n

    # initialize randomness
    np.random.seed(123)
    env.seed(123)

    # create model
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    model.add(Dense(N_NODE_NETWORK))
    model.add(Activation('relu'))
    model.add(Dense(N_NODE_NETWORK))
    model.add(Activation('relu'))
    model.add(Dense(N_NODE_NETWORK))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))

    # SARSA does not require a memory.
    policy = BoltzmannQPolicy()
    sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
    sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

    sarsa.fit(env, nb_steps=50000, visualize=False, verbose=2)
    sarsa.save_weights('sarsa_SnakeGymDiscrete_weights.h5f', overwrite=True)
    sarsa.test(env, nb_episodes=5, visualize=True)
def create_sarsa_agent(env):
    env = create_environment()
    model = create_deep_model(env)
    nb_actions = env.action_space.n

    policy = BoltzmannQPolicy()
    sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
    sarsa.compile(Adam(lr=1e-3), metrics=['mae'])
    return sarsa
def main():
    # binance = DataReader()
    env = BinanceEnv()
    # binance.get_recent_trades()
    # env.next_observation()
    # binance_market = BinanceMarket()
    # binance_market.long()
    # time.sleep(3)
    # binance_market.close_long()
    # time.sleep(3)
    # binance_market.short()
    # time.sleep(3)
    # binance_market.close_short()
    # binance_market.update_positions()
    # print(binance_market.balance)

    # episodes = 10
    # for episode in range(1, episodes + 1):
    #     # At each beginning reset the game
    #     state = env.reset()
    #     # set done to False
    #     done = False
    #     # set score to 0
    #     score = 0
    #     # while the game is not finished
    #     while not done:
    #         # visualize each step
    #         env.render()
    #         # choose a random action
    #         action = random.randint(0, 5)
    #         # execute the action
    #         n_state, reward, done, info = env.step(action)
    #         # keep track of rewards
    #         score += reward
    #     print('episode {} score {}'.format(episode, score))

    model = agent(env.observation_space.shape[0], env.action_space.n)
    policy = EpsGreedyQPolicy()
    sarsa = SARSAAgent(model=model, policy=policy, nb_actions=env.action_space.n)
    sarsa.compile('adam', metrics=['mse', 'accuracy'])
    # sarsa.load_weights('sarsa_weights_bnb_07.h5f')

    env.is_testing = False
    sarsa.fit(env, nb_steps=100000, visualize=False, verbose=1)
    sarsa.save_weights('sarsa_weights_bnb_07_1.h5f', overwrite=True)
    # sarsa.load_weights('sarsa_weights_bnb_07_1.h5f')
    # env.simulator = False

    env.is_testing = True
    scores = sarsa.test(env, nb_episodes=1, visualize=False)
    print('Average score over 1 test game: {}'.format(np.mean(scores.history['episode_reward'])))
    _ = sarsa.test(env, nb_episodes=10, visualize=True)

    obs = env.reset()
    for i in range(2000):
        # The Keras model returns Q-values; act greedily with respect to them.
        q_values = model.predict(np.expand_dims(np.expand_dims(obs, 0), 0))[0]
        action = np.argmax(q_values)
        obs, rewards, done, info = env.step(action)
        env.render()
class SarsaAgent(Agent):
    def __init__(self, state_dim, action_space, epsilon, gamma, lr):
        self._model = self._get_model(state_dim, action_space)
        self.agent = SARSAAgent(self._model,
                                nb_actions=action_space,
                                gamma=gamma,
                                policy=EpsGreedyQPolicy(epsilon),
                                test_policy=EpsGreedyQPolicy(eps=0.01))
        self.agent.compile(Adam(lr))

    def model_summary(self):
        print(self._model.summary())
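# A minimal usage sketch for the SarsaAgent wrapper above, assuming _get_model is
# supplied by the Agent base class and that a discrete-action gym environment is
# available; the environment name and hyperparameters below are illustrative only.
import gym

env = gym.make('CartPole-v1')
wrapper = SarsaAgent(state_dim=env.observation_space.shape[0],
                     action_space=env.action_space.n,
                     epsilon=0.1, gamma=0.99, lr=1e-3)
wrapper.model_summary()
wrapper.agent.fit(env, nb_steps=10000, visualize=False, verbose=1)
wrapper.agent.test(env, nb_episodes=5, visualize=False)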
def main():
    # nb_actions = cpst._action_space
    nb_actions = 2

    # Next, we build a very simple model.
    model = Sequential()
    # n_os = cpst._observation_space.shape
    n_os = 4
    model.add(Flatten(input_shape=[1] + [n_os]))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))
    print(model.summary())
    model._make_predict_function()

    # SARSA does not require a memory.
    policy = BoltzmannQPolicy()
    sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
    sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

    cart_pole = CartPole(name='cp')

    log = logging.getLogger('bact2')
    RE = RunEngine({})
    RE.log.setLevel('DEBUG')
    cart_pole.log = RE.log

    stm = [cart_pole.x, cart_pole.x_dot, cart_pole.theta, cart_pole.theta_dot]
    cpst = CartPoleEnv(detectors=[cart_pole], motors=[cart_pole], state_motors=stm,
                       user_kwargs={'mode_var': cart_pole.rl_mode})

    np.random.seed(123)
    cpst.seed(123)

    partial = functools.partial(run_test, sarsa, cpst, log=RE.log)
    RE(run_environement(cpst, partial, log=RE.log))
def main():
    # with ServerProxy("http://127.0.0.1:8000/", verbose=False, allow_none=True) as proxy:
    if True:
        pass

    # D:\Devel\github\keras-rl;D:\Devel\github\Devel\hz-b\naus
    # set PYTHONPATH=D:\Devel\github\keras-rl;D:\Devel\github\Devel\hz-b\naus
    # & python d:\Devel\github\Devel\hz-b\naus\examples\rl\cart_pole\sarsa_cartpole.py

    def stop_my_application():
        print('Stopping application')

    with allow_interrupt():
        # main polling loop.
        env = EnvironmentProxyForClient(receiver=None)
        np.random.seed(1974)
        env.seed(1974)
        env.reset()

        # nb_actions = cpst._action_space
        nb_actions = 2

        # Next, we build a very simple model.
        model = Sequential()
        # n_os = cpst._observation_space.shape
        n_os = 4
        model.add(Flatten(input_shape=[1] + [n_os]))
        model.add(Dense(16))
        model.add(Activation('relu'))
        model.add(Dense(16))
        model.add(Activation('relu'))
        model.add(Dense(16))
        model.add(Activation('relu'))
        model.add(Dense(nb_actions))
        model.add(Activation('linear'))
        print(model.summary())

        # SARSA does not require a memory.
        policy = BoltzmannQPolicy()
        sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
        sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

        run_test(sarsa, env, log=log)
def run_sarsa_agent(driver, queries, candidate_indices, tuning_config):
    # Get the environment and extract the number of actions.
    env = gym.make("udo_optimization-v0", driver=driver, queries=queries,
                   candidate_indices=candidate_indices, config=tuning_config)
    env.horizon = tuning_config['horizon']
    nb_actions = env.action_space.n
    logging.info(f"nr action: {nb_actions}")
    logging.info(f"observation space: {env.observation_space.shape}")

    # Next, we build a very simple model.
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    model.add(Dense(52))
    model.add(Activation('relu'))
    model.add(Dense(252))
    model.add(Activation('relu'))
    model.add(Dense(526))
    model.add(Activation('relu'))
    model.add(Dense(252))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))
    logging.info(model.summary())

    # SARSA does not require a memory.
    policy = BoltzmannQPolicy()
    # policy.select_action()
    sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
    sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

    # Okay, now it's time to learn something! We visualize the training here for show, but this
    # slows down training quite a lot. You can always safely abort the training prematurely using
    # Ctrl + C.
    sarsa.fit(env, nb_steps=500, visualize=False, verbose=2)

    # After training is done, we save the final weights.
    # sarsa.save_weights('sarsa_{}_weights.h5f'.format(udo_optimization-v0), overwrite=True)

    # Finally, evaluate our algorithm for 5 episodes.
    sarsa.test(env, nb_episodes=5, visualize=False)
    env.print_state_summary(env.best_state)
def test_sarsa():
    env = TwoRoundDeterministicRewardEnv()
    np.random.seed(123)
    env.seed(123)
    random.seed(123)
    nb_actions = env.action_space.n

    # Next, we build a very simple model.
    model = Sequential()
    model.add(Dense(16, input_shape=(1,)))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions, activation='linear'))

    policy = EpsGreedyQPolicy(eps=.1)
    sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=50, policy=policy)
    sarsa.compile(Adam(lr=1e-3))

    sarsa.fit(env, nb_steps=20000, visualize=False, verbose=0)
    policy.eps = 0.
    h = sarsa.test(env, nb_episodes=20, visualize=False)
    assert_allclose(np.mean(h.history['episode_reward']), 3.)
def get_agent(agent_type, model_type, lr):
    if agent_type == "sarsa":
        policy = BoltzmannQPolicy()
        model = get_model(model_type)
        agent = SARSAAgent(model=model, policy=policy, nb_actions=nb_actions,
                           nb_steps_warmup=10, gamma=0.99)
        agent.compile(Adam(lr), metrics=['mae'])
        return agent
    elif agent_type == "dqn":
        policy = BoltzmannQPolicy()
        model = get_model(model_type)
        memory = SequentialMemory(limit=50000, window_length=1)
        agent = DQNAgent(model=model, policy=policy, nb_actions=nb_actions, memory=memory,
                         nb_steps_warmup=10, target_model_update=1e-2, enable_double_dqn=True)
        agent.compile(Adam(lr), metrics=['mae'])
        return agent
    elif agent_type == "a2c":
        agent = A2CAgent(nb_actions, len(env.observation_space.high), nb_steps_warmup=10,
                         actor_lr=0.001, critic_lr=0.005)
        agent.compile(Adam(lr))
        return agent
    elif agent_type == "ppo":
        pass
    else:
        print("Unsupported model")
        exit(1)
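# A minimal usage sketch for get_agent above. It assumes env, nb_actions and get_model
# are defined at module level, as the function's free variables imply; the model_type
# string and step count below are placeholders, not values from the original code.
agent = get_agent("sarsa", model_type="mlp", lr=1e-3)
agent.fit(env, nb_steps=50000, visualize=False, verbose=2)
agent.test(env, nb_episodes=5, visualize=False)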
# print(inverse_model.summary())

# predicts future state from current state and action
forward_model = build_forward_model(fmap, nb_actions)
forward_model.compile(Adam(lr=1e-3), loss='mse', metrics=['mse'])
# print(forward_model.summary())

model = build_actor_model((1,) + observation_shape, nb_actions)
# print(model.summary())

policy = BoltzmannQPolicy()
agent = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=3, policy=policy)
agent.compile(Adam(lr=1e-3), metrics=['mae'])
agent.reset_states()

# =========================================================================#
# re-use weights if possible
if os.path.isfile(inv_weights_fname):
    inverse_model.load_weights(inv_weights_fname)
if os.path.isfile(fwd_weights_fname):
    forward_model.load_weights(fwd_weights_fname)
if os.path.isfile(agent_weights_fname):
    agent.load_weights(agent_weights_fname)
# else:
# FIXME: this bit is necessary or agent does nothing???
y = Activation('relu')(y)
y = Dense(24)(y)
y = Activation('relu')(y)
y = Dense(24)(y)
y = Activation('relu')(y)
y = Dense(nb_actions)(y)
y = Activation('linear')(y)
model = Model(x, y)

policy = EpsGreedyQPolicy()
sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10000, policy=policy, gamma=.85)
sarsa.compile(Adam(lr=.3, decay=.001), metrics=['mae'])

rewards = []
hist = sarsa.fit(env, nb_steps=100000, visualize=False, verbose=2)
rewards.extend(hist.history.get('episode_reward'))
plt.plot(rewards)

sarsa.test(env, nb_episodes=5, visualize=True)

state = env.reset()
action = env.action_space.sample()
print(action)
for i in range(300):
    # action = np.argmax(sarsa.model.predict(np.expand_dims(np.expand_dims(state, 0), 0))[0])
    state, reward, done, _ = env.step(action)
    env.render()
class DistopiaSARSA:
    def __init__(self, env_name='distopia-initial4-v0', in_path=None, out_path=None,
                 terminate_on_fail=False, reconstruct=False):
        self.ENV_NAME = env_name
        self.filename = self.ENV_NAME
        self.init_paths(in_path, out_path)
        self.init_env(terminate_on_fail)
        self.init_model(reconstruct)
        self.compile_agent()

    def init_paths(self, in_path, out_path):
        self.in_path = in_path  # if self.in_path != None else './'
        self.out_path = out_path if out_path != None else './'
        self.log_path = "./logs/{}".format(time.time())
        os.mkdir(self.log_path)

    def init_env(self, terminate_on_fail):
        self.env = gym.make(self.ENV_NAME)
        self.env.terminate_on_fail = terminate_on_fail
        self.env.record_path = "{}/ep_".format(self.log_path)
        self.env = gym.wrappers.Monitor(self.env, "recording", force=True)
        np.random.seed(234)
        self.env.seed(234)
        self.nb_actions = np.sum(self.env.action_space.nvec)
        self.num_actions = self.env.NUM_DIRECTIONS
        self.num_blocks = self.env.NUM_DISTRICTS * self.env.BLOCKS_PER_DISTRICT

    def init_model(self, reconstruct=False):
        if self.in_path != None:
            if reconstruct == True:
                self.construct_model()
            else:
                yaml_file = open("{}/{}.yaml".format(self.in_path, self.filename), 'r')
                model_yaml = yaml_file.read()
                yaml_file.close()
                self.model = model_from_yaml(model_yaml)
                self.model.load_weights("{}/{}.h5".format(self.in_path, self.filename))
        else:
            # Next, we build a very simple model.
            self.construct_model()
            self.save_model()
        print(self.model.summary())

    def construct_model(self):
        self.model = Sequential()
        self.model.add(Flatten(input_shape=(1,) + self.env.observation_space.shape))
        self.model.add(Dense(64))
        self.model.add(Activation('relu'))
        self.model.add(Dense(64))
        self.model.add(Activation('relu'))
        # self.model.add(Dense(16))
        # self.model.add(Activation('relu'))
        self.model.add(Dense(self.nb_actions))
        self.model.add(Activation('linear'))

    def save_model(self):
        if self.out_path != None:
            with open(self.filename + ".yaml", 'w+') as yaml_file:
                yaml_file.write(self.model.to_yaml())
            self.model.save_weights('{}/{}.h5'.format(self.out_path, self.ENV_NAME))

    def compile_agent(self):
        # Finally, we configure and compile our agent. You can use every built-in Keras optimizer
        # and even the metrics!
        processor = DistopiaProcessor(self.num_blocks, self.num_actions)
        # memory = SequentialMemory(limit=50000, window_length=1)
        # policy = PatchedBoltzmannQPolicy(num_actions=self.num_actions, num_blocks=self.num_blocks)
        # test_policy = PatchedGreedyQPolicy(num_actions=self.num_actions, num_blocks=self.num_blocks)
        policy = BoltzmannQPolicy()
        test_policy = GreedyQPolicy()
        self.sarsa = SARSAAgent(model=self.model, processor=processor, nb_actions=self.nb_actions,
                                nb_steps_warmup=1000, policy=policy, test_policy=test_policy,
                                gamma=0.9)
        self.sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

    def train(self, max_steps=100, episodes=100):
        # Okay, now it's time to learn something! We visualize the training here for show, but this
        # slows down training quite a lot. You can always safely abort the training prematurely using
        # Ctrl + C.
        self.env._max_steps = max_steps
        # for i in range(episodes):
        self.env.current_step = 0
        n_steps = max_steps * episodes
        logger = FileLogger(filepath='{}/{}.json'.format(self.out_path, self.ENV_NAME))
        self.sarsa.fit(self.env, nb_steps=n_steps, nb_max_episode_steps=max_steps,
                       visualize=False, verbose=1, callbacks=[logger])
        # self.env.reset()
        # After the episode is done, we save the final weights.
        self.sarsa.save_weights('{}/{}.h5'.format(self.out_path, self.ENV_NAME), overwrite=True)

    def test(self):
        # Finally, evaluate our algorithm for 5 episodes.
        self.sarsa.test(self.env, nb_episodes=5, nb_max_start_steps=0, visualize=True)
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# SARSA does not require a memory.
policy = BoltzmannQPolicy()
sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
sarsa.compile(Adam(learning_rate=1e-3), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
sarsa.fit(env, nb_steps=50000, visualize=False, verbose=2)

# After training is done, we save the final weights.
sarsa.save_weights(f'sarsa_{ENV_NAME}_weights.h5f', overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
sarsa.test(env, nb_episodes=5, visualize=True)
def train():
    # Get the environment and extract the number of actions.
    env = gym.make(ENV_NAME)
    np.random.seed(123)
    env.seed(123)
    nb_actions = env.action_space.n

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    # Next, we build a very simple model.
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))
    print(model.summary())

    # SARSA does not require a memory.
    policy = BoltzmannQPolicy()

    # processor_noisy = CartpoleSurrogateProcessor(e_=ERR_N, e=ERR_P, surrogate=False)
    # processor_surrogate = CartpoleSurrogateProcessor(e_=ERR_N, e=ERR_P, surrogate=True)
    if not SMOOTH:
        processor_noisy = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=False, surrogate=False)
        processor_surrogate = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=False, surrogate=True)
    else:
        processor_noisy = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=True, surrogate=False)
        processor_surrogate = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=True, surrogate=True)

    if REWARD == "normal":
        sarsa_normal = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10,
                                  policy=policy)
        sarsa_normal.compile(Adam(lr=1e-3), metrics=['mae'])
        history_normal = sarsa_normal.fit(env, nb_steps=50000, visualize=False, verbose=2)
        sarsa_normal.save_weights(os.path.join(LOG_DIR, 'sarsa_normal_{}_weights.h5f'.format(ENV_NAME)),
                                  overwrite=True)
        sarsa_normal.test(env, nb_episodes=10, visualize=False, verbose=2)
        pandas.DataFrame(history_normal.history).to_csv(os.path.join(LOG_DIR, "normal.csv"))

    elif REWARD == "noisy":
        sarsa_noisy = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10,
                                 policy=policy, processor=processor_noisy)
        sarsa_noisy.compile(Adam(lr=1e-3), metrics=['mae'])
        history_noisy = sarsa_noisy.fit(env, nb_steps=50000, visualize=False, verbose=2)
        if not SMOOTH:
            sarsa_noisy.save_weights(os.path.join(LOG_DIR, 'sarsa_noisy_{}_weights.h5f'.format(ENV_NAME)),
                                     overwrite=True)
            pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy.csv"))
        else:
            sarsa_noisy.save_weights(os.path.join(LOG_DIR, 'sarsa_noisy_smooth_{}_weights.h5f'.format(ENV_NAME)),
                                     overwrite=True)
            pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy_smooth.csv"))
        sarsa_noisy.test(env, nb_episodes=10, visualize=False)

    elif REWARD == "surrogate":
        sarsa_surrogate = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10,
                                     policy=policy, processor=processor_surrogate)
        sarsa_surrogate.compile(Adam(lr=1e-3), metrics=['mae'])
        history_surrogate = sarsa_surrogate.fit(env, nb_steps=50000, visualize=False, verbose=2)
        if not SMOOTH:
            sarsa_surrogate.save_weights(os.path.join(LOG_DIR, 'sarsa_surrogate_{}_weights.h5f'.format(ENV_NAME)),
                                         overwrite=True)
            pandas.DataFrame(history_surrogate.history).to_csv(os.path.join(LOG_DIR, "surrogate.csv"))
        else:
            sarsa_surrogate.save_weights(os.path.join(LOG_DIR, 'sarsa_surrogate_smooth_{}_weights.h5f'.format(ENV_NAME)),
                                         overwrite=True)
            pandas.DataFrame(history_surrogate.history).to_csv(os.path.join(LOG_DIR, "surrogate_smooth.csv"))
        sarsa_surrogate.test(env, nb_episodes=10, visualize=False)
nb_actions = env.action_space.n

# Next, we build a very simple model.
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# SARSA does not require a memory.
policy = BoltzmannQPolicy()
sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
sarsa.fit(env, nb_steps=50000, visualize=False, verbose=2)

# After training is done, we save the final weights.
sarsa.save_weights('sarsa_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
sarsa.test(env, nb_episodes=5, visualize=True)
model.summary()

#%%
from rl.agents import SARSAAgent
from rl.policy import EpsGreedyQPolicy

policy = EpsGreedyQPolicy()
sarsa = SARSAAgent(
    model=model,
    policy=policy,
    nb_actions=env.action_space.n  # from env!
)
sarsa.compile('adam', metrics=['mse'])  # just model.compile(...)
sarsa.fit(env, nb_steps=5e4, visualize=False, verbose=1)

#%%
scores = sarsa.test(env, nb_episodes=100, visualize=False)
mean_score = np.mean(scores.history['episode_reward'])
print('Average score over 100 test games: {}'.format(mean_score))

#%%
sarsa.save_weights('sarsa_weights.h5f', overwrite=True)

#%%
sarsa.load_weights('sarsa_weights.h5f')

#%%
class KerasSarsaAgent(AbstractAgent):
    def __init__(self, env, timesteps_per_episode=10001):
        super().__init__(env, timesteps_per_episode)
        self.num_episodes = 400
        self.evaluating = False
        self.action_size = env.action_space.n
        self.state_size = env.num_states
        self.model = self._build_compile_model()
        self.agent = SARSAAgent(model=self.model, nb_actions=self.action_size,
                                policy=EpsGreedyQPolicy())

    def run(self) -> {str: float}:
        """
        The agent's training method.

        Returns: a dictionary - {"episode_reward_mean": __, "episode_reward_min": __,
                                 "episode_reward_max": __, "episode_len_mean": __}
        """
        self.agent.compile(Adam(lr=0.001), metrics=["mse"])
        history = self.agent.fit(self.env, nb_steps=ITER_NUM, visualize=False, verbose=1)
        if len(history.history) > 0:
            episode_reward = history.history["episode_reward"]
            nb_episode_steps = history.history["nb_episode_steps"]
        else:
            episode_reward, nb_episode_steps = [0], [0]  # TODO - placeholder
        result = {
            EPISODE_REWARD_MEAN: np.array(episode_reward),
            EPISODE_STEP_NUM_MEAN: np.array(nb_episode_steps),
            EPISODE_REWARD_MIN: np.empty([]),
            EPISODE_REWARD_MAX: np.empty([]),
            EPISODE_VARIANCE: np.empty([])
        }
        return result

    def _build_compile_model(self):
        model = Sequential()
        # model.add(Flatten(input_shape=(1, self.action_size)))
        model.add(Embedding(self.state_size, 10, input_length=1))  # 600000
        model.add(Reshape((10,)))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(16, activation='relu'))
        model.add(Dense(16, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        return model

    def compute_action(self, state) -> int:
        """
        Computes the best action from a given state.

        Returns: an int that represents the best action.
        """
        state = np.array([[state]])
        return int(np.argmax(self.model.predict(state)))

    def stop_episode(self):
        pass

    def episode_callback(self, state, action, reward, next_state, terminated):
        pass

    def evaluate(self, visualize=False):
        self.agent.test(self.env, nb_episodes=5, visualize=visualize, nb_max_episode_steps=60)

    def replay_experiences(self):
        pass
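# A minimal usage sketch for KerasSarsaAgent above, assuming an environment that exposes
# both a discrete action_space and a num_states attribute (as the constructor requires);
# the environment instance itself is not part of the original code.
agent = KerasSarsaAgent(env)
results = agent.run()
print(results[EPISODE_REWARD_MEAN].mean())
agent.evaluate(visualize=False)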
    model.add(Flatten(input_shape=(1, states)))
    model.add(Dense(24, activation="relu"))
    model.add(Dense(24, activation="relu"))
    model.add(Dense(24, activation="relu"))
    model.add(Dense(actions, activation="linear"))
    return model


model = agent(env.observation_space.shape[0], env.action_space.n)
policy = EpsGreedyQPolicy()
sarsa = SARSAAgent(model=model, policy=policy, nb_actions=env.action_space.n)
sarsa.compile("adam", metrics=["mse"])
sarsa.fit(env, nb_steps=10000, visualize=False, verbose=1)
ready()

scores = sarsa.test(env, nb_episodes=5, visualize=True)
print('Average score over 5 test games: {}'.format(np.mean(scores.history['episode_reward'])))

# sarsa.save_weights('sarsa_weights.h5f', overwrite=True)  # save trained weights
# sarsa.load_weights('sarsa_weights.h5f')  # can be used to load trained weights
# Make a neural net with 3 hidden layers
def agent(states, actions):
    model = Sequential()
    model.add(Flatten(input_shape=(1, states)))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model


# Actually make a neural net with 3 hidden layers
model = agent(env.observation_space.shape[0], env.action_space.n)
policy = EpsGreedyQPolicy()

# Create a tensorflow reinforcement learning agent using the [state > action > reward] system
sarsa = SARSAAgent(model=model, policy=policy, nb_actions=env.action_space.n)

# Choose how we calculate reward and modify the model
sarsa.compile('adam', metrics=['mse'])

# sarsa.fit(env, nb_steps=50000, visualize=False, verbose=1)
sarsa.load_weights('cartpolekerassarsa.h5f')

scores = sarsa.test(env, nb_episodes=10, visualize=False)
print('Average score over 10 test games: {}'.format(np.mean(scores.history['episode_reward'])))

sarsa.save_weights('cartpolekerassarsa.h5f', overwrite=True)
sarsa.test(env, nb_episodes=2, visualize=True)
policy = BoltzmannQPolicy()
# policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=.05,
#                               nb_steps=10000)

if args.use_sarsa:
    # SARSA does not require a memory.
    agent = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
else:
    memory = SequentialMemory(limit=50000, window_length=1)
    agent = DQNAgent(model=model, memory=memory, nb_actions=nb_actions, nb_steps_warmup=50, policy=policy)

agent.compile(Adam(lr=args.learning_rate), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
agent.fit(env, nb_steps=args.n_steps, visualize=False, verbose=2)

# After training is done, we save the final weights.
# sarsa.save_weights('sarsa_osc_weights.h5f', overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
# sarsa.test(env, nb_episodes=5, visualize=True)
hiddenLayer = Dense(nHiddenLayerNodes, activation='relu',
                    kernel_initializer=weight_initializer)(hiddenLayer)
outputLayer = Dense(nb_actions, activation='linear')(hiddenLayer)
model = Model(inputLayer, outputLayer)
print(model.summary())

# SARSA does not require a memory.
policy = BoltzmannQPolicy()
sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

if loadFromExisting:
    sarsa.load_weights(file_path)
else:
    startTime = time.time()
    sarsa.fit(env, nb_steps=nSteps, visualize=True, verbose=1)
    endTime = time.time()
    sarsa.save_weights(file_path, overwrite=True)

# After training is done, we save the final weights.
# Finally, evaluate our algorithm for 5 episodes.
sarsa.test(env, nb_episodes=5, visualize=True)

if not loadFromExisting:
class DQN:
    def __init__(self,
                 env="CartPole-v1",
                 emulateOculus=True,
                 visualize=True,
                 teachingFilesPath=None,
                 policyValues={
                     "inner_policy": EpsGreedyQPolicy(),
                     "attr": "eps",
                     "value_max": 0.75,
                     "value_min": .01,
                     "value_test": .0,
                     "nb_steps": 50000
                 },
                 dobotEmulation=False):
        self.policyValues = policyValues
        os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'

        physical_devices = tf.config.experimental.list_physical_devices('GPU')
        print("physical_devices-------------", len(physical_devices))
        tf.config.experimental.set_memory_growth(physical_devices[0], True)

        self.episodeLength = 25
        if env == "CartPole-v1":
            self.env = gym.make('CartPole-v1')
            self.states = self.env.observation_space.shape[0]
            self.actions = self.env.action_space.n
            self.saveFileName = 'sarsa_weights.h5f'
            logdir = "logs/CartPoleV1/" + datetime.now().strftime("%Y%m%d-%H%M%S")
            self.tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)
            self.visualize = True
        elif env == "Dobot":
            self.env = dobotGym.dobotGym(emulateOculus=emulateOculus,
                                         episodeLength=self.episodeLength,
                                         visualize=visualize,
                                         teachingFilesPath=teachingFilesPath,
                                         dobotEmulation=dobotEmulation)
            self.states = self.env.observation_space.shape[0]
            self.actions = self.env.action_space.shape[0]
            self.saveFileName = 'sarsa_weights_dobot.h5f'
            logdir = "logs/Dobot/" + datetime.now().strftime("%Y%m%d-%H%M%S")
            self.tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)
            self.visualize = True
        else:
            raise TypeError("Wrong env")

        # To get an idea about the number of variables affecting the environment
        print('States', self.states)
        # To get an idea about the number of possible actions in the environment, e.g. [right, left]
        print('Actions', self.actions)

        # episodes = 10
        # for episode in range(1, episodes + 1):
        #     # At each beginning reset the game
        #     state = self.env.reset()
        #     # set done to False
        #     done = False
        #     # set score to 0
        #     score = 0
        #     # while the game is not finished
        #     while not done:
        #         # visualize each step
        #         self.env.render()
        #         # choose a random action
        #         action = random.choice([0, 1])
        #         # execute the action
        #         n_state, reward, done, info = self.env.step(action)
        #         # keep track of rewards
        #         score += reward
        #     print('episode {} score {}'.format(episode, score))

        # not working :(
        # self.agent = self.agentDDP(self.states, self.actions)
        # self.agent = self.NAFAgent(self.states, self.actions)
        # self.policy = EpsGreedyQPolicy()

        self.savingFreq = 100
        self.actualSaving = 0
        self.model = self.agentSarsa(self.states, self.actions)
        self.policy = LinearAnnealedPolicy(
            inner_policy=self.policyValues["inner_policy"],
            attr=self.policyValues["attr"],
            value_max=self.policyValues["value_max"],
            value_min=self.policyValues["value_min"],
            value_test=self.policyValues["value_test"],
            nb_steps=self.policyValues["nb_steps"])
        self.agent = SARSAAgent(model=self.model, policy=self.policy, nb_actions=self.actions)
        self.agent._is_graph_network = True

        def t():
            return False

        self.agent._in_multi_worker_mode = t
        self.agent.save = self.saveAgentWeights

        def lenmeh():
            return self.actions

        # self.agent.__len__ = lenmeh

    def saveAgentWeights(self, path, overwrite=True):
        if self.actualSaving < self.savingFreq:
            self.actualSaving += 1
            return None
        else:
            self.actualSaving = 0
            path = 'model/checkpoint/' + datetime.now().strftime("%Y%m%d-%H%M%S") + self.saveFileName
            self.agent.save_weights(path, overwrite)

    def agentSarsa(self, states, actions):
        self.model = Sequential()
        self.model.add(LSTM(42, activation='sigmoid', input_shape=(1, states)))
        self.model.add(Dense(42, activation='sigmoid'))
        self.model.add(Dense(42, activation='sigmoid'))
        self.model.add(Dense(24, activation='sigmoid'))
        self.model.add(Dense(12, activation='sigmoid'))
        self.model.add(Dense(actions, activation='linear'))
        self.path = fileOperation.saveToFolder(self.model.to_json(),
                                               name='modelShape',
                                               folder="model\\checkpoint")
        # , stateful=False: states are reset together after each batch.
        # model.add(Flatten(input_shape=(1, states)))
        # dot_img_file = '/model_1.png'
        # keras.utils.plot_model(self.model, to_file=dot_img_file, show_shapes=True)
        # model.reset_states()
        return self.model

    def load(self):
        path = fileOperation.openDialogFunction(".h5f")
        self.agent.compile('adam', metrics=['mse'])
        self.agent.load_weights(path)
        self.agent.compile('adam', metrics=['mse'])

    def test(self, nb_episodes=2):
        _ = self.agent.test(self.env, nb_episodes=nb_episodes, visualize=self.visualize)

    def fit(self, visualize=False):
        checkpoint_filepath = 'model/checkpoint/'
        model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath=checkpoint_filepath,
            save_weights_only=False,
            save_freq=25)
        self.agent.compile('adam', metrics=['mse'])
        self.agent.fit(
            self.env,
            nb_steps=self.policyValues["nb_steps"],
            log_interval=self.episodeLength,
            visualize=visualize,
            verbose=1,
            nb_max_start_steps=1,
            start_step_policy=self.model.reset_states,
            # callbacks=[PlotLossesKeras()])
            callbacks=[self.tensorboard_callback, model_checkpoint_callback],
        )
        scores = self.agent.test(self.env, nb_episodes=5, visualize=visualize)
        print('Average score over 5 test games: {}'.format(
            np.mean(scores.history['episode_reward'])))