def main():
    # initialize OpenAI Gym env and dqn agent
    env = gym.make(ENV_NAME)
    agent = DQN(env)

    for episode in range(EPISODE):
        # initialize task
        state = env.reset()
        # Train
        for step in range(STEP):
            action = agent.egreedy_action(state)  # e-greedy action for train
            next_state, reward, done, _ = env.step(action)
            # Define a shaped reward for the agent and use it for training
            reward_agent = -1 if done else 0.1
            agent.perceive(state, action, reward_agent, next_state, done)
            state = next_state
            if done:
                break
        # Test every 100 episodes
        if episode % 100 == 0:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                for j in range(STEP):
                    env.render()
                    action = agent.action(state)  # direct (greedy) action for test
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print('episode: ', episode, 'Evaluation Average Reward:', ave_reward)
            if ave_reward >= 200:
                break
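# The loop above relies on module-level constants (ENV_NAME, EPISODE, STEP, TEST)
# and on a DQN agent exposing egreedy_action(), action() and perceive(). The
# following is only a minimal sketch of that interface, with assumed example
# constants and an assumed small Keras Q-network, not the original implementation.
import random
from collections import deque

import gym
import numpy as np
from tensorflow import keras

ENV_NAME = 'CartPole-v0'  # assumed example values for the constants used above
EPISODE = 10000
STEP = 300
TEST = 10


class DQN(object):
    def __init__(self, env, gamma=0.9, epsilon=0.1, buffer_size=10000, batch_size=32):
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n
        self.gamma = gamma
        self.epsilon = epsilon
        self.batch_size = batch_size
        self.buffer = deque(maxlen=buffer_size)
        # small fully connected Q-network: state in, one Q-value per action out
        self.model = keras.Sequential([
            keras.layers.Dense(64, activation='relu', input_dim=self.state_dim),
            keras.layers.Dense(self.action_dim, activation='linear'),
        ])
        self.model.compile(loss='mse', optimizer=keras.optimizers.Adam(0.001))

    def egreedy_action(self, state):
        # explore with probability epsilon, otherwise act greedily
        if random.random() < self.epsilon:
            return random.randrange(self.action_dim)
        return self.action(state)

    def action(self, state):
        q_values = self.model.predict(state[np.newaxis], verbose=0)[0]
        return int(np.argmax(q_values))

    def perceive(self, state, action, reward, next_state, done):
        # store the transition, then fit the network on a random minibatch
        self.buffer.append((state, action, reward, next_state, done))
        if len(self.buffer) < self.batch_size:
            return
        batch = random.sample(self.buffer, self.batch_size)
        states = np.array([b[0] for b in batch])
        next_states = np.array([b[3] for b in batch])
        targets = self.model.predict(states, verbose=0)
        next_q = self.model.predict(next_states, verbose=0)
        for i, (_, a, r, _, d) in enumerate(batch):
            targets[i][a] = r if d else r + self.gamma * np.max(next_q[i])
        self.model.fit(states, targets, epochs=1, verbose=0)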
def main():
    env = gym.make(ENV)
    agent = DQN(env.observation_space.shape[0], env.action_space.n,
                logdir='/data/log/LunarLander-v2')
    for episode in range(EPISODE):
        state = env.reset()
        for step in range(STEP):
            env.render()
            action = agent.egreedy_action(state)
            next_state, reward, terminate, _ = env.step(action)
            agent.observe_action(state, action, reward, next_state, terminate)
            state = next_state
            if terminate:
                break
        if episode % 100 == 0:
            total_reward = 0
            for i in range(5):
                state = env.reset()
                for j in range(STEP):
                    env.render()
                    action = agent.action(state)
                    state, reward, terminate, _ = env.step(action)
                    total_reward += reward
                    if terminate:
                        break
            agent.summary(episode, total_reward / 5)
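# What agent.summary(episode, avg_reward) does is not shown here; a plausible
# sketch (an assumption, not the original agent) is to write a TensorBoard
# scalar under the logdir passed to the constructor.
import tensorflow as tf

writer = tf.summary.create_file_writer('/data/log/LunarLander-v2')


def summary(episode, avg_reward):
    # log the 5-episode average evaluation reward for inspection in TensorBoard
    with writer.as_default():
        tf.summary.scalar('evaluation/average_reward', avg_reward, step=episode)
    writer.flush()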
def main(lr=0.001, episodeMemory=100, replaySize=64, gamma=0.95):
    np.random.seed(0)
    start_time = time.time()  # used for the periodic timing printout below
    env = gym.make('MountainCar-v0')

    # Q(s, a) network: input is the 2-d observation plus the action index,
    # output is a single Q-value
    model = keras.Sequential()
    model.add(Dense(128, activation="relu", input_dim=3,
                    kernel_initializer='normal'))
    model.add(Dense(52, activation="relu"))
    model.add(Dense(1, kernel_initializer='normal', activation="linear"))
    adam = keras.optimizers.Adam(lr=lr)
    model.compile(loss='mean_squared_error', optimizer=adam)

    memorySize = 200 * episodeMemory
    dqn = DQN(model, gamma, memorySize, replaysize=replaySize, _env=env)
    dqnScore = dqnScorerMountainCar(dqn, _env=env)

    nrofEpisodes = 1001
    res = np.zeros(shape=(nrofEpisodes, 2))  # reserved for per-episode stats (not filled here)

    for episode in range(nrofEpisodes):
        env.reset()
        action = 0
        obs, _, done, _ = env.step(action)

        # progress report every 100 episodes
        if (episode % 100) == 10:
            print("episode ", episode)
            dqnScore.printDistance()
            print("--- %s seconds ---" % (time.time() - start_time))

        steps = 0
        while not done:
            steps += 1
            action = dqn.action(obs)
            new_obs, reward, done, info = env.step(action)
            # bonus reward when the car reaches the goal before the 200-step limit
            if done and (steps < 199):
                reward = (200 - steps) / 10
                print("****Success*****", -steps)
            dqn.add(action, obs, new_obs, reward)
            obs = new_obs
            # if (episode % 100) == 10:
            #     env.render()
            dqn.replay()

        env.reset()
        dqnScore.updateResult(steps)

    title = "eps_%d_mem_%d_rep_%d_gamma_%d" % (nrofEpisodes, episodeMemory,
                                               replaySize, gamma * 100)
    dqnScore.plotResults(title)
    dqnScore.plot_cost_to_go_mountain_car(title)
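# A minimal sketch (an assumption about this DQN's internals, not the original
# class) of how dqn.action(obs) could work with the state-action network above:
# the 3-d input is the 2-d MountainCar observation plus an action index, and the
# single linear output is Q(s, a), so acting means scoring all three actions and
# taking the argmax (with some epsilon-greedy exploration).
import numpy as np


def choose_action(model, obs, n_actions=3, epsilon=0.1):
    if np.random.rand() < epsilon:
        return np.random.randint(n_actions)           # explore
    # build one [position, velocity, action] row per candidate action
    candidates = np.array([np.append(obs, a) for a in range(n_actions)])
    q_values = model.predict(candidates, verbose=0).ravel()
    return int(np.argmax(q_values))                   # exploit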
class CartPolePlay(object):

    def __init__(self, hidden_dims, step_to_copy_graph=300, step_each_episode=500):
        self.step_to_copy_graph = step_to_copy_graph
        self.step_each_episode = step_each_episode
        self.dqn = DQN(4, 2, hidden_dims)  # CartPole: 4-d state, 2 actions
        self.env = gym.make('CartPole-v0')

    def train(self, num_train=5000):
        running_score = 0.0
        num_episode = 0
        num_step = 0
        for _ in range(num_train):
            state = self.env.reset()
            for t in range(self.step_each_episode):
                num_step += 1
                action = self.dqn.action(state)
                next_state, reward, done, _ = self.env.step(action)
                reward = -100 if done else 0.1  # shaped reward for training
                self.dqn.remember(state, action, reward, done, next_state)
                state = next_state
                self.dqn.learn()
                # periodically copy the online network into the target network
                if num_step % self.step_to_copy_graph == 0:
                    self.dqn.copy_graph()
                if done:
                    running_score += t
                    break
            num_episode += 1
            self.dqn.decrease_epsilon()
            if num_episode % 100 == 0:
                running_score /= 100
                print("Current running score is: %.2f" % running_score)
                if running_score > 195.0:
                    print("HaHa, solved in: %d" % num_episode)
                    return True
                running_score = 0.0
        return False

    def play(self, num_episode):
        total_score = 0.0
        for _ in range(num_episode):
            state = self.env.reset()
            while True:
                action = self.dqn.play(state)
                next_state, reward, done, _ = self.env.step(action)
                if done:
                    break
                total_score += 1
                state = next_state
        return total_score / num_episode

    def store(self):
        self.dqn.save('model/dqn/cartpole-v0.ckpt')
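# Example usage of the class above; the hidden layer sizes and the number of
# evaluation games are illustrative assumptions, and store() expects the
# model/dqn/ directory to exist.
if __name__ == '__main__':
    player = CartPolePlay(hidden_dims=[64, 64])
    if player.train(num_train=5000):
        player.store()
    print("average score over 10 games: %.1f" % player.play(10))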
for episode in range(EPISODE):
    state = env.reset()
    # start training / playing in this episode / game until this game is over
    for step in range(STEP):
        action = agent.egreedy_action(state)
        # env.step returns an extra info value, which we ignore with _
        next_state, reward, done, _ = env.step(action)
        reward_agent = -1 if done else 0.1  # shaped reward used for training
        agent.perceive(state, action, reward_agent, next_state, done)
        state = next_state
        if done:  # the episode is complete
            break

    # test every 100 episodes
    if episode % 100 == 0:
        total_reward = 0
        for i in range(TEST):
            state = env.reset()
            for j in range(STEP):
                env.render()
                # the only difference from training: take the action produced
                # directly (greedily) by the DQN
                action = agent.action(state)
                state, reward, done, _ = env.step(action)
                total_reward += reward
                if done:
                    break
        ave_reward = total_reward / TEST
        print("episode: {} avg reward: {}".format(episode, ave_reward))
        if ave_reward >= 200:
            break