def demo(self, agent, env, EPISODES, state_size, batch_size):
    """Run a trained agent in the environment without learning."""
    done = False
    for e in range(EPISODES):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        for episode in range(500):
            env.render()  # render every step so the demo is visible
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            reward = reward if not done else -10
            next_state = np.reshape(next_state, [1, state_size])
            state = next_state
            if done:
                break
    return 'done'
def train(self, agent, env, EPISODES, state_size, batch_size):
    """Train the agent with experience replay for a fixed number of episodes."""
    done = False
    for e in range(EPISODES):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        for episode in range(500):
            # env.render()
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            reward = reward if not done else -10  # penalise early termination
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                print("episode: {}/{}, score: {}, e: {:.2}".format(
                    e, EPISODES, episode, agent.epsilon))
                break
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)
    return agent
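# The demo/train loops above assume an agent that exposes act(), remember(),
# replay() and an epsilon attribute. The minimal sketch below illustrates one
# such agent; the class name, network sizes, learning rate and decay schedule
# are illustrative assumptions, not the original implementation.
import random
from collections import deque

import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam


class SketchDQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)   # replay buffer
        self.gamma = 0.95                  # discount factor
        self.epsilon = 1.0                 # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.model = self._build_model()

    def _build_model(self):
        # Small fully connected network mapping a state to one Q-value per action.
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(learning_rate=0.001))
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        # Epsilon-greedy action selection.
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        q_values = self.model.predict(state, verbose=0)
        return np.argmax(q_values[0])

    def replay(self, batch_size):
        # Sample a minibatch and fit the network towards the TD targets.
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = reward + self.gamma * np.amax(
                    self.model.predict(next_state, verbose=0)[0])
            target_f = self.model.predict(state, verbose=0)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay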
def demo_q_learning(env, num_episodes, discount_factor=1.0, alpha=0.5,
                    epsilon=0.1):
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    for i_episode in range(num_episodes):
        # Print out which episode we're on, useful for debugging.
        if (i_episode + 1) % 100 == 0:
            print("\rEpisode {}/{}.".format(i_episode + 1, num_episodes))
            sys.stdout.flush()

        # Reset the environment and pick the first action
        state = env.reset()

        # One step in the environment
        total_reward = 0.0
        for t in itertools.count():
            # Take a step
            action_probs = policy(state)
            action = np.random.choice(np.arange(len(action_probs)),
                                      p=action_probs)
            next_state, reward, done, _ = env.step(action)

            # TD Update
            best_next_action = np.argmax(Q[next_state])
            print("Decided Action: ", best_next_action)
            td_target = reward + discount_factor * Q[next_state][best_next_action]
            td_delta = td_target - Q[state][action]
            Q[state][action] += alpha * td_delta

            if done:
                break
            state = next_state

    return Q
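# make_epsilon_greedy_policy is used above but not defined in this excerpt.
# The sketch below shows one common definition; it is an assumption about the
# helper, not the original code.
def make_epsilon_greedy_policy(Q, epsilon, nA):
    """Return a function mapping a state to epsilon-greedy action probabilities."""
    def policy_fn(observation):
        # Spread epsilon uniformly over all nA actions...
        A = np.ones(nA, dtype=float) * epsilon / nA
        # ...and give the remaining probability mass to the greedy action.
        best_action = np.argmax(Q[observation])
        A[best_action] += 1.0 - epsilon
        return A
    return policy_fn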
def ddpg(env, agent, brain_name, action_size, n_episodes=2000, max_t=1000,
         n_agent=20):
    """Deep Deterministic Policy Gradient (DDPG) training loop.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        n_agent (int): number of parallel agents in the environment
    """
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    best_score = 0
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations
        agent.noise_reset()
        agent_scores = [0] * n_agent
        for step in range(max_t):
            actions = agent.act(states, step)
            env_info = env.step(actions)[brain_name]        # send the actions to the environment
            next_states = env_info.vector_observations      # get the next states
            rewards = env_info.rewards                      # get the rewards
            dones = env_info.local_done                     # see if the episode has finished
            for i_agent in range(n_agent):
                agent_scores[i_agent] += rewards[i_agent]
                agent.step(states[i_agent], actions[i_agent], rewards[i_agent],
                           next_states[i_agent], dones[i_agent], i_agent)
            states = next_states
            if any(dones):
                break
        score = np.mean(agent_scores)
        scores_window.append(score)  # save most recent score
        scores.append(score)         # save most recent score
        if best_score < score:
            best_score = score
        print('\rEpisode {}\t Episode score: {:.2f}\t Average Score: {:.2f}\t Best Score: {:.2f}'
              .format(i_episode, score, np.mean(scores_window), best_score), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\t Current score: {:.2f}\t Average Score: {:.2f}'
                  .format(i_episode, score, np.mean(scores_window)))
        if np.mean(scores_window) >= 30:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                  .format(i_episode - 100, np.mean(scores_window)))
            agent.save_model()
            break
    env.close()
    return scores
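# The ddpg() loop above relies on an agent object with act(), step(),
# noise_reset() and save_model(). The stub below only sketches that expected
# interface under assumed semantics; it is not the original agent.
class SketchDDPGAgent:
    def act(self, states, step):
        """Return one continuous action vector per agent, typically the actor
        network's output plus exploration noise (e.g. Ornstein-Uhlenbeck)."""
        raise NotImplementedError

    def step(self, state, action, reward, next_state, done, i_agent):
        """Store the transition in a shared replay buffer and, at some
        interval, update the actor and critic networks."""
        raise NotImplementedError

    def noise_reset(self):
        """Reset the exploration-noise process at the start of an episode."""
        raise NotImplementedError

    def save_model(self):
        """Persist the actor/critic weights once the environment is solved."""
        raise NotImplementedError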
if __name__ == "__main__":
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    print("Brain name: ", brain_name)

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents in the environment
    print('Number of agents:', len(env_info.agents))

    # number of actions
    action_size = brain.vector_action_space_size
    print('Action size', action_size)

    # examine the state space
    state = env_info.vector_observations[0]
    state_size = len(state)
    print('States have length:', state_size)

    agent = Agent(state_size=state_size, action_size=action_size, seed=2,
os.makedirs(f'models/{MODEL_NAME}-{START_TIME}')

agent = DQNAgent()
episodes_reward = []

for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit='episodes'):
    # Update tensorboard step every episode
    agent.tensorboard.step = episode

    # Restarting episode - reset episode reward and step number
    episode_reward = 0
    step = 1

    # Reset environment and get initial state
    current_state = env.reset()

    # Reset flag and start iterating until episode ends
    done = False
    while not done:
        # This part stays mostly the same, the change is to query a model for Q values
        if np.random.random() > EPSILON:
            # Get action from Q table
            action = np.argmax(agent.get_qs(current_state))
        else:
            # Get random action
            action = np.random.randint(0, env.action_space.n)

        new_state, reward, done = env.step(action)[:3]
        reward = env.compute_reward(new_state)
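# agent.get_qs() is queried above for the greedy action but is not defined in
# this excerpt. One plausible sketch of such a method, assuming the agent
# wraps a Keras model in self.model, is shown below; the reshape only adds a
# batch dimension to a single state.
def get_qs(self, state):
    state = np.array(state).reshape(-1, *np.array(state).shape)
    return self.model.predict(state, verbose=0)[0]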
### Creation of the brain
brain = brain.NN(nb_actions=numb_actions)
model = brain.model

### Creation of the memory of the DQN Agent
DQN = DQN()

if env.train:
    previous_loss = 0
    patience = 0
    for epoch in range(0, epochs):
        loss = 0
        time_step = 0
        game_over = False
        total_reward = 0
        new_month = np.random.randint(0, 12)
        env.reset(new_month)
        game_over = env.game_over
        state, _, _ = env.observation()
        while not game_over and time_step < 5 * 30 * 24 * 60:
            print(time_step)

            ### Choosing the action
            if np.random.rand() < eps:
                ### exploration
                action = np.random.randint(0, numb_actions)
                if (action - direction_boundary) < 0:
                    direction = -1
                if (action - direction_boundary) > 0:
                    direction = 1
                energy_ai = abs(action - direction_boundary) * temp_incr
            else:
                action = np.argmax(model.predict(state))
                if (action - direction_boundary) < 0: