Example #1
def demo(self, agent, env, EPISODES, state_size, batch_size):
    done = False
    for e in range(EPISODES):
        state = env.reset()
        env.render()
        state = np.reshape(state, [1, state_size])
        for episode in range(500):
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            # Penalize the terminal step so early failures are discouraged.
            reward = reward if not done else -10
            next_state = np.reshape(next_state, [1, state_size])
            state = next_state
            if done:
                break
    return 'done'
Example #2
def train(self, agent, env, EPISODES, state_size, batch_size):
    done = False
    for e in range(EPISODES):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        for episode in range(500):
            # env.render()
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            # Penalize the terminal step so early failures are discouraged.
            reward = reward if not done else -10
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                print("episode: {}/{}, score: {}, e: {:.2}".format(
                    e, EPISODES, episode, agent.epsilon))
                break
            # Replay a minibatch once enough transitions have been stored.
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)
    return agent
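Both loops above assume an agent object exposing act, remember and replay methods plus epsilon and memory attributes. A minimal sketch of such a replay-buffer DQN agent is shown below; the Keras network, class name and hyperparameters are illustrative assumptions, not taken from the snippets themselves.

import random
from collections import deque

import numpy as np
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam


class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)   # replay buffer of transitions
        self.gamma = 0.95                  # discount factor
        self.epsilon = 1.0                 # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.model = self._build_model()

    def _build_model(self):
        # Small fully connected network mapping a state to one Q value per action.
        model = Sequential([
            Dense(24, input_dim=self.state_size, activation='relu'),
            Dense(24, activation='relu'),
            Dense(self.action_size, activation='linear'),
        ])
        model.compile(loss='mse', optimizer=Adam(learning_rate=0.001))
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        # Epsilon-greedy action selection over the predicted Q values.
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        q_values = self.model.predict(state, verbose=0)
        return np.argmax(q_values[0])

    def replay(self, batch_size):
        # Fit the network on a random minibatch of stored transitions.
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = reward + self.gamma * np.amax(
                    self.model.predict(next_state, verbose=0)[0])
            target_f = self.model.predict(state, verbose=0)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay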
Example #3
def demo_q_learning(env,
                    num_episodes,
                    discount_factor=1.0,
                    alpha=0.5,
                    epsilon=0.1):
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    for i_episode in range(num_episodes):
        # Print out which episode we're on, useful for debugging.
        if (i_episode + 1) % 100 == 0:
            print("\rEpisode {}/{}.".format(i_episode + 1, num_episodes))
            sys.stdout.flush()

        # Reset the environment and pick the first action
        state = env.reset()

        # One step in the environment
        total_reward = 0.0
        for t in itertools.count():

            # Take a step
            action_probs = policy(state)
            action = np.random.choice(np.arange(len(action_probs)),
                                      p=action_probs)
            next_state, reward, done, _ = env.step(action)
            total_reward += reward  # accumulate the episode return

            # TD Update
            best_next_action = np.argmax(Q[next_state])
            print("Decided Action: ", best_next_action)
            td_target = reward + discount_factor * Q[next_state][
                best_next_action]
            td_delta = td_target - Q[state][action]
            Q[state][action] += alpha * td_delta

            if done:
                break

            state = next_state
    return Q
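The Q-learning snippet above calls make_epsilon_greedy_policy but does not define it. Below is a minimal sketch of such a helper, matching the way it is called above; the exact implementation in the original code base is assumed, not shown.

import numpy as np


def make_epsilon_greedy_policy(Q, epsilon, nA):
    """Return a function that maps a state to epsilon-greedy action probabilities."""
    def policy_fn(state):
        # Give every action the baseline exploration probability...
        probs = np.ones(nA, dtype=float) * epsilon / nA
        # ...then put the remaining probability mass on the greedy action.
        best_action = np.argmax(Q[state])
        probs[best_action] += 1.0 - epsilon
        return probs
    return policy_fn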
Example #4
def ddpg(env,
         agent,
         brain_name,
         action_size,
         n_episodes=2000,
         max_t=1000,
         n_agent=20):
    """Deep Q-Learning.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    best_score = 0
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(
            train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations
        agent.noise_reset()
        agent_scores = [0] * n_agent
        for step in range(max_t):
            actions = agent.act(states, step)
            env_info = env.step(actions)[
                brain_name]  # send the action to the environment
            next_states = env_info.vector_observations  # get the next state
            rewards = env_info.rewards  # get the reward
            dones = env_info.local_done  # see if episode has finished
            for i_agent in range(n_agent):
                agent_scores[i_agent] += rewards[i_agent]
                agent.step(states[i_agent], actions[i_agent], rewards[i_agent],
                           next_states[i_agent], dones[i_agent], i_agent)
            states = next_states
            if any(dones):
                break
        score = np.mean(agent_scores)
        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score
        if best_score < score:
            best_score = score
        print(
            '\rEpisode {}\t Episode score: {:.2f}\t Average Score: {:.2f}\t Best Score: {:.2f}'
            .format(i_episode, score, np.mean(scores_window), best_score),
            end="")
        if i_episode % 100 == 0:
            print(
                '\rEpisode {}\t Current score: {:.2f}\t Average Score: {:.2f}'.
                format(i_episode, score, np.mean(scores_window)))
        if np.mean(scores_window) >= 30:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode - 100, np.mean(scores_window)))
            agent.save_model()
            break
    env.close()
    return scores
Example #5
        if np.mean(scores_window) >= 30:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode - 100, np.mean(scores_window)))
            agent.save_model()
            break
    env.close()
    return scores


if __name__ == "__main__":
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    print("Brain name: ", brain_name)
    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents in the environment
    print('Number of agents:', len(env_info.agents))

    # number of actions
    action_size = brain.vector_action_space_size
    print('Action size:', action_size)

    # examine the state space
    state = env_info.vector_observations[0]
    state_size = len(state)
    print('States have length:', state_size)
    agent = Agent(state_size=state_size,
                  action_size=action_size,
                  seed=2,
Example #6
    os.makedirs(f'models/{MODEL_NAME}-{START_TIME}')

agent = DQNAgent()

episodes_reward = []
for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit='episodes'):

    # Update tensorboard step every episode
    agent.tensorboard.step = episode

    # Restarting episode - reset episode reward and step number
    episode_reward = 0
    step = 1

    # Reset environment and get initial state
    current_state = env.reset()

    # Reset flag and start iterating until episode ends
    done = False
    while not done:

        # This part stays mostly the same; the change is to query a model for Q values
        if np.random.random() > EPSILON:
            # Get action from Q table
            action = np.argmax(agent.get_qs(current_state))
        else:
            # Get random action
            action = np.random.randint(0, env.action_space.n)

        new_state, reward, done = env.step(action)[:3]
        reward = env.compute_reward(new_state)
Example #7
### Creation of the brain
brain = brain.NN(nb_actions=numb_actions)
model = brain.model
### Creation of the memory of the DQN Agent
DQN = DQN()  # note: this rebinds the class name DQN to the agent instance

if env.train:
    previous_loss = 0
    patience = 0
    for epoch in range(0, epochs):
        loss = 0
        time_step = 0
        game_over = False
        total_reward = 0
        # Start each training epoch in a random month.
        new_month = np.random.randint(0, 12)
        env.reset(new_month)
        game_over = env.game_over
        state, _, _ = env.observation()
        # Cap the episode at 5 * 30 * 24 * 60 time steps.
        while not game_over and time_step < 5 * 30 * 24 * 60:
            print(time_step)
            ### Choosing the action
            if np.random.rand() < eps:   ### exploration
                action = np.random.randint(0, numb_actions)
                if (action - direction_boundary) < 0:
                    direction = -1
                if (action - direction_boundary) > 0:
                    direction = 1
                energy_ai = abs(action - direction_boundary) * temp_incr
            else:
                action = np.argmax(model.predict(state))
                if (action - direction_boundary) < 0: