Example #1
def main():
    running_reward = 10
    episode_durations = []
    for i_episode in count(1):
        state, ep_reward = env.reset(), 0
        for t in range(1, 10000):  # Don't infinite loop while learning
            action = select_action(state)
            state, reward, done, _ = env.step(action)
            if args.render:
                env.render()
            policy.rewards.append(reward)
            ep_reward += reward
            if done:
                episode_durations.append(t + 1)
                plot_durations(episode_durations)
                break

        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
        finish_episode()
        if i_episode % args.log_interval == 0:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.
                  format(i_episode, ep_reward, running_reward))
        if running_reward > env.spec.reward_threshold:
            print("Solved! Running reward is now {} and "
                  "the last episode runs to {} time steps!".format(
                      running_reward, t))
            break
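
None of the examples on this page include plot_durations itself. A minimal single-list version compatible with the plot_durations(episode_durations) call above might look like the sketch below; the figure number and the 100-episode moving-average window are assumptions, not taken from any of the quoted repositories.

import matplotlib.pyplot as plt
import torch


def plot_durations(episode_durations):
    """Sketch of a live-updating duration plot (assumed helper, not the original)."""
    durations_t = torch.tensor(episode_durations, dtype=torch.float)
    plt.figure(2)
    plt.clf()
    plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(durations_t.numpy())
    # Overlay a 100-episode moving average once enough episodes are available.
    if len(durations_t) >= 100:
        means = durations_t.unfold(0, 100, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(99), means))
        plt.plot(means.numpy())
    plt.pause(0.001)  # give the GUI event loop time to redraw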
Example #2
from model_DDDQN import load_checkpoint
from utils import plot_durations
import matplotlib.pyplot as plt
import torch

# IMPORTANT: Set value for i_episode to indicate which checkpoint you want to use
#   for evaluation.
i_episode = 400
ckpt_dir = "DDDQN_CartPoleV1_obs_checkpoints/"
input_size = 4
output_size = 2

# Get device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Read checkpoint
policy_net, _, _, _, _, episode_rewards, episode_loss = \
    load_checkpoint(ckpt_dir, i_episode, input_size, output_size, device=device)

# Plot figure
plot_durations(episode_rewards, episode_loss)
Example #3
    optimize_model(policy_net, batch_log_prob, batch_rewards, optimizer, GAMMA, device=device)

    # Clear trajectories batch
    batch_log_prob = []
    batch_rewards = []

    # Reset Flags
    if not render_each_episode:
        finished_rendering_this_epoch = False

    # Record stats
    training_info["epoch mean durations"].append(sum(epoch_durations) / batch_size)
    training_info["epoch mean rewards"].append(sum(epoch_rewards) / batch_size)
    if (i_epoch + 1) % num_avg_epoch:
        training_info["past %d epochs mean reward" %  (num_avg_epoch)] = \
            (sum(training_info["epoch mean rewards"][-num_avg_epoch:]) / num_avg_epoch) \
                if len(training_info["epoch mean rewards"]) >= num_avg_epoch else 0


    # Plot stats
    plot_durations(training_info["epoch mean rewards"])

    # Update counter
    i_epoch += 1

    # Every save_ckpt_interval epochs, save a checkpoint according to the current i_epoch.
    if i_epoch % save_ckpt_interval == 0:
        save_checkpoint(ckpt_dir, policy_net, optimizer, i_epoch, learning_rate=learning_rate,
                        **training_info)
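
Example #3 calls an optimize_model(policy_net, batch_log_prob, batch_rewards, optimizer, GAMMA, device=device) helper that is not shown. A plausible REINFORCE-style sketch with that signature follows; the returns-to-go computation and the return normalization are assumptions, and policy_net is accepted only to match the call site (gradients flow through the stored log-probabilities).

import torch


def optimize_model(policy_net, batch_log_prob, batch_rewards, optimizer, gamma, device='cpu'):
    """Sketch of a batched REINFORCE update (assumed helper, not the quoted repository's)."""
    policy_losses = []
    for log_probs, rewards in zip(batch_log_prob, batch_rewards):
        # Discounted returns-to-go for one trajectory.
        returns, R = [], 0.0
        for r in reversed(rewards):
            R = r + gamma * R
            returns.insert(0, R)
        returns = torch.tensor(returns, dtype=torch.float, device=device)
        if returns.numel() > 1:
            # Normalizing returns is a common stabilization trick (an assumption here).
            returns = (returns - returns.mean()) / (returns.std() + 1e-8)
        for log_prob, ret in zip(log_probs, returns):
            policy_losses.append(-log_prob * ret)

    optimizer.zero_grad()
    torch.stack(policy_losses).sum().backward()
    optimizer.step()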

Example #4
def trainDQN(file_name="DQN",
             env=GridworldEnv(1),
             batch_size=128,
             gamma=0.999,
             eps_start=0.9,
             eps_end=0.05,
             eps_decay=1000,
             is_plot=False,
             num_episodes=500,
             max_num_steps_per_episode=1000,
             learning_rate=0.0001,
             memory_replay_size=10000):
    """
    DQN training routine. Returns the rewards and durations logs.
    If is_plot is True, the environment screen is plotted.
    """
    if is_plot:
        env.reset()
        plt.ion()
        plt.figure()
        plt.imshow(get_screen(env).cpu().squeeze(0).squeeze(0).numpy(),
                   interpolation='none')
        plt.title("")
        plt.draw()
        plt.pause(0.00001)

    num_actions = env.action_space.n
    model = DQN(num_actions)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()

    memory = ReplayMemory(memory_replay_size)

    episode_durations = []
    mean_durations = []
    episode_rewards = []
    mean_rewards = []
    steps_done = 0  # total steps
    for i_episode in range(num_episodes):
        if i_episode % 20 == 0:
            clear_output()
        print("Cur episode:", i_episode, "steps done:", steps_done,
                "exploration factor:", eps_end + (eps_start - eps_end) * \
                math.exp(-1. * steps_done / eps_decay))
        # Initialize the environment and state
        env.reset()
        # last_screen = env.current_grid_map
        # (1, 1, 8, 8)
        current_screen = get_screen(env)
        state = current_screen  # - last_screen
        for t in count():
            # Select and perform an action
            action = select_action(state, model, num_actions, eps_start,
                                   eps_end, eps_decay, steps_done)
            steps_done += 1
            _, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            last_screen = current_screen
            current_screen = get_screen(env)
            if not done:
                next_state = current_screen  # - last_screen
            else:
                next_state = None

            # Store the transition in memory
            memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state
            # plot_state(state)
            # env.render()

            # Perform one step of the optimization (on the target network)
            optimize_model(model, optimizer, memory, batch_size, gamma)
            if done or t + 1 >= max_num_steps_per_episode:
                episode_durations.append(t + 1)
                episode_rewards.append(env.episode_total_reward)
                if is_plot:
                    plot_durations(episode_durations, mean_durations)
                    plot_rewards(episode_rewards, mean_rewards)
                break

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results

    np.save(file_name + '-dqn-rewards', episode_rewards)
    np.save(file_name + '-dqn-durations', episode_durations)

    return model, episode_rewards, episode_durations
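
Examples #4 and #5 depend on a select_action(state, model, num_actions, eps_start, eps_end, eps_decay, steps_done) helper that is not shown. A minimal epsilon-greedy sketch with that signature is given below; the decay formula mirrors the one printed in the training loop above, everything else is an assumption.

import math
import random

import torch


def select_action(state, model, num_actions, eps_start, eps_end, eps_decay, steps_done):
    """Sketch of an epsilon-greedy action selector (assumed helper, not the original)."""
    eps_threshold = eps_end + (eps_start - eps_end) * math.exp(-1. * steps_done / eps_decay)
    if random.random() > eps_threshold:
        with torch.no_grad():
            # Greedy action: index of the largest predicted Q-value, shape (1, 1).
            return model(state).max(1)[1].view(1, 1)
    # Exploration: a uniformly random action.
    return torch.tensor([[random.randrange(num_actions)]], dtype=torch.long)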
Example #5
def trainSQL0(file_name="SQL0",
              env=GridworldEnv(1),
              batch_size=128,
              gamma=0.999,
              beta=5,
              eps_start=0.9,
              eps_end=0.05,
              eps_decay=1000,
              is_plot=False,
              num_episodes=200,
              max_num_steps_per_episode=1000,
              learning_rate=0.0001,
              memory_replay_size=10000,
              n_step=10,
              target_update=10):
    """
    Soft Q-learning training routine for observation-vector input.
    Returns the rewards and durations logs.
    """

    num_actions = env.action_space.n
    input_size = env.observation_space.shape[0]
    model = DQN(input_size, num_actions)
    target_model = DQN(input_size, num_actions)
    target_model.load_state_dict(model.state_dict())
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # optimizer = optim.RMSprop(model.parameters(), )

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()

    memory = ReplayMemory(memory_replay_size, n_step, gamma)

    episode_durations = []
    mean_durations = []
    episode_rewards = []
    mean_rewards = []

    steps_done, t = 0, 0

    for i_episode in range(num_episodes):
        if i_episode % 20 == 0:
            clear_output()
        if i_episode != 0:
            print("Cur episode:", i_episode, "steps done:", episode_durations[-1],
                    "exploration factor:", eps_end + (eps_start - eps_end) * \
                    math.exp(-1. * steps_done / eps_decay), "reward:", env.episode_total_reward)
        # Initialize the environment and state
        state = torch.from_numpy(env.reset()).type(torch.FloatTensor).view(
            -1, input_size)

        for t in count():
            # Select and perform an action
            action = select_action(state, model, num_actions, eps_start,
                                   eps_end, eps_decay, steps_done)
            next_state_tmp, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            next_state = torch.from_numpy(next_state_tmp).type(
                torch.FloatTensor).view(-1, input_size)

            if done:
                next_state = None

            # Store the transition in memory
            memory.push(model, target_model, state, action, next_state, reward)

            # Move to the next state
            state = next_state
            # plot_state(state)
            # env.render()

            # Perform one step of the optimization (on the target network)
            optimize_model(model, target_model, optimizer, memory, batch_size,
                           gamma, beta)  #### Difference w.r.t DQN
            if done or t + 1 >= max_num_steps_per_episode:
                episode_durations.append(t + 1)
                episode_rewards.append(
                    env.episode_total_reward
                )  ##### Modify for OpenAI envs such as CartPole
                if is_plot:
                    plot_durations(episode_durations, mean_durations)
                    plot_rewards(episode_rewards, mean_rewards)
                steps_done += 1
                break
        if i_episode % target_update == 0 and i_episode != 0:
            target_model.load_state_dict(model.state_dict())

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results
    np.save(file_name + '-sql0-rewards', episode_rewards)
    np.save(file_name + '-sql0-durations', episode_durations)

    return model, episode_rewards, episode_durations
Example #6
            training_info["episode reward"].append(running_reward)
            if running_reward > training_info["max reward achieved"]:
                training_info["max reward achieved"] = running_reward
            training_info["past 100 episodes mean reward"] = \
                (sum(training_info["episode reward"][-100:]) / 100) if len(training_info["episode reward"])>=100 else 0
            training_info["training loss"].append(running_minibatch_loss /
                                                  (t + 1))
            training_info["episode loss"].append(running_episode_loss / t)
            if (running_episode_loss /
                    t) > training_info["max episode loss recorded"]:
                training_info[
                    "max episode loss recorded"] = running_episode_loss / t

            # Plot stats
            plot_durations(training_info["episode reward"],
                           training_info["training loss"],
                           training_info["episode loss"])

            print("=============  Episode: %d  =============" %
                  (i_episode + 1))
            print("Episode reward: %d" % training_info["episode reward"][-1])
            print("Episode duration: %d" % (t + 1))
            print("Training loss: %f" % training_info["training loss"][-1])
            print("Episode loss: %f \n" % training_info["episode loss"][-1])
            print("Max reward achieved: %f" %
                  training_info["max reward achieved"])
            print("Max TD loss recorded: %f" %
                  training_info["max TD loss recorded"])
            print("Max episode loss recorded: %f" %
                  training_info["max episode loss recorded"])
            print("Past 100 episodes avg reward: %f \n\n" %
                              memory,
                              policy_net,
                              target_net,
                              optimizer,
                              GAMMA=GAMMA,
                              device=device)
        if loss is not None:
            running_loss += loss

        if done:
            # Save and print episode stats (duration and episode loss)
            episode_durations.append(t + 1)
            mean_duration = (sum(episode_durations[-100:]) /
                             100) if len(episode_durations) >= 100 else 0
            episode_loss.append(running_loss / (t + 1))
            plot_durations(episode_durations, episode_loss)

            print(
                "Episode: %d Cumulative Rewards: %d Episode Loss: %f, past 100 episodes avg reward: %f"
                % (i_episode + 1, t + 1,
                   (running_loss / (t + 1)), mean_duration))
            # Check if the problem is solved
            #  CartPole standard: average reward for the past 100 episode above 195
            if mean_duration > 195:
                print("\n\n\t Problem Solved !!!\n\n\n")

            break
    i_episode += 1

    # Update the target network, copying all weights and biases in DQN
    if i_episode % target_update == 0:
        target_net.load_state_dict(policy_net.state_dict())
Example #8
        last_screen = current_screen
        current_screen = get_screen(env, device)

        if not done:
            next_state = current_screen - last_screen
        else:
            next_state = None

        memory.push(state, action, next_state, reward)

        state = next_state
        #if done:
        #    print "Episode Done"
        #else:
        #    print state.size()
        optimize_model(policy_net, optimizer)
        if done:
            episode_durations.append(t + 1)
            plot_durations(episode_durations, AVERAGE_SIZE)
            break

    if i_episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())

print("Complet")
env.render()
env.close()
plt.ioff()
plt.show()
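
The memory.push(state, action, next_state, reward) calls in Examples #4, #8, #9, and #12 assume a simple replay buffer. Below is a sketch in the style of the PyTorch DQN tutorial; the Transition fields match the push calls above, the rest is an assumption (the n-step variant constructed in Example #5 takes extra arguments and is not covered).

import random
from collections import namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))


class ReplayMemory:
    """Sketch of a fixed-capacity ring buffer of transitions (assumed helper)."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        # Append until full, then overwrite the oldest entry.
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)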
Example #9
def train():
    global memory
    try:
        memory, ct, steps = pickle.load(open("cache.p", "rb"))
    except Exception:
        print("Starting from scratch........!!!!!!")
        memory = ReplayMemory(10000)
        steps = 0
        ct = 0
    game_evn.jump()
    try:
        while True:
            score = 0
            current_screen = game_evn.capture_screen() / 255
            current_screen_torch = torch.from_numpy(current_screen).unsqueeze(
                0).unsqueeze(0)
            state = current_screen_torch
            for t in count():
                sample = random.random()
                threshold = eps_end + (eps_start - eps_end) * math.exp(
                    -1. * steps / eps_decay)
                steps += 1
                if sample > threshold:
                    with torch.no_grad():
                        action = policy_net(state.float()).max(1)[1].view(1, 1)
                else:
                    action = torch.tensor([[random.randrange(3)]],
                                          device=device,
                                          dtype=torch.long)

                current_screen, reward, is_gameover, score = game_state.get_state(
                    action.item())
                reward = torch.tensor([reward], device=device)
                score += reward
                current_screen = game_evn.capture_screen() / 255
                current_screen_torch = torch.from_numpy(
                    current_screen).unsqueeze(0).unsqueeze(0)
                if not is_gameover:
                    next_state = current_screen_torch
                else:
                    next_state = None
                memory.push(state, action, next_state, reward)
                state = next_state
                optimize_model()
                if is_gameover:
                    episode_durations.append(t + 1)
                    plot_durations(episode_durations)
                    break
            if ct % 100 == 0:
                game_evn.pause_game()
                with open("cache.p", "wb") as cache:
                    pickle.dump((memory, ct, steps), cache)
                target_net.load_state_dict(policy_net.state_dict())
                gc.collect()
                torch.save(
                    {
                        "policy_net": policy_net.state_dict(),
                        "target_net": target_net.state_dict(),
                        "optimizer": optimizer.state_dict()
                    }, checkpoint_name)
                game_evn.resume_game()
                print(f"{ct} running.....")

            ct += 1
    except KeyboardInterrupt:
        torch.save(
            {
                "policy_net": policy_net.state_dict(),
                "target_net": target_net.state_dict(),
                "optimizer": optimizer.state_dict()
            }, checkpoint_name)
Example #10
from save_and_load import load_checkpoint
from utils import plot_durations
import matplotlib.pyplot as plt
import torch

# IMPORTANT: Set value for i_episode to indicate which checkpoint you want to use
#   for evaluation.
i_episode = 7300
start_idx = 6500
end_idx = 7200
ckpt_dir = "DDDQN_SGD_CartPoleV1_obs_checkpoints/"

input_size = 4
output_size = 2

# Get device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Read checkpoint
_, _, _, training_info = \
    load_checkpoint(ckpt_dir, i_episode, input_size, output_size, device=device)

# Plot figure
plot_durations(training_info["episode reward"], training_info["training loss"],
               training_info["episode loss"], (start_idx, end_idx))
Example #11
def train():
    # Graph Part
    print("Graph initialization...")
    xdim = xtrim[1] - xtrim[0]
    ydim = ytrim[1] - ytrim[0]
    channel = 3
    num_action = env.action_space.n
    policy_net = NETWORK(ydim=ydim, xdim=xdim, channel=channel,
                        num_action=num_action,
                        learning_rate=learning_rate,
                        batch_size=batch_size)

    target_net = NETWORK(ydim=ydim, xdim=xdim, channel=channel,
                        num_action=num_action,
                        learning_rate=learning_rate,
                        batch_size=batch_size)
    policy_net.to(DEVICE)
    target_net.to(DEVICE)

    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    # Memory
    memory = utils.ReplayMemory(10000)

    # ETCs
    steps_done = 0
    episode_durations = []

    policy_net.float()
    target_net.float()

    print("Training Start.....")
    for episode in range(num_episodes):
        REWARD = 0
        previous_screenshot = utils.dimension_manipulation(env.reset()[xtrim[0]:xtrim[1], ytrim[0]:ytrim[1]])
        current_screenshot = previous_screenshot
        state = torch.from_numpy(current_screenshot - previous_screenshot).float().to(DEVICE)
        for t in count():
            #env.render()
            action = utils.select_action(state, steps_done, policy_net)
            observation, reward, done, _ = env.step(action.item())
            previous_screenshot = current_screenshot
            current_screenshot = utils.dimension_manipulation(observation[xtrim[0]:xtrim[1], ytrim[0]:ytrim[1]])

            if not done:
                next_status = torch.from_numpy(current_screenshot - previous_screenshot).float().to(DEVICE)
                REWARD += reward
            else:
                next_status = None
            if True:
                memory.push(state,
                            action,
                            next_status,
                            torch.tensor(float(t+1)).to(DEVICE)[None])
            state = next_status
            utils.optimize_model(policy_net, target_net, memory, batch_size)

            if done:
                utils.optimize_model(policy_net, target_net, memory, batch_size)
                episode_durations.append(t + 1)
                utils.plot_durations(episode_durations)
                if REWARD != 0:
                    print("\n########  Episode " + str(episode))
                    print("Duration : " + str(t + 1))
                    print("REWARD : " + str(REWARD))
                    print("loss : " + str(policy_net.loss.item()))
                break
        if episode % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())
Example #12
        if not done:
            next_state = current_screen - last_screen
        else:
            next_state = None

        # Store the transition in memory
        memory.push(state, action, next_state, reward)

        # Move to the next state
        state = next_state

        # Perform one step of the optimization (on the target network)
        optimize_model()
        if done:
            episode_durations.append(t + 1)
            plot_durations(episode_durations)
            break
    # Update the target network, copying all weights and biases in DQN
    if i_episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())

print('Complete')
env.render()
env.close()
plt.ioff()
plt.show()

######################################################################
# Here is the diagram that illustrates the overall resulting data flow.
#
# .. figure:: /_static/img/reinforcement_learning_diagram.jpg
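
The zero-argument optimize_model() used in Example #12 performs one minibatch update of the policy network against the target network. A condensed sketch of that update is shown below, assuming the ReplayMemory/Transition layout sketched after Example #8 and module-level policy_net, target_net, optimizer, memory, BATCH_SIZE and GAMMA (all assumptions about the surrounding script).

import torch
import torch.nn.functional as F


def optimize_model():
    """Sketch of one DQN update step in the style of the PyTorch tutorial (assumed helper)."""
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    batch = Transition(*zip(*transitions))

    # Terminal transitions have next_state set to None; mask them out of the bootstrap term.
    non_final_mask = torch.tensor([s is not None for s in batch.next_state], dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])

    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Q(s, a) for the actions that were actually taken.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # max_a' Q_target(s', a'), zero for terminal next states.
    next_state_values = torch.zeros(BATCH_SIZE)
    with torch.no_grad():
        next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0]
    expected_values = reward_batch + GAMMA * next_state_values

    loss = F.smooth_l1_loss(state_action_values, expected_values.unsqueeze(1))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()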
Example #13
    training_info["value net loss"].append(value_net_mse)
    if (i_epoch + 1) % num_avg_epoch:
        training_info["past %d epochs mean reward" %  (num_avg_epoch)] = \
            (sum(training_info["epoch mean rewards"][-num_avg_epoch:]) / num_avg_epoch) \
                if len(training_info["epoch mean rewards"]) >= num_avg_epoch else 0

    # Print stats
    print("\n\n=============  Epoch: %d  =============" % (i_epoch + 1))
    print("epoch mean durations: %f" % (epoch_durations[-1]))
    print("epoch mean rewards: %f" % (epoch_rewards[-1]))
    print("Max reward achieved: %f" % training_info["max reward achieved"])
    print("value net loss: %f" % value_net_mse)

    # Plot stats
    if plot:
        plot_durations(training_info["epoch mean rewards"],
                       training_info["value net loss"])

    # Update counter
    i_epoch += 1

    # Every save_ckpt_interval epochs, save a checkpoint according to the current i_epoch.
    if i_epoch % save_ckpt_interval == 0:
        save_checkpoint(ckpt_dir,
                        policy_net,
                        value_net,
                        policynet_optimizer,
                        valuenet_optimizer,
                        i_epoch,
                        policy_lr=policy_lr,
                        valuenet_lr=valuenet_lr,
                        **training_info)
Example #14
    n_games = 1000
    score = 0

    print("Save is currently !!!!!!!!!!!!!!!!!! ", A.save)

    for i in range(n_games):
        A.env.reset()
        last_screen = A.get_state()
        current_screen = A.get_state()
        state = current_screen - last_screen

        done = False
        score = 0

        if i % 20 == 0 and i > 0:
            plot_durations(scores, 0.001)
            print('----------------- training --------------------')
            print('episode number', i)
            print("Average score ", avg_score[-1])
            print('----------------- training --------------------')

        while not done:
            action = A.choose_action(state)

            _, reward, done, _ = A.env.step(action)

            last_screen = current_screen
            current_screen = A.get_state()

            next_state = current_screen - last_screen
Example #15
from save_and_load import load_checkpoint
from utils import plot_durations
import matplotlib.pyplot as plt
import torch

# IMPORTANT: Set value for i_epoch to indicate which checkpoint you want to use
#   for evaluation.
i_epoch = 650
start_idx = 0
end_idx = i_epoch

input_size = 8
output_size = 4
layer_sizes = [input_size, 128, 128, 128,
               output_size]  # The MLP network architecture

env_name = "LunarLander-v2"
ckpt_dir = "simplePG_Adam_%s_obs_checkpoints/" % (env_name)

# Get device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Read checkpoint
_, _, training_info = \
    load_checkpoint(ckpt_dir, i_epoch, layer_sizes, device=device)

# Plot figure
plot_durations(training_info["epoch mean rewards"], (start_idx, end_idx))