Example #1
	def collect_data(self):
		# Roll out one full episode with the old (behavior) policy and store the transitions.
		roll_out = util.ReplayMemory(int(1e6))

		state = self.env.reset()
		done = False
		while not done:
			state_tensor = torch.Tensor(state).float().unsqueeze(0)
			# sample an action from the categorical distribution given by the old policy
			action_probs = self.old_model(state_tensor)
			probs = Categorical(action_probs)
			action = probs.sample().item()
			next_state, reward, done, info = self.env.step(action)
			if done:
				next_state = None
			roll_out.push(util.Transition(state, action, next_state, reward))
			state = next_state
		return roll_out.memory
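
Every example in this collection relies on a project-local util.ReplayMemory (and, in several places, a util.Transition named tuple) whose definition is not included. As a point of reference only, here is a minimal sketch of the interface that the push(Transition(...)), .memory, and len(...) call sites above and below appear to assume; the sample() helper and the deque-based storage are illustrative assumptions, and Example #3 clearly uses a variant with a different push signature.

import collections
import random

# Assumed transition record; the field names follow the call sites in these examples.
Transition = collections.namedtuple(
    "Transition", ("state", "action", "next_state", "reward"))


class ReplayMemory:
    """Fixed-capacity FIFO buffer of transitions (illustrative sketch, not the real util module)."""

    def __init__(self, capacity):
        self.capacity = int(capacity)
        self.memory = collections.deque(maxlen=self.capacity)

    def push(self, transition):
        self.memory.append(transition)

    def sample(self, batch_size):
        return random.sample(list(self.memory), batch_size)

    def __len__(self):
        return len(self.memory)
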
Example #2
import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
from torch.autograd import Variable
from torch.nn.utils.convert_parameters import vector_to_parameters, parameters_to_vector

import util

CAPACITY = 100000
BATCHSIZE = 128
GAMMA = 0.99
# set device: use the GPU if one is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
random.seed(100)

replay_memory = util.ReplayMemory(CAPACITY)


class DDPG(object):
    def __init__(self,
                 env,
                 actor,
                 critic,
                 target_actor,
                 target_critic,
                 num_episode,
                 replay_memory,
                 gamma,
                 lr=0.001):
        self.env = env
        self.actor = actor
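
The class is cut off here, but the target_actor and target_critic arguments point at the usual DDPG target-network machinery. As a hedged illustration only (not the author's code), the Polyak soft update that such a class typically applies after each learning step looks like the sketch below; the tau coefficient is an assumed hyperparameter.

def soft_update(target_net, source_net, tau=0.005):
    """Blend source parameters into the target network: theta_target <- tau*theta + (1-tau)*theta_target."""
    for target_param, param in zip(target_net.parameters(),
                                   source_net.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)
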
Example #3
import argparse
import os
import random

import numpy as np
import tensorflow as tf  # TensorFlow 1.x graph/session API is used below

import util


def main(cli_args):
    parser = argparse.ArgumentParser(
        description="CSCE 496 HW 3, SeaQuest RL Homework")
    parser.add_argument('--n_step',
                        type=int,
                        default=2,
                        help='N-Step time differences for DQN Update Function')
    parser.add_argument('--lambda',
                        type=float,
                        default=0.5,
                        help="Lambda value for the temporal-difference calculation")
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help="Batch Size")
    parser.add_argument(
        '--model_dir',
        type=str,
        default='./homework_3/',
        help='directory where model graph and weights are saved')
    parser.add_argument('--epoch',
                        type=int,
                        default=500,
                        help="Epoch : number of iterations for the model")
    parser.add_argument('--model',
                        type=int,
                        help=" '1' for basic model, '2' for best model")
    parser.add_argument(
        '--stopCount',
        type=int,
        default=100,
        help="Number of times for dropping accuracy before early stopping")
    args_input = parser.parse_args(cli_args)

    if args_input.model:
        model = args_input.model
    else:
        raise ValueError("Model selection must not be empty")

    if args_input.batch_size:
        batch_size = args_input.batch_size

    if args_input.model_dir:
        model_dir = args_input.model_dir
    else:
        raise ValueError("Provide a valid model data path")

    if args_input.epoch:
        epochs = args_input.epoch
    else:
        raise ValueError("Epoch value cannot be null and has to be an integer")

    #Make the output model directory if it does not exist
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)

    #Placeholder for Tensorflow Variables
    x = tf.placeholder(tf.float32, [None, 84, 84, 4],
                       name='input_placeholder')  #4 frames
    y = tf.placeholder(tf.float32, [None, 18],
                       name='output')  #18 possible outputs

    #Setup
    LEARNING_RATE = 0.0001
    TARGET_UPDATE_STEP_FREQ = 5
    number_of_episodes = epochs
    replay_memory = util.ReplayMemory(1000000)
    #Optimizer
    optimizer = tf.train.AdamOptimizer(LEARNING_RATE)
    #Load "SeaQuest" from atari_wrapper.py
    seaquest_env = util.load_seaquest_env()
    NUM_ACTIONS = seaquest_env.action_space.n  #18 Possible Actions
    OBS_SHAPE = seaquest_env.observation_space.shape  # [height, width, channels] = [84, 84, 4]
    EPS_END = 0.1
    EPS_DECAY = 100000
    step = 0
    grad_norm_clipping = 1.0
    global_step_tensor = util.global_step_tensor('global_step_tensor')
    if model == 1:
        policy_model, policy_output_layer = initiate_policy_model(
            x, NUM_ACTIONS)
        target_model, target_output_layer = initiate_target_model(
            x, NUM_ACTIONS)
        print("Basic Model Initialized")
    elif model == 2:
        policy_model, policy_output_layer = initiate_better_policy_model(
            x, NUM_ACTIONS)
        target_model, target_output_layer = initiate_better_target_model(
            x, NUM_ACTIONS)
        print("Better Model Initialized")
    else:
        raise ValueError("--model must be 1 (basic) or 2 (best)")
    prev_episode_score = -1
    saver = tf.train.Saver()
    argmax_action = tf.argmax(policy_output_layer, axis=1)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for episode in range(number_of_episodes):
            # When an episode is done, Reset
            prev_observation = seaquest_env.reset()
            observation, reward, done, _ = seaquest_env.step(
                random.randrange(NUM_ACTIONS))
            done = False
            episode_score = 0.0
            loss = 0
            while not done:
                #Policy model learns to move in the game
                prep_obs = np.expand_dims(np.array(observation,
                                                   dtype=np.float32),
                                          axis=0)
                curr_action = util.epsilon_greedy_exploration(
                    x, sess, argmax_action, prep_obs, step, NUM_ACTIONS,
                    EPS_END, EPS_DECAY)
                observation, reward, done, _ = seaquest_env.step(curr_action)
                next_obs = np.expand_dims(np.array(observation,
                                                   dtype=np.float32),
                                          axis=0)
                next_action = util.epsilon_greedy_exploration(
                    x, sess, argmax_action, next_obs, step, NUM_ACTIONS,
                    EPS_END, EPS_DECAY)
                following_observation, next_reward, next_done, _ = seaquest_env.step(
                    next_action)
                replay_memory.push(
                    prev_observation, curr_action, observation, reward,
                    next_action, next_reward,
                    following_observation)  # (s, a, s', r, a', r', s'')
                prev_observation = observation
                #Target Model (Critic)
                #pylint: disable=too-many-function-args
                #Check to see if there are enough transitions to form a batch
                if len(replay_memory) > 1000:
                    state_batch, action_batch, reward_batch, next_state_batch, next_action_batch, next_reward_batch, following_state_batch = util.batch_sampling(
                        replay_memory, batch_size)
                    with sess.as_default():
                        gradients, variables, loss = util.dqn_gradient_calculation(
                            action_batch, next_state_batch,
                            following_state_batch, reward_batch,
                            next_reward_batch, x, y, policy_output_layer,
                            target_output_layer, sess, batch_size, optimizer,
                            grad_norm_clipping)
                    if gradients is not None:
                        gradients, _ = tf.clip_by_global_norm(
                            gradients, grad_norm_clipping)
                        print(f"Gradients {gradients}")
                        operation = optimizer.apply_gradients(
                            zip(gradients, variables))
                        train_op = util.training_op(operation, optimizer,
                                                    global_step_tensor)
                        sess.run([train_op], {x: prep_obs})
                episode_score += next_reward
                step += 1
            print(
                f"Episode : {episode} Episode Score : {episode_score} Step: {step} Loss : {loss}"
            )

            #Periodically copy the policy network's weights into the target network
            if episode % TARGET_UPDATE_STEP_FREQ == 0:
                for (target_var,
                     policy_var) in zip(tf.trainable_variables(target_model),
                                        tf.trainable_variables(policy_model)):
                    sess.run(tf.assign(target_var, policy_var))
            if episode_score >= prev_episode_score:
                print("Saving .........")
                saver.save(sess, os.path.join("./homework_3/", "homework_3"))
                prev_episode_score = episode_score
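
The exploration helper util.epsilon_greedy_exploration used above is not shown. A common implementation consistent with its arguments (epsilon annealed from 1.0 toward EPS_END over roughly EPS_DECAY steps, with the greedy action taken via the argmax_action op) is sketched below; EPS_START and the exponential decay curve are assumptions rather than facts about the real util module.

import math
import random


def epsilon_greedy_exploration(x, sess, argmax_action, observation, step,
                               num_actions, eps_end, eps_decay, eps_start=1.0):
    """Return a random action with probability epsilon, otherwise the greedy action (sketch)."""
    epsilon = eps_end + (eps_start - eps_end) * math.exp(-step / eps_decay)
    if random.random() < epsilon:
        return random.randrange(num_actions)
    # Greedy action: evaluate the policy network's argmax over Q-values for this observation.
    return int(sess.run(argmax_action, feed_dict={x: observation})[0])
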
Example #4
    else:
        actor_cur_iter = -1
    if opt.load_disc:
        state_dict, optimizer_dict, disc_cur_iter, buffer = torch.load(
            opt.load_disc)
        disc.load_state_dict(state_dict)
        disc_optimizer.load_state_dict(optimizer_dict)
        print('Loaded disc from', opt.load_disc)
    else:
        disc_cur_iter = -1
        assert opt.replay_size >= opt.batch_size
        if opt.exp_replay_buffer:
            buffer = util.ExponentialReplayMemory(opt.replay_size,
                                                  opt.replay_size_half)
        else:
            buffer = util.ReplayMemory(opt.replay_size)
    if opt.load_critic:
        state_dict, optimizer_dict, critic_cur_iter = torch.load(
            opt.load_critic)
        critic.load_state_dict(state_dict)
        critic_optimizer.load_state_dict(optimizer_dict)
        print('Loaded critic from', opt.load_critic)
    else:
        critic_cur_iter = -1
    start_iter = min(actor_cur_iter, disc_cur_iter, critic_cur_iter) + 1

    solved = 0
    solved_fail = 0
    print('\nReal examples:')
    task.display(task.get_data(opt.batch_size))
    print()
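
For context, the torch.load calls above unpack tuples of (state_dict, optimizer state, last iteration), with the discriminator checkpoint additionally carrying the replay buffer. A matching save step, which this excerpt does not show and is therefore only an assumption about the rest of the script, would look roughly like this:

import torch


def save_checkpoint(path, model, optimizer, cur_iter, buffer=None):
    """Hypothetical helper mirroring the checkpoint format loaded above."""
    payload = (model.state_dict(), optimizer.state_dict(), cur_iter)
    if buffer is not None:
        payload = payload + (buffer,)
    torch.save(payload, path)
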
Example #5
import collections
from copy import deepcopy

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from torch.optim import RMSprop

import util

# DQN, get_screen, epsilon_greedy and train are defined elsewhere in this file.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def main():

    global ACTION_SPACE, epsilon_start, epsilon_end, epsilon_decay, env, screen_width
    global actor, target, optimizer, gamma, loss_fn, frame_num, use_ddqn
    ENV = "CartPole-v0"  # observation shape:  (250, 160, 3)
    env = gym.make(ENV).unwrapped
    ACTION_SPACE = env.action_space.n
    screen_width = 600

    # hyperparameter
    n_episodes = 2000
    buffer_size = 1000000
    epsilon_start = 1
    epsilon_end = 0.01
    epsilon_decay = 1000
    replay_memory = util.ReplayMemory(buffer_size)
    # memory = pickle.load(open("replay_memory_18000.pkl", "rb"))
    # replay_memory.memory = memory
    # print("Current memory length: ", len(replay_memory))
    batch_size = 128
    frame_num = 3
    gamma = 0.99
    lr = 1e-3
    C = 10
    use_ddqn = False

    actor = DQN(ACTION_SPACE, frame_num).to(device)
    target = deepcopy(actor).to(device)
    optimizer = RMSprop(actor.parameters(), lr=lr)
    loss_fn = nn.MSELoss()
    Transition = collections.namedtuple(
        "transition", ("state", "action", "next_state", "reward"))

    # training
    episode_rewards = []
    average_qvalue = []
    for episode in range(n_episodes):
        env.reset()
        last_screen = get_screen()
        current_screen = get_screen()
        state = current_screen - last_screen
        done = False
        reward_sum = 0
        qvalue_sum = torch.zeros(ACTION_SPACE)
        while not done:
            action_value = actor(state)
            action = epsilon_greedy(action_value, episode, env)
            _, reward, done, _ = env.step(action)
            if not done:
                last_screen = get_screen()
                current_screen = get_screen()
                next_state = current_screen - last_screen
            else:
                next_state = None
            replay_memory.push(Transition(state, action, next_state, reward))
            if not done:
                state = next_state
            qvalue = train(replay_memory, batch_size)
            reward_sum += reward
            if qvalue is not None:
                qvalue_sum = qvalue_sum + qvalue

        episode_rewards.append(reward_sum)
        mean_qvalue = qvalue_sum.mean().item()
        average_qvalue.append(mean_qvalue)
        if episode % 100 == 0:
            print("Episode {}, last 100 episode rewards {}, total average rewards {}".format(episode, \
              np.mean(episode_rewards[-100:]), np.mean(episode_rewards)))
            print("Episode {}, last 100 episode qvalue {}, total average rewards {}".format(episode, \
              np.mean(average_qvalue[-100:]), np.mean(average_qvalue)))
        if episode % C == 0:
            target = deepcopy(actor).to(device)

        plt.subplot(121)
        plt.plot(range(len(episode_rewards)), episode_rewards, "b-")
        plt.xlabel("episode")
        plt.ylabel("reward")

        # q value
        plt.subplot(122)
        plt.plot(range(len(average_qvalue)), average_qvalue, "r-")
        plt.xlabel("episode")
        plt.ylabel("qvalue")
        plt.savefig("episode_qvalue_cartpole.png")

    env.close()
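
The train() helper called in this example (and in Example #6) is not included. Assuming it performs a standard DQN update against the globals set up in main (actor, target, optimizer, gamma, loss_fn, use_ddqn) and returns the batch's predicted Q-values for the callers' bookkeeping, a minimal sketch is given below; device handling and edge cases such as an all-terminal batch are omitted for brevity, and the real function may differ.

import torch


def train(replay_memory, batch_size):
    """One (double-)DQN update step; returns detached Q-value predictions (sketch)."""
    if len(replay_memory) < batch_size:
        return None
    transitions = replay_memory.sample(batch_size)
    # Split out terminal transitions, which were stored with next_state=None.
    non_final_mask = torch.tensor(
        [t.next_state is not None for t in transitions], dtype=torch.bool)
    non_final_next = torch.cat(
        [t.next_state for t in transitions if t.next_state is not None])
    states = torch.cat([t.state for t in transitions])
    actions = torch.tensor([t.action for t in transitions]).unsqueeze(1)
    rewards = torch.tensor([float(t.reward) for t in transitions])

    all_q = actor(states)
    q_values = all_q.gather(1, actions).squeeze(1)
    next_q = torch.zeros(batch_size)
    with torch.no_grad():
        if use_ddqn:
            # Double DQN: choose actions with the online net, evaluate them with the target net.
            best_actions = actor(non_final_next).argmax(1, keepdim=True)
            next_q[non_final_mask] = target(non_final_next).gather(
                1, best_actions).squeeze(1)
        else:
            next_q[non_final_mask] = target(non_final_next).max(1)[0]
    loss = loss_fn(q_values, rewards + gamma * next_q)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return all_q.detach()
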
Example #6
# This example shares the module-level imports and device setup shown in Example #5,
# plus `from itertools import count` used by the step loop below.
def main():
    ENV = "Boxing-v0"  # observation shape:  (250, 160, 3)
    env = gym.make(ENV).unwrapped
    global ACTION_SPACE, epsilon_start, epsilon_end, epsilon_decay
    global actor, target, optimizer, gamma, loss_fn, frame_num, use_ddqn
    ACTION_SPACE = env.action_space.n

    # hyperparameter
    n_epochs = 20
    buffer_size = 1000000
    epsilon_start = 1
    epsilon_end = 0.01
    epsilon_decay = 1000000
    replay_memory = util.ReplayMemory(buffer_size)
    # memory = pickle.load(open("replay_memory_18000.pkl", "rb"))
    # replay_memory.memory = memory
    # print("Current memory length: ", len(replay_memory))
    batch_size = 32
    frame_num = 4
    gamma = 0.99
    lr = 2.5e-4
    C = 100
    learning_starts = 10000
    learning_freq = 4
    use_ddqn = False

    actor = DQN(ACTION_SPACE, frame_num).to(device)
    target = deepcopy(actor).to(device)
    optimizer = RMSprop(actor.parameters(), lr=lr)
    loss_fn = nn.MSELoss()
    Transition = collections.namedtuple(
        "transition", ("state", "action", "next_state", "reward"))

    # training
    running_rewards = []
    average_qvalue = []
    episode_rewards = []

    state = env.reset()
    frames = []
    if frame_num > 1:
        for _ in range(frame_num):
            frames.append(util.preprocess(state))
        state = torch.cat(frames, dim=1).float().to(device)
    else:
        state = util.preprocess(state).to(device)

    reward_sum = 0
    done = False
    qvalue_sum = torch.zeros(batch_size, 1)
    num_param_update = 0
    for t in count():
        if t % 2000 == 0:
            # save replay memory
            # pickle.dump(replay_memory.memory, open("replay_memory_{}.pkl".format(t), "wb"))
            print("Finish step {}".format(t))
        epoch = t // learning_starts
        if t > learning_starts:
            action_value = actor(state)
            action = epsilon_greedy(action_value, t - learning_starts, env)
        else:
            action = env.action_space.sample()

        state_buffer = []
        reward_buffer = []
        for _ in range(frame_num):
            next_state, reward, done, _ = env.step(action)
            state_buffer.append(util.preprocess(next_state))
            reward_sum += reward
            reward_buffer.append(util.scale_reward(reward))
            if done:
                break
        next_state = torch.cat(state_buffer, dim=1).float()

        if done:
            next_state = None

        replay_memory.push(
            Transition(state, action, next_state, sum(reward_buffer)))

        if not done:
            state = next_state

        # train
        if t > learning_starts and t % learning_freq == 0 and len(
                replay_memory) > batch_size:
            qvalue = train(replay_memory, batch_size)
            average_qvalue.append((qvalue).mean().item())
            episode_rewards.append(reward_sum)
            if len(episode_rewards) > 0:
                average_episode_reward = np.mean(episode_rewards[-100:])
            print("Epoch {}, Step {}, Average Q value {}, Average episode reward {}".format(epoch, t, \
             average_qvalue[-1], average_episode_reward))
            num_param_update += 1

        # reset game
        if done:
            state = env.reset()
            frames = []
            if frame_num > 1:
                for _ in range(frame_num):
                    frames.append(util.preprocess(state))
                state = torch.cat(frames, dim=1).float().to(device)
            else:
                state = util.preprocess(state).to(device)
            reward_sum = 0

        # periodically sync the target network with the online network
        if num_param_update % C == 0:
            target = deepcopy(actor).to(device)

        if epoch == n_epochs:
            break
        # if t > 1e6:
        # 	break

        # episode rewards
        plt.subplot(121)
        plt.plot(range(len(episode_rewards)), episode_rewards, "b-")
        plt.xlabel("step")
        plt.ylabel("reward")

        # q value
        plt.subplot(122)
        plt.plot(range(len(average_qvalue)), average_qvalue, "r-")
        plt.xlabel("step")
        plt.ylabel("qvalue")
        plt.savefig("episode_qvalue.png")