def collect_data(self):
    # Roll out one full episode under the old (behavior) policy and return
    # the collected transitions.
    roll_out = util.ReplayMemory(int(1e6))  # capacity must be an int, not the float 1e6
    state = self.env.reset()
    done = False
    while not done:
        state_tensor = torch.Tensor(state).float().unsqueeze(0)
        action_probs = self.old_model(state_tensor)
        probs = Categorical(action_probs)
        action = probs.sample().item()
        next_state, reward, done, info = self.env.step(action)
        if done:
            next_state = None
        roll_out.push(util.Transition(state, action, next_state, reward))
        state = next_state
    return roll_out.memory
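# collect_data() above relies on util.ReplayMemory and util.Transition, which
# are not included in this excerpt. Below is a minimal sketch of the interface
# the call sites imply (push() appends, .memory exposes the stored list,
# capacity bounds the size); the real util.py may differ.
import collections
import random

Transition = collections.namedtuple(
    "Transition", ("state", "action", "next_state", "reward"))

class ReplayMemory(object):
    def __init__(self, capacity):
        self.capacity = int(capacity)
        self.memory = []
        self.position = 0

    def push(self, transition):
        # Ring buffer: overwrite the oldest entry once full.
        if len(self.memory) < self.capacity:
            self.memory.append(transition)
        else:
            self.memory[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)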
import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
from torch.autograd import Variable  # deprecated in modern PyTorch; plain tensors suffice
from torch.nn.utils.convert_parameters import (vector_to_parameters,
                                               parameters_to_vector)

import util

CAPACITY = 100000
BATCHSIZE = 128
GAMMA = 0.99

# Set device: CPU or GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

random.seed(100)
replay_memory = util.ReplayMemory(CAPACITY)


class DDPG(object):
    def __init__(self, env, actor, critic, target_actor, target_critic,
                 num_episode, replay_memory, gamma, lr=0.001):
        self.env = env
        self.actor = actor
        # Remaining assignments implied by the constructor signature.
        self.critic = critic
        self.target_actor = target_actor
        self.target_critic = target_critic
        self.num_episode = num_episode
        self.replay_memory = replay_memory
        self.gamma = gamma
        self.lr = lr
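# The DDPG class above keeps target_actor / target_critic copies, but the
# update rule is not shown in this excerpt. A common choice, sketched here as
# an assumption rather than the original author's code, is a soft (Polyak)
# update with a small tau after each learning step.
def soft_update(target_net, source_net, tau=0.005):
    """Blend source parameters into the target: theta' <- tau*theta + (1-tau)*theta'."""
    for target_param, param in zip(target_net.parameters(),
                                   source_net.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)

# Hypothetical usage inside a DDPG update step:
#   soft_update(self.target_actor, self.actor)
#   soft_update(self.target_critic, self.critic)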
import argparse
import os
import random

import numpy as np
import tensorflow as tf

import util

# initiate_policy_model / initiate_target_model and their "better" variants
# are defined elsewhere in this file.


def main(cli_args):
    parser = argparse.ArgumentParser(
        description="CSCE 496 HW 3, SeaQuest RL Homework")
    parser.add_argument('--n_step', type=int, default=2,
                        help='N-step time differences for DQN update function')
    parser.add_argument('--lambda', type=float, default=0.5,
                        help="Value for temporal-difference calculation")
    parser.add_argument('--batch_size', type=int, default=32,
                        help="Batch size")
    parser.add_argument('--model_dir', type=str, default='./homework_3/',
                        help='Directory where model graph and weights are saved')
    parser.add_argument('--epoch', type=int, default=500,
                        help="Epoch: number of iterations for the model")
    parser.add_argument('--model', type=int,
                        help="'1' for basic model, '2' for best model")
    parser.add_argument('--stopCount', type=int, default=100,
                        help="Number of accuracy drops allowed before early stopping")
    args_input = parser.parse_args(cli_args)

    if args_input.model:
        model = args_input.model
    else:
        raise ValueError("Model selection must not be empty")
    if args_input.batch_size:
        batch_size = args_input.batch_size
    if args_input.model_dir:
        model_dir = args_input.model_dir
    else:
        raise ValueError("Provide a valid model data path")
    if args_input.epoch:
        epochs = args_input.epoch
    else:
        raise ValueError("Epoch value cannot be null and has to be an integer")

    # Make the output model directory.
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)

    # Placeholders for TensorFlow variables.
    x = tf.placeholder(tf.float32, [None, 84, 84, 4], name='input_placeholder')  # 4 stacked frames
    y = tf.placeholder(tf.float32, [None, 18], name='output')  # 18 possible actions

    # Setup.
    LEARNING_RATE = 0.0001
    TARGET_UPDATE_STEP_FREQ = 5
    number_of_episodes = epochs
    replay_memory = util.ReplayMemory(1000000)

    # Optimizer.
    optimizer = tf.train.AdamOptimizer(LEARNING_RATE)

    # Load "SeaQuest" from atari_wrapper.py.
    seaquest_env = util.load_seaquest_env()
    NUM_ACTIONS = seaquest_env.action_space.n  # 18 possible actions
    OBS_SHAPE = seaquest_env.observation_space.shape  # [height, width, channels] = [84, 84, 4]
    EPS_END = 0.1
    EPS_DECAY = 100000
    step = 0
    grad_norm_clipping = 1.0
    global_step_tensor = util.global_step_tensor('global_step_tensor')

    if model == 1:
        policy_model, policy_output_layer = initiate_policy_model(x, NUM_ACTIONS)
        target_model, target_output_layer = initiate_target_model(x, NUM_ACTIONS)
        print("Basic Model Initialized")
    elif model == 2:
        policy_model, policy_output_layer = initiate_better_policy_model(x, NUM_ACTIONS)
        target_model, target_output_layer = initiate_better_target_model(x, NUM_ACTIONS)
        print("Better Model Initialized")
    else:
        raise ValueError("Model must be 1 or 2")

    prev_episode_score = -1
    saver = tf.train.Saver()
    argmax_action = tf.argmax(policy_output_layer, axis=1)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for episode in range(number_of_episodes):
            # When an episode is done, reset.
            prev_observation = seaquest_env.reset()
            observation, reward, done, _ = seaquest_env.step(
                random.randrange(NUM_ACTIONS))
            done = False
            episode_score = 0.0
            loss = 0
            while not done:
                # The policy model learns to move in the game.
                prep_obs = np.expand_dims(
                    np.array(observation, dtype=np.float32), axis=0)
                curr_action = util.epsilon_greedy_exploration(
                    x, sess, argmax_action, prep_obs, step, NUM_ACTIONS,
                    EPS_END, EPS_DECAY)
                observation, reward, done, _ = seaquest_env.step(curr_action)
                next_obs = np.expand_dims(
                    np.array(observation, dtype=np.float32), axis=0)
                next_action = util.epsilon_greedy_exploration(
                    x, sess, argmax_action, next_obs, step, NUM_ACTIONS,
                    EPS_END, EPS_DECAY)
                following_observation, next_reward, next_done, _ = seaquest_env.step(
                    next_action)
                replay_memory.push(
                    prev_observation, curr_action, observation, reward,
                    next_action, next_reward, following_observation)  # s, a, r, s', a', r', s''
                prev_observation = observation

                # Target model (critic).
                #pylint: disable=too-many-function-args
                # Check that there are enough transitions to form a batch.
                if len(replay_memory) > 1000:
                    (state_batch, action_batch, reward_batch, next_state_batch,
                     next_action_batch, next_reward_batch,
                     following_state_batch) = util.batch_sampling(
                         replay_memory, batch_size)
                    with sess.as_default():
                        gradients, variables, loss = util.dqn_gradient_calculation(
                            action_batch, next_state_batch, following_state_batch,
                            reward_batch, next_reward_batch, x, y,
                            policy_output_layer, target_output_layer, sess,
                            batch_size, optimizer, grad_norm_clipping)
                        if gradients is not None:
                            # Note: building the clip/apply ops inside the loop
                            # grows the graph every step; ideally construct them once.
                            gradients, _ = tf.clip_by_global_norm(
                                gradients, grad_norm_clipping)
                            print(f"Gradients {gradients}")
                            operation = optimizer.apply_gradients(
                                zip(gradients, variables))
                            train_op = util.training_op(
                                operation, optimizer, global_step_tensor)
                            sess.run([train_op], {x: prep_obs})
                episode_score += next_reward
                step += 1

            print(f"Episode : {episode} Episode Score : {episode_score} "
                  f"Step: {step} Loss : {loss}")

            # Periodically copy policy weights into the target network.
            if episode % TARGET_UPDATE_STEP_FREQ == 0:
                for target_var, policy_var in zip(
                        tf.trainable_variables(target_model),
                        tf.trainable_variables(policy_model)):
                    # tf.assign only builds the op; it must be run to take effect.
                    sess.run(tf.assign(target_var, policy_var))

            # Saving function.
            if episode_score >= prev_episode_score:
                print("Saving .........")
                saver.save(sess, os.path.join("./homework_3/", "homework_3"))
                prev_episode_score = episode_score
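# util.epsilon_greedy_exploration is called above but not defined in this
# excerpt. A sketch of what it plausibly does, matching the call signature:
# anneal epsilon toward eps_end over eps_decay steps, explore with probability
# epsilon, otherwise act greedily via the argmax_action op. The exponential
# decay shape and eps_start default are assumptions.
import math
import random

def epsilon_greedy_exploration(x, sess, argmax_action, obs, step, num_actions,
                               eps_end, eps_decay, eps_start=1.0):
    epsilon = eps_end + (eps_start - eps_end) * math.exp(-step / eps_decay)
    if random.random() < epsilon:
        return random.randrange(num_actions)  # explore
    return sess.run(argmax_action, {x: obs})[0]  # exploit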
# Load checkpoints for the actor, discriminator, and critic if paths are given.
if opt.load_actor:
    # (Reconstructed by symmetry with the disc/critic blocks below.)
    state_dict, optimizer_dict, actor_cur_iter = torch.load(opt.load_actor)
    actor.load_state_dict(state_dict)
    actor_optimizer.load_state_dict(optimizer_dict)
    print('Loaded actor from', opt.load_actor)
else:
    actor_cur_iter = -1

if opt.load_disc:
    # The replay buffer is stored inside the disc checkpoint.
    state_dict, optimizer_dict, disc_cur_iter, buffer = torch.load(opt.load_disc)
    disc.load_state_dict(state_dict)
    disc_optimizer.load_state_dict(optimizer_dict)
    print('Loaded disc from', opt.load_disc)
else:
    disc_cur_iter = -1
    assert opt.replay_size >= opt.batch_size
    if opt.exp_replay_buffer:
        buffer = util.ExponentialReplayMemory(opt.replay_size,
                                              opt.replay_size_half)
    else:
        buffer = util.ReplayMemory(opt.replay_size)

if opt.load_critic:
    state_dict, optimizer_dict, critic_cur_iter = torch.load(opt.load_critic)
    critic.load_state_dict(state_dict)
    critic_optimizer.load_state_dict(optimizer_dict)
    print('Loaded critic from', opt.load_critic)
else:
    critic_cur_iter = -1

# Resume from the earliest iteration common to all loaded components.
start_iter = min(actor_cur_iter, disc_cur_iter, critic_cur_iter) + 1
solved = 0
solved_fail = 0

print('\nReal examples:')
task.display(task.get_data(opt.batch_size))
print()
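# util.ExponentialReplayMemory is not shown. One plausible reading of its
# constructor arguments (opt.replay_size, opt.replay_size_half) is a buffer
# whose sampling is recency-biased, with weights halving every
# replay_size_half entries into the past. This is a guess at the semantics,
# not the original implementation.
import random

class ExponentialReplayMemory(object):
    def __init__(self, capacity, half_life):
        self.capacity = int(capacity)
        self.half_life = float(half_life)
        self.memory = []

    def push(self, item):
        self.memory.append(item)
        if len(self.memory) > self.capacity:
            self.memory.pop(0)  # drop the oldest entry

    def sample(self, batch_size):
        n = len(self.memory)
        # Newest entry gets weight 1; weights halve every half_life steps back.
        weights = [2.0 ** ((i - (n - 1)) / self.half_life) for i in range(n)]
        return random.choices(self.memory, weights=weights, k=batch_size)

    def __len__(self):
        return len(self.memory)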
import collections
from copy import deepcopy

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from torch.optim import RMSprop

import util

# DQN, get_screen, epsilon_greedy, and train are defined elsewhere in this file.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def main():
    global ACTION_SPACE, epsilon_start, epsilon_end, epsilon_decay, env, screen_width
    global actor, target, optimizer, gamma, loss_fn, frame_num, use_ddqn

    ENV = "CartPole-v0"
    env = gym.make(ENV).unwrapped
    ACTION_SPACE = env.action_space.n
    screen_width = 600

    # Hyperparameters
    n_episodes = 2000
    buffer_size = 1000000
    epsilon_start = 1
    epsilon_end = 0.01
    epsilon_decay = 1000
    replay_memory = util.ReplayMemory(buffer_size)
    # memory = pickle.load(open("replay_memory_18000.pkl", "rb"))
    # replay_memory.memory = memory
    # print("Current memory length: ", len(replay_memory))
    batch_size = 128
    frame_num = 3
    gamma = 0.99
    lr = 1e-3
    C = 10  # target-network update period (in episodes)
    use_ddqn = False

    actor = DQN(ACTION_SPACE, frame_num).to(device)
    target = deepcopy(actor).to(device)
    optimizer = RMSprop(actor.parameters(), lr=lr)
    loss_fn = nn.MSELoss()
    Transition = collections.namedtuple(
        "transition", ("state", "action", "next_state", "reward"))

    # Training
    episode_rewards = []
    average_qvalue = []
    for episode in range(n_episodes):
        env.reset()
        last_screen = get_screen()
        current_screen = get_screen()
        state = current_screen - last_screen  # frame difference as the state
        done = False
        reward_sum = 0
        qvalue_sum = torch.zeros(ACTION_SPACE)
        while not done:
            action_value = actor(state)
            action = epsilon_greedy(action_value, episode, env)
            _, reward, done, _ = env.step(action)
            if not done:
                last_screen = get_screen()
                current_screen = get_screen()
                next_state = current_screen - last_screen
            else:
                next_state = None
            replay_memory.push(Transition(state, action, next_state, reward))
            if not done:
                state = next_state
            qvalue = train(replay_memory, batch_size)
            reward_sum += reward
            if qvalue is not None:
                qvalue_sum = qvalue_sum + qvalue
        episode_rewards.append(reward_sum)
        mean_qvalue = qvalue_sum.mean().item()
        average_qvalue.append(mean_qvalue)
        if episode % 100 == 0:
            print("Episode {}, last 100 episode rewards {}, total average rewards {}".format(
                episode, np.mean(episode_rewards[-100:]), np.mean(episode_rewards)))
            print("Episode {}, last 100 episode qvalue {}, total average qvalue {}".format(
                episode, np.mean(average_qvalue[-100:]), np.mean(average_qvalue)))
        if episode % C == 0:
            target = deepcopy(actor).to(device)

    # Episode rewards
    plt.subplot(121)
    plt.plot(range(len(episode_rewards)), episode_rewards, "b-")
    plt.xlabel("episode")
    plt.ylabel("reward")
    # Q value
    plt.subplot(122)
    plt.plot(range(len(average_qvalue)), average_qvalue, "r-")
    plt.xlabel("episode")
    plt.ylabel("qvalue")
    plt.savefig("episode_qvalue_cartpole.png")

    env.close()
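# train() is called above but defined elsewhere in the file. Below is a sketch
# of a standard one-step DQN update consistent with the surrounding code: it
# relies on the module-level actor, target, optimizer, gamma, and loss_fn
# globals that main() declares, and returns per-action mean Q-values so that
# qvalue_sum, shaped (ACTION_SPACE,), can accumulate them. The details are
# assumptions, not the original implementation.
import collections
import torch

Transition = collections.namedtuple(
    "transition", ("state", "action", "next_state", "reward"))

def train(replay_memory, batch_size):
    if len(replay_memory) < batch_size:
        return None
    transitions = replay_memory.sample(batch_size)
    batch = Transition(*zip(*transitions))

    state_batch = torch.cat(batch.state)
    action_batch = torch.tensor(batch.action).unsqueeze(1)
    reward_batch = torch.tensor(batch.reward, dtype=torch.float32)
    non_final_mask = torch.tensor(
        [s is not None for s in batch.next_state], dtype=torch.bool)
    non_final_next = [s for s in batch.next_state if s is not None]

    q_all = actor(state_batch)                          # (batch, ACTION_SPACE)
    q_taken = q_all.gather(1, action_batch).squeeze(1)  # Q(s, a) for the taken actions
    next_q = torch.zeros(batch_size)
    if non_final_next:
        with torch.no_grad():
            next_q[non_final_mask] = target(torch.cat(non_final_next)).max(1)[0]
    td_target = reward_batch + gamma * next_q           # one-step TD target

    loss = loss_fn(q_taken, td_target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return q_all.detach().mean(0)  # per-action mean Q over the batch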
import collections
from copy import deepcopy
from itertools import count

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from torch.optim import RMSprop

import util

# DQN, epsilon_greedy, and train are defined elsewhere in this file.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def main():
    ENV = "Boxing-v0"  # observation shape: (250, 160, 3)
    env = gym.make(ENV).unwrapped
    global ACTION_SPACE, epsilon_start, epsilon_end, epsilon_decay
    global actor, target, optimizer, gamma, loss_fn, frame_num, use_ddqn
    ACTION_SPACE = env.action_space.n

    # Hyperparameters
    n_epochs = 20
    buffer_size = 1000000
    epsilon_start = 1
    epsilon_end = 0.01
    epsilon_decay = 1000000
    replay_memory = util.ReplayMemory(buffer_size)
    # memory = pickle.load(open("replay_memory_18000.pkl", "rb"))
    # replay_memory.memory = memory
    # print("Current memory length: ", len(replay_memory))
    batch_size = 32
    frame_num = 4
    gamma = 0.99
    lr = 2.5e-4
    C = 100  # target-network update period (in parameter updates)
    learning_starts = 10000
    learning_freq = 4
    use_ddqn = False

    actor = DQN(ACTION_SPACE, frame_num).to(device)
    target = deepcopy(actor).to(device)
    optimizer = RMSprop(actor.parameters(), lr=lr)
    loss_fn = nn.MSELoss()
    Transition = collections.namedtuple(
        "transition", ("state", "action", "next_state", "reward"))

    # Training
    running_rewards = []
    average_qvalue = []
    episode_rewards = []
    state = env.reset()
    frames = []
    if frame_num > 1:
        for _ in range(frame_num):
            frames.append(util.preprocess(state))
        state = torch.cat(frames, dim=1).float().to(device)
    else:
        state = util.preprocess(state).to(device)
    reward_sum = 0
    done = False
    qvalue_sum = torch.zeros(batch_size, 1)
    num_param_update = 0

    for t in count():
        if t % 2000 == 0:
            # Save replay memory.
            # pickle.dump(replay_memory.memory, open("replay_memory_{}.pkl".format(t), "wb"))
            print("Finish step {}".format(t))
        epoch = t // learning_starts
        if t > learning_starts:
            action_value = actor(state)
            action = epsilon_greedy(action_value, t - learning_starts, env)
        else:
            action = env.action_space.sample()

        # Repeat the action for frame_num frames and stack the results.
        state_buffer = []
        reward_buffer = []
        for _ in range(frame_num):
            next_state, reward, done, _ = env.step(action)
            state_buffer.append(util.preprocess(next_state))
            reward_sum += reward
            reward_buffer.append(util.scale_reward(reward))
            if done:
                break
        next_state = torch.cat(state_buffer, dim=1).float().to(device)
        if done:
            next_state = None
        replay_memory.push(
            Transition(state, action, next_state, sum(reward_buffer)))
        if not done:
            state = next_state

        # Train
        if t > learning_starts and t % learning_freq == 0 and len(replay_memory) > batch_size:
            qvalue = train(replay_memory, batch_size)
            average_qvalue.append(qvalue.mean().item())
            episode_rewards.append(reward_sum)
            average_episode_reward = np.mean(episode_rewards[-100:])
            print("Epoch {}, Step {}, Average Q value {}, Average episode reward {}".format(
                epoch, t, average_qvalue[-1], average_episode_reward))
            num_param_update += 1

        # Reset the game at the end of an episode.
        if done:
            state = env.reset()
            frames = []
            if frame_num > 1:
                for _ in range(frame_num):
                    frames.append(util.preprocess(state))
                state = torch.cat(frames, dim=1).float().to(device)
            else:
                state = util.preprocess(state).to(device)
            reward_sum = 0

        # Periodically sync the target network with the actor.
        if num_param_update % C == 0:
            target = deepcopy(actor).to(device)

        if epoch == n_epochs:
            break
        # if t > 1e6:
        #     break

    # Episode rewards
    plt.subplot(121)
    plt.plot(range(len(episode_rewards)), episode_rewards, "b-")
    plt.xlabel("step")
    plt.ylabel("reward")
    # Q value
    plt.subplot(122)
    plt.plot(range(len(average_qvalue)), average_qvalue, "r-")
    plt.xlabel("step")
    plt.ylabel("qvalue")
    plt.savefig("episode_qvalue.png")
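# util.preprocess and util.scale_reward are used above but not shown. Below is
# a sketch of plausible implementations: downscale an RGB Atari frame to a
# grayscale 84x84 float tensor shaped (1, 1, 84, 84) so that frame_num frames
# concatenate along dim=1, and clip rewards to [-1, 1] as in the DQN paper.
# Both are assumptions about util.py, not its actual contents.
import cv2
import numpy as np
import torch

def preprocess(frame):
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    small = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)
    tensor = torch.from_numpy(
        np.ascontiguousarray(small, dtype=np.float32) / 255.0)
    return tensor.unsqueeze(0).unsqueeze(0)  # (1, 1, 84, 84)

def scale_reward(reward):
    return float(np.clip(reward, -1.0, 1.0))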