class QLearner(object):

    def __init__(
            self,
            env,
            q_func,
            optimizer_spec,
            session,
            exploration=LinearSchedule(1000000, 0.1),
            stopping_criterion=None,
            replay_buffer_size=1000000,
            batch_size=32,
            gamma=0.99,
            learning_starts=50000,
            learning_freq=4,
            frame_history_len=4,
            target_update_freq=10000,
            grad_norm_clipping=10,
            rew_file=None,
            double_q=True,
            lander=False,
            explore='e-greedy',
            ex2=False,
            min_replay_size=10000,  # not sure
            ex2_len=1000,
            coef=0.01,
            seed=250,
            evaluation=False,
            directory='./models/model1'):
        """Run Deep Q-learning algorithm.

        You can specify your own convnet using q_func.

        All schedules are w.r.t. total number of steps taken in the environment.

        Parameters
        ----------
        env: gym.Env
            gym environment to train on.
        q_func: function
            Model to use for computing the q function. It should accept the
            following named arguments:
                img_in: tf.Tensor
                    tensorflow tensor representing the input image
                num_actions: int
                    number of actions
                scope: str
                    scope in which all the model related variables
                    should be created
                reuse: bool
                    whether previously created variables should be reused.
        optimizer_spec: OptimizerSpec
            Specifying the constructor and kwargs, as well as learning rate
            schedule for the optimizer
        session: tf.Session
            tensorflow session to use.
        exploration: rl_algs.deepq.utils.schedules.Schedule
            schedule for probability of choosing a random action.
        stopping_criterion: (env, t) -> bool
            should return true when it's ok for the RL algorithm to stop.
            takes in env and the number of steps executed so far.
        replay_buffer_size: int
            How many memories to store in the replay buffer.
        batch_size: int
            How many transitions to sample each time experience is replayed.
        gamma: float
            Discount factor
        learning_starts: int
            After how many environment steps to start replaying experiences
        learning_freq: int
            How many steps of environment to take between every experience replay
        frame_history_len: int
            How many past frames to include as input to the model.
        target_update_freq: int
            How many experience replay rounds (not steps!) to perform between
            each update to the target Q network
        grad_norm_clipping: float or None
            If not None gradients' norms are clipped to this value.
        double_q: bool
            If True, then use double Q-learning to compute target values.
            Otherwise, use vanilla DQN.
            https://papers.nips.cc/paper/3964-double-q-learning.pdf
        """
        assert type(env.observation_space) == gym.spaces.Box
        assert type(env.action_space) == gym.spaces.Discrete

        self.target_update_freq = target_update_freq
        self.optimizer_spec = optimizer_spec
        self.batch_size = batch_size
        self.learning_freq = learning_freq
        self.learning_starts = learning_starts
        self.stopping_criterion = stopping_criterion
        self.env = env

        # Double (need to modify)
        graph_1 = tf.Graph()
        graph_2 = tf.Graph()

        # Settings for Atari Ram
        tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                                   intra_op_parallelism_threads=1)
        self.session1 = tf.Session(config=tf_config, graph=graph_1)
        self.session2 = tf.Session(config=tf_config, graph=graph_2)
        # print("AVAILABLE GPUS: ", get_available_gpus())

        self.exploration = exploration
        self.rew_file = str(
            uuid.uuid4()) + '.pkl' if rew_file is None else rew_file
        self.double_q = double_q
        self.explore = explore

        # EX2
        # [1e-3, 1e-4, 1e-5]
        self.coef = coef
        self.first_train = True
        self.first_train_itrs = int(5e3)
        self.train_itrs = int(1e3)
        self.ex2 = ex2
        self.min_replay_size = min_replay_size
        self.ex2_len = ex2_len
        self.count = 0
        self.seed = seed
        self.eval = evaluation

        print('eval?', self.eval)
        print('exploration strategy', explore)
        print('using ex2', ex2)
        print('using coef', coef)

        ###############
        # BUILD MODEL #
        ###############
        if len(self.env.observation_space.shape) == 1:
            # This means we are running on low-dimensional observations (e.g. RAM)
            # It is what I am debugging on!
            input_shape = self.env.observation_space.shape
        else:
            img_h, img_w, img_c = self.env.observation_space.shape
            input_shape = (img_h, img_w, frame_history_len * img_c)
        self.num_actions = self.env.action_space.n

        if self.eval:
            # Model 1
            with graph_1.as_default():
                saver1 = tf.train.import_meta_graph(
                    './models/Jamesbond_soft_q_ex2_e4.meta')
                saver1.restore(self.session1,
                               './models/Jamesbond_soft_q_ex2_e4')
                self.obs_t_ph1 = tf.get_collection('obs_t_ph')[0]
                self.Temp1 = tf.get_collection('Temp')[0]
                self.keep_per1 = tf.get_collection('keep_per')[0]
                self.q_dist1 = tf.get_collection('q_dist')[0]
                self.q_t1 = tf.get_collection('q_t')[0]
                # Ex2
                if self.ex2:
                    self.ex2_in1_1 = tf.get_collection('ex2_in1')[0]
                    self.ex2_in2_1 = tf.get_collection('ex2_in2')[0]
                    self.ex2_dis_output1 = tf.get_collection(
                        'ex2_dis_output')[0]
                    self.ex2_prob1 = tf.get_collection('ex2_prob')[0]

            # Model 2
            with graph_2.as_default():
                saver2 = tf.train.import_meta_graph(
                    './models/Alien_soft_q_ex2_e4.meta')
                saver2.restore(self.session2, './models/Alien_soft_q_ex2_e4')
                self.obs_t_ph2 = tf.get_collection('obs_t_ph')[0]
                self.Temp2 = tf.get_collection('Temp')[0]
                self.keep_per2 = tf.get_collection('keep_per')[0]
                self.q_dist2 = tf.get_collection('q_dist')[0]
                self.q_t2 = tf.get_collection('q_t')[0]
                # Ex2
                if self.ex2:
                    self.ex2_in1_2 = tf.get_collection('ex2_in1')[0]
                    self.ex2_in2_2 = tf.get_collection('ex2_in2')[0]
                    self.ex2_dis_output2 = tf.get_collection(
                        'ex2_dis_output')[0]
                    self.ex2_prob2 = tf.get_collection('ex2_prob')[0]

            self.model_initialized = True
            # print('obs is here', self.obs_t_ph)
            # print(self.Temp)
            # print(self.keep_per)
            # print(self.q_dist)
            print('restored and initialized the model')
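            # Note: the get_collection lookups above assume the checkpoints were
            # written by the training branch below, which exports each tensor
            # with tf.add_to_collection(<name>, <tensor>) before saving.
            # A minimal sketch of that round trip (names illustrative only):
            #
            #   # at save time
            #   tf.add_to_collection('obs_t_ph', obs_t_ph)
            #   saver.save(sess, './models/my_model')
            #   # at load time
            #   saver = tf.train.import_meta_graph('./models/my_model.meta')
            #   saver.restore(sess, './models/my_model')
            #   obs_t_ph = tf.get_collection('obs_t_ph')[0]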
        else:
            # set up placeholders
            # placeholder for current observation (or state)
            self.obs_t_ph = tf.placeholder(tf.float32 if lander else tf.uint8,
                                           [None] + list(input_shape))
            # placeholder for current action
            self.act_t_ph = tf.placeholder(tf.int32, [None])
            # placeholder for current reward
            self.rew_t_ph = tf.placeholder(tf.float32, [None])
            # placeholder for next observation (or state)
            self.obs_tp1_ph = tf.placeholder(
                tf.float32 if lander else tf.uint8, [None] + list(input_shape))
            # placeholder for end of episode mask
            # this value is 1 if the next state corresponds to the end of an episode,
            # in which case there is no Q-value at the next state; at the end of an
            # episode, only the current state reward contributes to the target, not the
            # next state Q-value (i.e. target is just rew_t_ph, not rew_t_ph + gamma * q_tp1)
            self.done_mask_ph = tf.placeholder(tf.float32, [None])

            # casting to float on GPU ensures lower data transfer times.
            # TO-DO: WHY?
            if lander:
                obs_t_float = self.obs_t_ph
                obs_tp1_float = self.obs_tp1_ph
            else:
                obs_t_float = tf.cast(self.obs_t_ph, tf.float32) / 255.0
                obs_tp1_float = tf.cast(self.obs_tp1_ph, tf.float32) / 255.0

            # Here, you should fill in your own code to compute the Bellman error. This requires
            # evaluating the current and next Q-values and constructing the corresponding error.
            # TensorFlow will differentiate this error for you, you just need to pass it to the
            # optimizer. See assignment text for details.
            # Your code should produce one scalar-valued tensor: total_error
            # This will be passed to the optimizer in the provided code below.
            # Your code should also produce two collections of variables:
            #     q_func_vars
            #     target_q_func_vars
            # These should hold all of the variables of the Q-function network and target network,
            # respectively. A convenient way to get these is to make use of TF's "scope" feature.
            # For example, you can create your Q-function network with the scope "q_func" like this:
            #     <something> = q_func(obs_t_float, num_actions, scope="q_func", reuse=False)
            # And then you can obtain the variables like this:
            #     q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func')
            # Older versions of TensorFlow may require using "VARIABLES" instead of "GLOBAL_VARIABLES"
            # Tip: use huber_loss (from dqn_utils) instead of squared error when defining self.total_error
            ######

            # YOUR CODE HERE
            # For bayesian exploration: add a dropout value to the network
            # Get Q-function and target network
            self.keep_per = tf.placeholder(shape=None, dtype=tf.float32)
            if self.explore == 'bayesian':
                print('Bayesian variables defined!')
                dropout = True
            else:
                dropout = False

            # EX2
            if self.ex2:
                print('Use Exemplar Model')
                self.exemplar = Exemplar(input_dim=input_shape[0],
                                         seed=self.seed,
                                         eval=self.eval)

            q_t = q_func(obs_t_float,
                         self.num_actions,
                         scope='q_func',
                         reuse=False,
                         dropout=dropout,
                         keep_prob=self.keep_per)
            q_tp1 = q_func(obs_tp1_float,
                           self.num_actions,
                           scope='target_q_func_vars',
                           reuse=False,
                           dropout=dropout,
                           keep_prob=self.keep_per)

            # For Boltzmann exploration
            if self.explore == 'soft_q':
                print('Boltzmann variables defined!')
                self.Temp = tf.placeholder(shape=None, dtype=tf.float32)
                # print(q_t)
                # value = tf.reduce_mean(q_t, 1)
                # print(value)
                # print(q_t - value)
                # print(self.q_dist)
                # exit()
                # self.q_dist = tf.nn.softmax(q_t/self.Temp)
                #
                # Old version
                # value = tf.log( tf.reduce_sum(tf.exp(q_t),1) )
                # self.q_dist = tf.exp(q_t - value)
                # New version
                self.q_dist = tf.nn.softmax(q_t / self.Temp)

            # Max operation
            self.q_t_action = tf.argmax(q_t, axis=1)
            # value = tf.reduce_mean(q_t)
            # self.q_t_action = tf.nn.softmax(q_t - value)

            # Specify double Q function difference
            if self.double_q:
                print('using double q learning')
                # TO-DO: VERY VERY IMPORTANT TO REUSE VARIABLES
                # TO-DO: DO WE NEED TO SET GRADIENT NOT UPDATE
                q_tp1_target = q_func(obs_tp1_float,
                                      self.num_actions,
                                      scope='q_func',
                                      reuse=True)
                q_tp1_target_action = tf.argmax(q_tp1_target, axis=1)
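                # Double DQN target (van Hasselt et al.): the online network
                # ('q_func', reused above) selects the greedy action at s',
                # while the target network ('target_q_func_vars', q_tp1 below)
                # evaluates it, i.e.
                #   y = r + gamma * (1 - done) * Q_target(s', argmax_a Q_online(s', a))
                # The one-hot / reduce_sum below simply picks that action's
                # value for each row of the batch.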
                q_tp1_max = tf.reduce_sum(
                    q_tp1 * tf.one_hot(indices=q_tp1_target_action,
                                       depth=self.num_actions,
                                       on_value=1.0,
                                       off_value=0.0),
                    axis=1)
            else:
                # Soft maximum
                if self.explore == 'soft_q':
                    print('using soft q learning')
                    # q_tp1_max = tf.log( tf.reduce_sum(tf.exp(q_tp1),1) )
                    q_tp1_max = tf.reduce_logsumexp(q_tp1, 1)
                    # print(q_tp1_max)
                    # exit()
                else:
                    q_tp1_max = tf.reduce_max(q_tp1, 1)

            # Get target value
            q_tp1 = gamma * (1.0 - self.done_mask_ph) * q_tp1_max
            target = self.rew_t_ph + q_tp1

            # Get Q_phi(s_i, a_i)
            # TO-DO: VERY VERY IMPORTANT! use reduce_sum instead of reduce_max,
            # since negative values may exist
            q_t_target = tf.reduce_sum(q_t * tf.one_hot(indices=self.act_t_ph,
                                                        depth=self.num_actions,
                                                        on_value=1.0,
                                                        off_value=0.0),
                                       axis=1)

            # Calculate loss
            self.total_error = target - q_t_target
            self.total_error = tf.reduce_mean(huber_loss(self.total_error))
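            # huber_loss comes from dqn_utils (not shown in this file). It is
            # assumed here to be the standard Huber penalty, roughly:
            #   0.5 * x**2                  if |x| <= delta
            #   delta * (|x| - 0.5 * delta) otherwise
            # which keeps gradients bounded when the Bellman error is large.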
            # Produce collections of variables to update separately
            q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                            scope='q_func')
            target_q_func_vars = tf.get_collection(
                tf.GraphKeys.GLOBAL_VARIABLES, scope='target_q_func_vars')

            if 0:
                print(q_t.get_shape())
                print(q_tp1.get_shape())
                print(self.q_t_action.get_shape())
                print(self.done_mask_ph.get_shape())
                print(q_tp1_max.get_shape())
                print(q_tp1.get_shape())
                print(q_t_target.get_shape())
                print(self.total_error.get_shape())
                exit()
            ######

            # construct optimization op (with gradient clipping)
            self.learning_rate = tf.placeholder(tf.float32, (),
                                                name="learning_rate")
            optimizer = self.optimizer_spec.constructor(
                learning_rate=self.learning_rate, **self.optimizer_spec.kwargs)
            self.train_fn = minimize_and_clip(optimizer,
                                              self.total_error,
                                              var_list=q_func_vars,
                                              clip_val=grad_norm_clipping)

            # update_target_fn will be called periodically to copy Q network to target Q network
            update_target_fn = []
            for var, var_target in zip(
                    sorted(q_func_vars, key=lambda v: v.name),
                    sorted(target_q_func_vars, key=lambda v: v.name)):
                update_target_fn.append(var_target.assign(var))
            self.update_target_fn = tf.group(*update_target_fn)

        # construct the replay buffer
        self.replay_buffer = ReplayBuffer(replay_buffer_size,
                                          frame_history_len,
                                          lander=lander)
        self.replay_buffer_idx = None

        ###############
        # RUN ENV     #
        ###############
        if not self.eval:
            self.model_initialized = False
        self.num_param_updates = 0
        self.mean_episode_reward = -float('nan')
        self.best_mean_episode_reward = -float('inf')
        # last_obs initialized here
        self.last_obs = self.env.reset()
        self.log_every_n_steps = 10000

        self.timesteps = []
        self.mean_episode_rewards = []
        self.best_mean_episode_rewards = []

        self.start_time = None
        self.t = 0

        # EX2
        if not self.eval:
            self.saver = tf.train.Saver()
            tf.add_to_collection('obs_t_ph', self.obs_t_ph)
            tf.add_to_collection('Temp', self.Temp)
            tf.add_to_collection('keep_per', self.keep_per)
            tf.add_to_collection('q_dist', self.q_dist)
            tf.add_to_collection('q_t', q_t)
            if self.ex2:
                in1, in2, dis_output, prob = self.exemplar.model.predict_tensor()
                tf.add_to_collection('ex2_in1', in1)
                tf.add_to_collection('ex2_in2', in2)
                tf.add_to_collection('ex2_dis_output', dis_output)
                tf.add_to_collection('ex2_prob', prob)

        if self.ex2 and not self.eval:
            self.exemplar.model.init_tf_sess(self.session)
            self.model_initialized = True

        """
        # eval
        if self.eval:
            print("Initialize Evaluation Mode")
            self.saver.restore(self.session, "./bstmodel/model.ckpt")
            self.model_initialized = True
            print("Initialized models")
        """

    def stopping_criterion_met(self):
        return self.stopping_criterion is not None and self.stopping_criterion(
            self.env, self.t)

    def step_env(self):
        ### 2. Step the env and store the transition
        # At this point, "self.last_obs" contains the latest observation that was
        # recorded from the simulator. Here, your code needs to store this
        # observation and its outcome (reward, next observation, etc.) into
        # the replay buffer while stepping the simulator forward one step.
        # At the end of this block of code, the simulator should have been
        # advanced one step, and the replay buffer should contain one more
        # transition.
        # Specifically, self.last_obs must point to the new latest observation.
        # Useful functions you'll need to call:
        #     obs, reward, done, info = env.step(action)
        # this steps the environment forward one step
        #     obs = env.reset()
        # this resets the environment if you reached an episode boundary.
        # Don't forget to call env.reset() to get a new observation if done
        # is true!!

        ## what do you mean?? what is context?
        ## i think it is just {s,a,r}
        ## check encode_recent_observation!

        # Note that you cannot use "self.last_obs" directly as input
        # into your network, since it needs to be processed to include context
        # from previous frames. You should check out the replay buffer
        # implementation in dqn_utils.py to see what functionality the replay
        # buffer exposes. The replay buffer has a function called
        # encode_recent_observation that will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        # Don't forget to include epsilon greedy exploration!
        # And remember that the first time you enter this loop, the model
        # may not yet have been initialized (but of course, the first step
        # might as well be random, since you haven't trained your net...)

        # Ex2
        self.count += 1

        # Store observation
        ret = self.replay_buffer.store_frame(self.last_obs)
        self.e_current_idx = ret
        # print(np.shape(self.last_obs))

        # For exploration, the value will gradually decrease
        if self.explore == 'greedy':
            # print("using greedy exploration!")
            if not self.model_initialized:
                action = np.random.randint(0, self.num_actions)
            else:
                recent_obs = self.replay_buffer.encode_recent_observation()
                action = self.session.run(self.q_t_action,
                                          feed_dict={
                                              self.obs_t_ph: [recent_obs],
                                              self.keep_per: 1.0
                                          })
                action = action[0]

        if self.explore == 'e-greedy':
            # print("using e-greedy exploration!")
            # random.random() returns a value in [0, 1)
            if (not self.model_initialized) or (
                    random.random() < self.exploration.value(self.t)):
                action = np.random.randint(0, self.num_actions)
            else:
                # Understanding: the context has at least two frames, to encode velocity info
                # RECENT_OBS: FOR RAM (128,), FOR LANDER (9,), AND FOR ATARI (84,84,4)
                # Action shape (1,)
                # Encode recent observation
                recent_obs = self.replay_buffer.encode_recent_observation()
                # print(np.shape(recent_obs))
                action = self.session.run(self.q_t_action,
                                          feed_dict={
                                              self.obs_t_ph: [recent_obs],
                                              self.keep_per: 1.0
                                          })
                action = action[0]
                # print(np.shape(action))
                # exit()
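        # The branch above is plain epsilon-greedy: with probability
        # self.exploration.value(self.t) (annealed by the LinearSchedule) a
        # uniform random action is taken, otherwise the argmax of the online
        # Q-network. The 'soft_q' branch below instead samples an action from
        # a softmax over two restored Q-networks, which are only loaded in
        # evaluation mode.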
        if self.explore == 'soft_q':
            # print("using boltzmann exploration!")
            if not self.model_initialized:
                action = np.random.randint(0, self.num_actions)
            else:
                recent_obs = self.replay_buffer.encode_recent_observation()
                # print(recent_obs.shape)
                # print(recent_obs)
                # print(self.q_dist)
                # print(self.obs_t_ph)
                # print(self.Temp)
                # print(self.keep_per)
                # exit()
                q_t1 = self.session1.run(self.q_t1,
                                         feed_dict={
                                             self.obs_t_ph1: [recent_obs],
                                             self.Temp1: 1.0,
                                             self.keep_per1: 1.0
                                         })
                q_t2 = self.session2.run(self.q_t2,
                                         feed_dict={
                                             self.obs_t_ph2: [recent_obs],
                                             self.Temp2: 1.0,
                                             self.keep_per2: 1.0
                                         })
                ex1_out, ex_prob1 = self.session1.run(
                    [self.ex2_dis_output1, self.ex2_prob1],
                    feed_dict={
                        self.ex2_in1_1: [recent_obs],
                        self.ex2_in2_1: [recent_obs]
                    })
                ex2_out, ex_prob2 = self.session2.run(
                    [self.ex2_dis_output2, self.ex2_prob2],
                    feed_dict={
                        self.ex2_in1_2: [recent_obs],
                        self.ex2_in2_2: [recent_obs]
                    })
                # print("q_t1 shape", q_t1.shape)
                # print([ex_prob1, ex_prob2])
                prob = np.clip([ex_prob1, ex_prob2], 0, 50)
                alphas = np_softmax(prob)
                # alphas = np.array([0.5, 0.5])
                # print("alphas shape:", alphas.shape)
                # alphas = np.array([1.0, 0.0])
                alphas = alphas[np.newaxis, :]
                # print("alpha:", alphas)
                q_t = np.concatenate((q_t1, q_t2))
                # print("q_t shape:", q_t.shape)
                q_t = np.dot(alphas, q_t)
                # print("q_t final shape", q_t.shape)
                q_dist = np_softmax(q_t[0])
                # print("q_t final shape", q_dist.shape)
                action = np.random.choice(self.num_actions, p=q_dist)

        # if self.eval and (self.replay_buffer.num_in_buffer > self.min_replay_size) and (self.count >= self.ex2_len):
        #     self.count = 0
        #     #paths = self.replay_buffer.get_all_positive(self.ex2_len)
        #     #ex2_out, ex2_pb = self.session.run([self.ex2_dis_output, self.ex2_prob],
        #     #                                   feed_dict={self.ex2_in1: paths,
        #     #                                              self.ex2_in2: paths})
        #     # print("ex2 dis_out", ex2_out)
        #     # print("ex2 pb_out", ex2_pb)
        #     #for _ in range(10):
        #     #    ex2_out, ex2_pb = self.session.run([self.ex2_dis_output, self.ex2_prob],
        #     #                                       feed_dict={self.ex2_in1: np.ones((1,9))/9,
        #     #                                                  self.ex2_in2: np.ones((1,9))/9})
        #     #    print("ex2_pb", ex2_pb)
        #     #exit()
        #     if 0:
        #         print('in', input_q)
        #         print('qt', q_t)
        #         print('qd', q_d)
        #     action = np.random.choice(self.num_actions, p=q_d[0])

        if self.explore == 'bayesian':
            # print("using bayesian exploration!")
            if not self.model_initialized:
                action = np.random.randint(0, self.num_actions)
            else:
                recent_obs = self.replay_buffer.encode_recent_observation()
                keep_per = (1.0 - self.exploration.value(self.t)) + 0.1
                # Deal with the larger-than-1.0 case
                keep_per = 1.0 if keep_per > 1.0 else keep_per
                # print(keep_per)
                action = self.session.run(self.q_t_action,
                                          feed_dict={
                                              self.obs_t_ph: [recent_obs],
                                              self.keep_per: keep_per
                                          })
                action = action[0]
                # print(action)
                # exit()

        # Step one step forward
        # THE INPUT FOR ACTION IS AN INT VALUE
        obs, reward, done, info = self.env.step(action)
        # print(np.shape(obs))
        # exit()

        # Point to the newest observation
        if done:
            obs = self.env.reset()
        self.last_obs = obs

        # Store others
        self.replay_buffer.store_effect(ret, action, reward, done)

        # Update EX2 model and rewards
        if not self.eval:
            if self.ex2 and (self.replay_buffer.num_in_buffer > self.min_replay_size) and (self.count >= self.ex2_len):
                self.count = 0
                # fit ex2 model
                if self.first_train:
                    train_itrs = self.first_train_itrs
                    self.first_train = False
                else:
                    train_itrs = self.train_itrs
                for _ in range(train_itrs):
                    positive = self.replay_buffer.sample_positive(
                        self.ex2_len, 128)
                    negative = self.replay_buffer.sample_negative(
                        self.ex2_len, 128)
                    # positive_np = np.asarray(positive)
                    # print(positive_np.shape)
                    # print(self.replay_buffer.num_in_buffer)
                    # print(positive)
                    # print(len(positive))
                    # exit()
                    self.exemplar.fit(positive, negative)
                # update rewards
                paths = self.replay_buffer.get_all_positive(self.ex2_len)
                bonus_reward = self.exemplar.predict(paths)
                self.replay_buffer.update_reward(self.ex2_len, bonus_reward,
                                                 self.coef)

        if self.eval:
            self.t += 1
        # exit()

        #####
        # YOUR CODE HERE
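    # EX2 bonus flow in step_env (a sketch of what the calls above do, based
    # on how they are used here; the Exemplar class itself lives in another
    # module):
    #   1. every ex2_len steps, fit the exemplar discriminator on recent
    #      ("positive") vs. older ("negative") replay samples,
    #   2. exemplar.predict(paths) scores how novel the recent states are,
    #   3. replay_buffer.update_reward(...) folds that score into the stored
    #      rewards (with self.coef presumably acting as the bonus weight), so
    #      state novelty serves as an exploration bonus.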
    def update_model(self):
        ### 3. Perform experience replay and train the network.
        # Absolutely, this process takes long!
        # note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (self.t > self.learning_starts and
                self.t % self.learning_freq == 0 and
                self.replay_buffer.can_sample(self.batch_size)):

            # Here, you should perform training. Training consists of four steps:
            # 3.a: use the replay buffer to sample a batch of transitions (see the
            # replay buffer code for function definition, each batch that you sample
            # should consist of current observations, current actions, rewards,
            # next observations, and done indicator).
            # batch_size = 32, observation shape = 128
            obs_t_batch, act_batch, rew_batch, obs_tp1_batch, done_mask = \
                self.replay_buffer.sample(self.batch_size)

            # 3.b: initialize the model if it has not been initialized yet; to do
            # that, call
            #    initialize_interdependent_variables(self.session, tf.global_variables(), {
            #        self.obs_t_ph: obs_t_batch,
            #        self.obs_tp1_ph: obs_tp1_batch,
            #    })
            # where obs_t_batch and obs_tp1_batch are the batches of observations at
            # the current and next time step. The boolean variable model_initialized
            # indicates whether or not the model has been initialized.
            # Remember that you have to update the target network too (see 3.d)!
            # TO-DO: is it only initialized once, when we first start?
            if not self.model_initialized:
                print("initializing model")
                if self.ex2:
                    # initialized in the Siamese model
                    print("Ex2 no need to initialize")
                    pass
                else:
                    print("interdependent init")
                    initialize_interdependent_variables(
                        self.session, tf.global_variables(), {
                            self.obs_t_ph: obs_t_batch,
                            self.obs_tp1_ph: obs_tp1_batch,
                        })
                # self.session.run(tf.global_variables_initializer())
                # TO-DO: VERY VERY IMPORTANT!!
                # self.saver = tf.train.Saver()
                print("set model_initialized True")
                self.model_initialized = True

            # 3.c: train the model. To do this, you'll need to use the self.train_fn and
            # self.total_error ops that were created earlier: self.total_error is what you
            # created to compute the total Bellman error in a batch, and self.train_fn
            # will actually perform a gradient step and update the network parameters
            # to reduce total_error. When calling self.session.run on these you'll need to
            # populate the following placeholders:
            # self.obs_t_ph
            # self.act_t_ph
            # self.rew_t_ph
            # self.obs_tp1_ph
            # self.done_mask_ph
            # (this is needed for computing self.total_error)
            # self.learning_rate -- you can get this from self.optimizer_spec.lr_schedule.value(t)
            # (this is needed by the optimizer to choose the learning rate)
            # TO-DO: check written rule okay?
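            # The feed below pairs each placeholder with the sampled batch:
            # obs_t_batch / obs_tp1_batch have shape [batch_size] + input_shape,
            # act_batch / rew_batch / done_mask are length-batch_size vectors,
            # and the learning rate is read off optimizer_spec.lr_schedule at
            # the current step self.t.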
            _, error = self.session.run(
                [self.train_fn, self.total_error],
                feed_dict={
                    self.obs_t_ph: obs_t_batch,
                    self.act_t_ph: act_batch,
                    self.rew_t_ph: rew_batch,
                    self.obs_tp1_ph: obs_tp1_batch,
                    self.done_mask_ph: done_mask,
                    self.learning_rate:
                        self.optimizer_spec.lr_schedule.value(self.t),
                    self.keep_per: 1.0
                })
            # print('error', error)
            # exit()

            # 3.d: periodically update the target network by calling
            # self.session.run(self.update_target_fn)
            # you should update every target_update_freq steps, and you may find the
            # variable self.num_param_updates useful for this (it was initialized to 0)
            #####
            # YOUR CODE HERE
            self.num_param_updates += 1
            if self.num_param_updates % self.target_update_freq == 0:
                print("actually update")
                self.session.run(self.update_target_fn)
                # exit()

        self.t += 1
        # print('self.t', self.t)

    def log_progress(self):
        # print(self.t)
        episode_rewards = get_wrapper_by_name(self.env,
                                              "Monitor").get_episode_rewards()

        if len(episode_rewards) > 0:
            self.mean_episode_reward = np.mean(episode_rewards[-100:])

        if len(episode_rewards) > 50:
            if self.mean_episode_reward > self.best_mean_episode_reward:
                # store the best mean reward
                self.best_mean_episode_reward = self.mean_episode_reward
                # print("init?", self.model_initialized)
                # print("eval?", self.eval)
                if self.model_initialized and not self.eval:
                    # store the best model
                    save_path = self.saver.save(self.session,
                                                "./models/model")
                    print("Model saved in path: %s" % save_path)
        # self.best_mean_episode_reward = max(self.best_mean_episode_reward, self.mean_episode_reward)

        # print("Exemplar test output")
        # for _ in range(10):
        #     self.exemplar.predict(np.ones((1,9))/9)

        if self.t % self.log_every_n_steps == 0 and self.model_initialized:
            print("Timestep %d" % (self.t, ))
            print("mean reward (100 episodes) %f" % self.mean_episode_reward)
            print("best mean reward %f" % self.best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % self.exploration.value(self.t))
            print("learning_rate %f" %
                  self.optimizer_spec.lr_schedule.value(self.t))
            if self.start_time is not None:
                print("running time %f" %
                      ((time.time() - self.start_time) / 60.))
            self.start_time = time.time()
            sys.stdout.flush()

            # Store variables
            self.timesteps.append(self.t)
            self.mean_episode_rewards.append(self.mean_episode_reward)
            self.best_mean_episode_rewards.append(
                self.best_mean_episode_reward)

            # TO-DO: it is a bit odd that we dump the file every time and reopen
            # it as new each time; with fewer steps we could store only once at the end.
            with open(self.rew_file, 'wb') as f:
                store_result = {
                    'timestep': np.array(self.timesteps),
                    'reward': np.array(episode_rewards),
                    'mean_reward': np.array(self.mean_episode_rewards),
                    'best_reward': np.array(self.best_mean_episode_rewards)
                }
                pickle.dump(store_result, f, pickle.HIGHEST_PROTOCOL)
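# A minimal sketch of how a driver script might use this class. The names
# below (make_env, atari_model, make_optimizer_spec) are hypothetical and not
# defined in this file; the actual run scripts live elsewhere in the repo.
#
#   env = make_env('PongNoFrameskip-v4', seed=0)
#   alg = QLearner(env=env,
#                  q_func=atari_model,
#                  optimizer_spec=make_optimizer_spec(),
#                  session=tf.Session(),
#                  exploration=LinearSchedule(1000000, 0.1),
#                  explore='e-greedy')
#   while not alg.stopping_criterion_met():
#       alg.step_env()
#       alg.update_model()
#       alg.log_progress()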